# Assignment 2: Using ElasticSearch to build a search engine

## Preprocessing

This step involves loading of data of the three languages, converting them into JSON files and splitting them in order to curl them to Kibana. This step also involves loading of the $lang_cosidf.txt files and doing appropriate loading into the ranking functions in the next section.

In [1]:
import pathlib
import itertools
import json
import numpy as np
import pandas as pd

from tqdm import tqdm

from cs589.assignment1.utils.common import load_text_file

base_path = pathlib.Path("cs589/assignment1/dataset/")
tqdm.pandas()

In [2]:
def load_qid_dataframe(lang="java"):
    qid_dataframe = pd.read_csv(base_path / pathlib.Path(f"{lang}/{lang}_cosidf.txt"), 
                                sep="\t", 
                                usecols=["qid1", "qid2", "label"],
                                dtype={"qid1": str, "qid2": str, "label": int})
    return qid_dataframe


def load_corpus(lang="java", verbose=False):
    lines = load_text_file(base_path / pathlib.Path(f"{lang}/{lang}_qid2all.txt"))

    record_list = list()
    for line in tqdm(lines, disable=not verbose):
        record_list.append(
            {name: text.strip(string.whitespace) for name, text in zip(["qid", "title", "question", "answer"], line.split("\t"))}
        )
            
    corpus_dataframe = pd.DataFrame(record_list)

    return corpus_dataframe

In [3]:
df_java = load_corpus(lang="java", verbose=True)
df_javascript = load_corpus(lang="javascript", verbose=True)
df_python = load_corpus(lang="python", verbose=True)

100%|██████████████████████████████████████████████████████████████████████| 700552/700552 [00:02<00:00, 248611.70it/s]
100%|████████████████████████████████████████████████████████████████████| 1319382/1319382 [00:04<00:00, 276141.35it/s]
100%|██████████████████████████████████████████████████████████████████████| 485827/485827 [00:01<00:00, 303883.93it/s]


In [4]:
json_string1 = '{{"index": {{"_id": "{index}"}}}}'
json_string2 = '{{"title": "{title}", "body": "{body}", "answer": "{answer}"}}'

In [5]:
def create_json(dataframe, lang="java"):
    jsons = []
    json_string1 = '{{"index": {{"_id": "{index}"}}}}'
    json_string2 = '{{"title": "{title}", "body": "{body}", "answer": "{answer}"}}'

    for row in range(len(dataframe)):
        data = [dataframe.iloc[row, x] for x in range(len(dataframe.columns))]
        jsons.append(json_string1.format(index=data[0]))
        jsons.append(json_string2.format(title=data[1],body=data[2], answer=data[3]))

    with open("json_split/" + lang + ".json", "w") as fp:
        for i in range(len(jsons)):
            x = json.dumps(jsons[i])
            y = json.loads(x)
            fp.write("%s\n" % y)
            
    fp.close()

In [6]:
create_json(df_java, lang="java")
create_json(df_javascript, lang="javascript")
create_json(df_python, lang="python")

In [7]:
# Reference
# https://stackoverflow.com/questions/16289859/splitting-large-text-file-into-smaller-text-files-by-line-numbers-using-python

def split_json(lang="java"):
    my_file = "json_split/" + lang + ".json"
    sorting = True
    hold_lines = []
    with open(my_file,'r') as text_file:
        for row in text_file:
            hold_lines.append(row)
    outer_count = 1
    line_count = 0
    while sorting:
        count = 0
        increment = (outer_count-1) * 175000
        left = len(hold_lines) - increment
        file_name = lang + "_" + str(outer_count * 10) + ".json"
        hold_new_lines = []
        if left < 175000:
            while count < left:
                hold_new_lines.append(hold_lines[line_count])
                count += 1
                line_count += 1
            sorting = False
        else:
            while count < 175000:
                hold_new_lines.append(hold_lines[line_count])
                count += 1
                line_count += 1
        outer_count += 1
        with open("json_split/" + lang + "_split/" + file_name,'w') as next_file:
            for row in hold_new_lines:
                next_file.write(row)

In [9]:
split_json(lang="java")
split_json(lang="javascript")
split_json(lang="python")

In [None]:
'''
Directory for json_split

/json_split
    -- /java_split
        -- java_10.json
        -- java_20.json
        -- java_30.json
        -- java_40.json
        -- java_50.json
        -- java_60.json
        -- java_70.json
        -- java_80.json
        -- java_90.json
    -- /javascript_split
        -- javascript_10.json
        -- javascript_20.json
        -- javascript_30.json
        -- javascript_40.json
        -- javascript_50.json
        -- javascript_60.json
        -- javascript_70.json
        -- javascript_80.json
        -- javascript_90.json
        -- javascript_100.json
        -- javascript_110.json
        -- javascript_120.json
        -- javascript_130.json
        -- javascript_140.json
        -- javascript_150.json
        -- javascript_160.json
    -- /python_split
        -- python_10.json
        -- python_20.json
        -- python_30.json
        -- python_40.json
        -- python_50.json
        -- python_60.json
    -- java.json
    -- javascript.json
    -- python.json
'''

In [10]:
java_qids = load_qid_dataframe(lang="java")
javascript_qids = load_qid_dataframe(lang="javascript")
python_qids = load_qid_dataframe(lang="python")

In [11]:
java_qids.head()

Unnamed: 0,qid1,qid2,label
0,4252472,15194804,0
1,4252472,18264178,0
2,4252472,16225177,1
3,4252472,16445238,0
4,4252472,17233226,0


In [12]:
javascript_qids.head()

Unnamed: 0,qid1,qid2,label
0,25698387,34987072,0
1,25698387,21909391,1
2,25698387,40832198,0
3,25698387,34439666,0
4,25698387,27844227,0


In [13]:
python_qids.head()

Unnamed: 0,qid1,qid2,label
0,37098725,36808565,1
1,37098725,30049387,0
2,37098725,25520945,0
3,37098725,36821176,0
4,37098725,22730935,0


In [14]:
# Algorithm 2
# Generate queries for each query

def generate_ratings_from_qids(dataframe, index_name="java_lm", folder="java_lm"):
    a = dataframe["qid1"].to_numpy()
    b = dataframe[["qid2", "label"]].to_numpy()
    for i in range(0, len(a), 30):
        ratings = []
        for j in range(30):
            ratings.append({"_index":index_name,"_id":str(b[j][0]),"rating":int(b[j][1])})
        # change folder name to the respective index name
        with open("qids/" + folder + "/" + str(a[i]) + ".json", 'w') as fp:
            fp.write("[\n")
            for i in range(len(ratings)):
                x = json.dumps(ratings[i])
                #y = json.loads(x)
                fp.write("%s,\n" % x)
            fp.write("%s\n" % json.dumps(ratings[-1]))
            fp.write("]\n")
        fp.close()

In [15]:
# java
generate_ratings_from_qids(dataframe=java_qids, index_name="java_bm25", folder="java_bm25")
generate_ratings_from_qids(dataframe=java_qids, index_name="java_tfidf", folder="java_tfidf")
generate_ratings_from_qids(dataframe=java_qids, index_name="java_lm", folder="java_lm")

# javascript
generate_ratings_from_qids(dataframe=javascript_qids, index_name="javascript_bm25", folder="javascript_bm25")
generate_ratings_from_qids(dataframe=javascript_qids, index_name="javascript_tfidf", folder="javascript_tfidf")
generate_ratings_from_qids(dataframe=javascript_qids, index_name="javascript_lm", folder="javascript_lm")

# python
generate_ratings_from_qids(dataframe=python_qids, index_name="python_bm25", folder="python_bm25")
generate_ratings_from_qids(dataframe=python_qids, index_name="python_tfidf", folder="python_tfidf")
generate_ratings_from_qids(dataframe=python_qids, index_name="python_lm", folder="python_lm")

In [None]:
'''
Directory for qids
The qids have been taken appropriately according to the three datasets: java, javascript, and python

/qids
    -- /java_bm25
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /java_lm
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /java_tfidf
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /javascript_bm25
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /javascript_lm
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /javascript_tfidf
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /python_bm25
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /python_lm
        -- All .json files with filename <qid1>.json having 30 entries each
    -- /python_tfidf
        -- All .json files with filename <qid1>.json having 30 entries each
'''

In [86]:
def ranking(qid1, qid1_title, ratings):
    _search = {
        "requests": [
            {
            "id": str(qid1),
            "request": {
                "query": {
                    "bool": {
                        "must_not": {
                            "match": {
                                "_id": qid1
                            }
                        },
                        "should": [
                            {
                                "match": {
                                    "title": {
                                        "query": qid1_title,
                                        "boost": 3.0,
                                        "analyzer": "my_analyzer"
                                    }
                                }
                            },
                            {
                                "match": {
                                    "body": {
                                        "query": qid1_title,
                                        "boost": 0.5,
                                        "analyzer": "my_analyzer"
                                    }
                                }
                            },
                            {
                                "match": {
                                    "answer": {
                                        "query": qid1_title,
                                        "boost": 0.5,
                                        "analyzer": "my_analyzer"
                                    }
                                }
                            }
                        ]
                    }
                }
            },
            "ratings": ratings
            }
        ],
        "metric": {
            "dcg": {
                "k": 10,
                "normalize": True
            }
        }
    }
    return _search

In [87]:
# Algorithm 1
from elasticsearch import Elasticsearch
es = Elasticsearch(hosts="http://localhost:9200")

In [115]:
def get_ndcg(dataframe, input_index="java_lm"):
    ndcg_list = []
    # loop through each qid1
    for i in range(0, len(dataframe["qid1"]), 30):
        qid1_title = es.get(index=input_index, id=dataframe["qid1"][i])['_source']['title']
        
        # load ratings from the json file
        f = open("qids/" + input_index + "/" + str(dataframe["qid1"][i]) + ".json")
        data = json.load(f)
        
        # assign ratings to data
        #ratings = data
        
        _search = ranking(dataframe["qid1"][i], qid1_title, ratings=data)
        
        result = es.rank_eval(index=input_index, body=_search)
        
        ndcg = result['metric_score']
        ndcg_list.append(ndcg)
        
    return ndcg_list

In [116]:
java_bm25_ndcg = get_ndcg(java_qids, input_index="java_bm25")

  result = es.rank_eval(index=input_index, body=_search)


BadRequestError: BadRequestError(400, 'x_content_parse_exception', 'Failed to build [request] after last required field arrived')

Everything has been completed as per the pdf provided except the report for getting the ndcg scores of each index. The ranking function has the same format as provided and the files have been correctly made with the right format according to the examples that were provided with the pdf. 

I posted this error on Discord and there were no responses. I even looked up to the solutions on the internet but there was no luck. I believe that rectifying this error would lead to the retrieval of the NDCG scores.

In [None]:
'''
Command to for creating index
Created 9 different indicies:


java_bm25
java_tfidf
java_lm

javascript_bm25
javascript_tfidf
javascript_lm

python_bm25
python_tfidf
python_lm

#########################################################
PUT /python_tfidf
{
  "settings": {
    "similarity": {
      "tfidf": {
        "type": "scripted",
        "script": {
          "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
        }
      }
    },
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "porter_stem"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "tfidf"
      },
      "body": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "tfidf"
      },
      "answer": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "tfidf"
      }
    }
  }
}
#########################################################
PUT /javascript_lm
{
  "settings": {
    "similarity": {
      "LM": {
        "type": "LMDirichlet",
        "mu": 2000
      }
    },
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "porter_stem"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "LM"
      },
      "body": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "LM"
      },
      "answer": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "LM"
      }
    }
  }
}
#########################################################
PUT /javascript_bm25
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "porter_stem"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "BM25"
      },
      "body": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "BM25"
      },
      "answer": {
        "type": "text",
        "analyzer": "my_analyzer",
        "similarity": "BM25"
      }
    }
  }
}
'''