# SI650 HW2
### kaggle username: dershanchen (Andy Chen)

In [None]:
import os
import json
import numpy as np
import pandas as pd

## Loading Datasets

In [None]:
# Install the Kaggle command-line client; the download and submit cells below call it via `!kaggle`.
!pip install kaggle

In [None]:
# Write the Kaggle API token to ~/.kaggle/kaggle.json, where the kaggle CLI looks for it.
# Credentials are read from KAGGLE_USERNAME / KAGGLE_KEY when those environment variables
# are set, so the real key never has to be pasted into (and saved with) the notebook.
token = {
    "username": os.environ.get("KAGGLE_USERNAME", "dershanchen"),
    "key": os.environ.get("KAGGLE_KEY", "YOUR_USER_KEY"),
}

# Resolve the config directory from the current user's home instead of hard-coding
# /home/andy, and create it if it is missing so the cell also works on a fresh machine.
kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

# Write straight to the final location: no stray copy of the token is left in the
# working directory.
kaggle_config = os.path.join(kaggle_dir, "kaggle.json")
with open(kaggle_config, 'w') as file:
    json.dump(token, file)

# Owner-only read/write, so the token is not readable by other users.
os.chmod(kaggle_config, 0o600)

In [None]:
# Download the data archive of the first competition (si-650eecs-549-ranker) into the working directory.
!kaggle competitions download -c si-650eecs-549-ranker

In [None]:
# Download the data archive of the gaming competition into the working directory.
!kaggle competitions download -c si-650eecs-549-rankergaming

In [None]:
# Download the data archive of the android competition into the working directory.
!kaggle competitions download -c si-650eecs-549-rankerandroid

In [None]:
!unzip si-650eecs-549-ranker.zip
!unzip si-650eecs-549-rankergaming.zip
!unzip si-650eecs-549-rankerandroid.zip

In [None]:
# Load the document table (*_d) and the query table (*_q) of each collection.
# The files are read in the same order as before: documents first, then queries.
trec_covid_d, trec_covid_q = pd.read_csv("documents.csv"), pd.read_csv("query.csv")
gaming_d, gaming_q = pd.read_csv("documents_gaming.csv"), pd.read_csv("query_gaming.csv")
android_d, android_q = pd.read_csv("documents_android.csv"), pd.read_csv("query_android.csv")

## Document Indexing

In [None]:
# https://github.com/castorini/pyserini/blob/master/docs/installation.md
# !conda create --yes -n si650
# !conda activate si650
# !conda install -c conda-forge openjdk=11
# !pip install pyserini
# !pip install transformers==4.6.0
# !pip install onnxruntime
# !conda install -c conda-forge pyjnius

In [None]:
# Give the identifier column the name 'id' in all three document tables; the JSON
# export further down selects the columns ['id', 'contents'].
id_column = {'DocumentId': 'id'}
for docs in (trec_covid_d, gaming_d, android_d):
    docs.rename(columns=id_column, inplace=True)

In [None]:
# add additional weight to title
trec_covid_d['contents'] = trec_covid_d['Title'] + ' ' + trec_covid_d['Title'] + ' ' + trec_covid_d['Document Description']
gaming_d['contents'] = gaming_d['Document Title'] + ' ' + gaming_d['Document Title'] + ' ' + gaming_d['Document Description']
android_d['contents'] = android_d['Document Title'] + ' ' + android_d['Document Title'] + ' ' + android_d['Document Description']

In [None]:
# !mkdir trec_covid_json
# !mkdir gaming_json
# !mkdir android_json
# !mkdir indexes

In [None]:
def _write_json_collection(docs, out_path):
    '''
    Export the 'id' and 'contents' columns of `docs` as a JSON Lines file.

    Each document is written as one compact JSON object on its own line. The
    parent directory of `out_path` is created when it does not exist yet, so the
    export does not depend on a manual `mkdir` having been run beforehand.

    Args:
        docs: DataFrame with at least the columns 'id' and 'contents'.
        out_path: path of the file to write.

    Returns:
        (records_json, records): the JSON string produced by pandas and the list
        of per-document dicts parsed from it.
    '''
    records_json = docs[['id', 'contents']].to_json(orient='records')
    records = json.loads(records_json)

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, 'w') as f:
        for record in records:
            # No indent: one object per line keeps the file valid JSON Lines.
            f.write(json.dumps(record))
            f.write('\n')

    return records_json, records


trec_covid_json, trec_covid_parsed = _write_json_collection(trec_covid_d, 'trec_covid_json/trec_covid.json')
gaming_json, gaming_parsed = _write_json_collection(gaming_d, 'gaming_json/gaming.json')
android_json, android_parsed = _write_json_collection(android_d, 'android_json/android.json')

In [None]:
# Build the index for the TREC-COVID collection: reads the JSON files in trec_covid_json/
# and writes the index to indexes/trec_covid.
# NOTE(review): -storeDocvectors is presumably what makes get_document_vector() and
# get_term_positions() (used by score() below) available on this index; confirm against the Pyserini docs.
!python3 -m pyserini.index -collection JsonCollection \
                         -generator DefaultLuceneDocumentGenerator \
                         -threads 1 \
                         -input trec_covid_json \
                         -index indexes/trec_covid \
                         -storeDocvectors

In [None]:
# Build the index for the gaming collection: reads the JSON files in gaming_json/
# and writes the index to indexes/gaming (same options as the other two collections).
!python3 -m pyserini.index -collection JsonCollection \
                         -generator DefaultLuceneDocumentGenerator \
                         -threads 1 \
                         -input gaming_json \
                         -index indexes/gaming \
                         -storeDocvectors

In [None]:
# Build the index for the android collection: reads the JSON files in android_json/
# and writes the index to indexes/android (same options as the other two collections).
!python3 -m pyserini.index -collection JsonCollection \
                         -generator DefaultLuceneDocumentGenerator \
                         -threads 1 \
                         -input android_json \
                         -index indexes/android \
                         -storeDocvectors

## Testing Custom Ranker

In [None]:
from pyserini.index import IndexReader
# Open the android index (written to indexes/android by the indexing cell above)
# so that score() can be tried out interactively.
index_reader = IndexReader("indexes/android")

I wanted to put three aspects into consideration in the custom ranker:
1. term frequency, query term frequency, inverse document frequency as defined in other rankers
2. term importance in collection (collection frequency)
3. term position in query and document

I decided to split the weighting measures into two main components; let's call them individual importance and collection importance. Individual importance includes tf, qtf, term position in the query, and term position in the document. These metrics target an individual query or document. Collection importance, on the other hand, includes idf and cf. These two components are then interpolated into a single model using a lambda parameter.

Additionally, I added smoothing throughout the function, but it definitely still needs a lot of tuning. The term position metrics could also be explored further. Currently, I give more importance to terms at the end of queries (queries usually start with interrogative words, which aren't that useful) and to terms at the beginning of documents (I concatenated the titles in front of the descriptions during indexing, so I assumed that titles carry more importance). That being said, by using interpolation and taking more features into consideration, I believe that this ranker could be more balanced than the other two ranking functions.

In [None]:
def score(index_reader, query, doc_id, lmd=0.6, sm=0.2):  # lmd: lambda, sm: smoothing
    '''
    Scores the relevance of the document for the provided query using a
    custom ranking method. Query is a tokenized list of query terms and doc_id
    is a numeric identifier of which document in the index should be scored
    for this query.

    Every term that occurs in both the query and the document contributes

        lmd * log(tf) / (qtf * trp_q * trp_d + sm) + (1 - lmd) * (ci * idf + sm)

    with
        tf, qtf : frequency of the term in the document / in the query
        trp_q   : index of the term's last occurrence in the query / query length
        trp_d   : log(log(doc_length + 1) / log(mean term position + 1))
        ci      : (cf / total_terms) * (n_docs / df)
        idf     : log(n_docs / df)

    Args:
        index_reader: object exposing stats(), get_document_vector(),
            get_term_positions() and get_term_counts() (a Pyserini IndexReader).
        query: list of query terms, in the same analyzed form as the index terms.
        doc_id: identifier of the document to score.
        lmd: interpolation weight of the individual component (1 - lmd goes to
            the collection component).
        sm: additive smoothing constant.

    Returns:
        The summed score over all matching terms; 0 when the query is empty, the
        document has no stored vector, or no term matches.
    '''
    doc_vector = index_reader.get_document_vector(doc_id)
    if not doc_vector or not query:
        # No stored vector (unknown or empty document) or empty query: nothing can match.
        return 0

    # Collection statistics do not depend on the term, so fetch them once.
    stats = index_reader.stats()
    n_terms = stats["total_terms"]
    n_docs = stats["documents"]
    doc_positions = index_reader.get_term_positions(doc_id)

    doc_length = sum(doc_vector.values())
    query_length = len(query)

    # One pass over the query: last position and frequency of every query term.
    query_position = {}
    query_tf = {}
    for position, query_term in enumerate(query):
        query_position[query_term] = position
        query_tf[query_term] = query_tf.get(query_term, 0) + 1

    rank_score = 0
    for term in set(query).intersection(doc_vector.keys()):
        # The term is a key of the document vector, i.e. it is already analyzed;
        # analyzer=None keeps Pyserini from analyzing (stemming) it a second time.
        df, cf = index_reader.get_term_counts(term, analyzer=None)
        if not df:
            # Without a document frequency, n_docs / df is undefined; skip the term.
            continue

        qtf = query_tf[term]
        tf = doc_vector[term]

        # individual importance
        if tf > 1:
            # term relative position
            trp_q = query_position[term] / query_length  # grows towards the end of the query
            trp_d = np.log(np.log(doc_length + 1) / np.log(np.mean(doc_positions[term]) + 1))  # grows towards the start of the document
            # NOTE(review): trp_q and trp_d sit in the denominator, so larger values
            # lower the contribution -- confirm this matches the intended weighting.
            individual = np.log(tf) / (qtf * trp_q * trp_d + sm)
        else:
            # log(1) == 0, so the component is zero whatever the denominator is.
            # Short-circuiting avoids the division by log(0 + 1) == 0 for a term that
            # only occurs at position 0, which used to turn the whole score into NaN
            # when that term was also the first query term (0 * inf).
            individual = 0.0

        # collection importance
        ci = (cf / n_terms) * (n_docs / df)
        idf = np.log(n_docs / df)

        rank_score += lmd * individual + (1 - lmd) * (ci * idf + sm)

    return rank_score

## Ranking Documents

In [None]:
# Run main.py (not part of this notebook) with ranker name 'plnr' on the TREC-COVID index and queries.
# Presumably writes ranking_plnr.txt, the file submitted in the next cell -- confirm in main.py.
!python main.py plnr indexes/trec_covid query.csv

In [None]:
# Submit ranking_plnr.txt to the si-650eecs-549-ranker competition (with an empty submission message).
!kaggle competitions submit -c si-650eecs-549-ranker -f ranking_plnr.txt -m ""

In [None]:
# Run main.py (not part of this notebook) with ranker name 'bm25' on the gaming index and queries.
# Presumably writes ranking_bm25.txt, the file submitted in the next cell -- confirm in main.py.
!python main.py bm25 indexes/gaming query_gaming.csv

In [None]:
# Submit ranking_bm25.txt to the si-650eecs-549-rankergaming competition (with an empty submission message).
!kaggle competitions submit -c si-650eecs-549-rankergaming -f ranking_bm25.txt -m ""

In [None]:
# Run main.py (not part of this notebook) with ranker name 'custom' on the android index and queries.
# Presumably this uses the score() function defined above and writes ranking_custom.txt -- confirm in main.py.
!python main.py custom indexes/android query_android.csv

In [None]:
# Submit ranking_custom.txt to the si-650eecs-549-rankerandroid competition (with an empty submission message).
!kaggle competitions submit -c si-650eecs-549-rankerandroid -f ranking_custom.txt -m ""