In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer

# Read data

In [2]:
# Shell escape: list the raw input files available in the data/ directory.
!ls data/

Documents.csv.xls          relevance_train.csv.xls
queries.csv.xls            submission_example.csv.xls
relevance_test.csv.xls


In [3]:
# Relevance judgements. Note the differing delimiters: the training file is
# tab-separated while the test file is comma-separated.
relevance_train = pd.read_csv(
    'data/relevance_train.csv.xls',
    sep='\t',
    index_col=False,
)
relevance_test = pd.read_csv(
    'data/relevance_test.csv.xls',
    sep=',',
    index_col=False,
)

In [4]:
# Preview the test pairs (QueryId, DocumentId); there is no Relevance column to predict from.
relevance_test.head(2)

Unnamed: 0,QueryId,DocumentId
0,126,974
1,126,1326


In [5]:
# Preview the training pairs, which additionally carry a Relevance label.
relevance_train.head(2) 

Unnamed: 0,QueryId,DocumentId,Relevance
0,1,184,2
1,1,29,2


In [6]:
# Query texts (tab-separated file).
queries = pd.read_csv('data/queries.csv.xls', sep='\t', index_col=False)

In [7]:
# Row count before any cleaning, followed by a preview of the first two queries.
print(len(queries))
queries.head(2)

226


Unnamed: 0,QueryId,Query
0,1.0,what similarity laws must be obeyed when const...
1,2.0,what are the structural and aeroelastic proble...


In [8]:
# Drop every row that contains a missing value.
# NOTE: dropna() keeps the original index labels, so the index is no longer
# guaranteed to be a gap-free 0..n-1 range after this cell.
queries = queries.dropna()
len(queries)

225

In [9]:
# Read the whole document collection into memory as a single string
# (opened with the platform's default text encoding).
with open('data/Documents.csv.xls') as file:
    documents_data = file.read()

In [10]:
# Cut the collection into chunks at every '.Id ' marker.
# Presumably each record starts with '.Id <number>' — the parsing step below
# relies on the first line of a chunk being the integer document id.
raw_documents = documents_data.split('.Id ')
len(raw_documents)

1482

In [11]:
# Build a lookup: integer document id -> document text.
# The first line of a chunk must parse as the integer id; the text kept for the
# document is everything after the last '.W' marker in the chunk (or the whole
# chunk when the marker is absent).
id_to_doc = {}
for d in raw_documents:
    try:
        index = int(d.split('\n', 1)[0])
    except ValueError:
        # The chunk does not start with an integer id, so it is not a usable
        # document record. Only this expected parse failure is skipped; any
        # other error is allowed to surface instead of being silently swallowed.
        continue
    text = d.rpartition('.W')[-1]
    id_to_doc[index] = text

In [12]:
# Sanity check: number of parsed documents and the text stored for document 1.
len(id_to_doc), id_to_doc[1]

(1400,
 '\nexperimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was found to agree\nwell with a potential flow theory .\n  an empirical evaluation of the destalling effects was made for\nthe specific configuration of the experiment .\n')

# TF-IDF for documents + queries list

In [13]:
# TF-IDF features over unigrams and bigrams; every other setting is left at
# its default (see the repr below).
vectorizer = TfidfVectorizer(ngram_range=(1,2)) #words and pairs
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
# Fitting corpus: all document texts followed by all query texts, so documents
# and queries are encoded with one shared vocabulary and one set of IDF weights.
docs_for_vectorzer_list = list(id_to_doc.values()) + list(queries['Query'])
len(docs_for_vectorzer_list)

1625

In [15]:
# Learn the vocabulary and IDF weights from the combined corpus.
vectorizer.fit(docs_for_vectorzer_list)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

# Rank it

$$\textbf{RANK_SCORE}(d,q) = \frac{\left(\text{tf_idf}(d),\text{tf_idf}(q)\right)}{||\text{tf_idf}(d)||\cdot||\text{tf_idf}(q)||}$$
$$\textbf{ranked}(q) = \text{sorted}_d\left(\textbf{RANK_SCORE}(d,q)\right) \quad \text{(descending: most similar document first)}$$

In [16]:
# Build a lookup: integer query id -> query text.
# Ids and texts are paired row by row with zip(). Indexing queries['Query'] with
# an enumerate() counter would be a label-based lookup, which raises KeyError or
# pairs an id with the wrong text once dropna() has left a gap in the index.
id_to_query = {
    int(qid): query
    for qid, query in zip(queries['QueryId'], queries['Query'])
}

In [17]:
# Sanity check: text stored for query 1.
id_to_query[1]

'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft . '

In [18]:
def tfidf_cosine_similarity(doc_id, query_id):
    """Return the cosine similarity of a document's and a query's TF-IDF vectors.

    The texts are looked up in the notebook-level ``id_to_doc`` and
    ``id_to_query`` dictionaries and encoded with the already fitted
    notebook-level ``vectorizer``.

    Parameters
    ----------
    doc_id, query_id
        Identifiers of the document and the query; each is converted with
        ``int()`` before the dictionary lookup.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(doc_vec, query_vec)``, or ``np.nan``
        when either id is unknown or the distance cannot be computed.
    """
    doc_key = int(doc_id)
    if doc_key not in id_to_doc:
        return np.nan
    query_key = int(query_id)
    if query_key not in id_to_query:
        return np.nan

    # Encode both texts in a single call (rows are transformed independently),
    # then densify: scipy's cosine() expects 1-D dense arrays. toarray() yields a
    # plain ndarray, so no np.matrix / np.ravel round-trip is needed.
    doc_enc, query_enc = vectorizer.transform(
        [id_to_doc[doc_key], id_to_query[query_key]]
    ).toarray()

    try:
        # cosine() is a distance; similarity = 1 - distance.
        return 1 - cosine(doc_enc, query_enc)
    except (ValueError, ArithmeticError):
        # Best-effort scoring: a pair that cannot be scored is reported as NaN.
        # Only computation errors are caught, so KeyboardInterrupt and
        # programming errors are no longer swallowed.
        return np.nan

In [19]:
# Spot-check the scorer: document 1 against query 1 and against query 2.
tfidf_cosine_similarity(1,1) , tfidf_cosine_similarity(1,2)

(0.0037532024443338585, 0.01280849839869902)

In [20]:
%%time
# relevance_train['score'] = [
#     tfidf_cosine_similarity(d, q) 
#     for d,q in zip(relevance_train['DocumentId'], relevance_train['QueryId'])] 
relevance_test['score'] = [
    tfidf_cosine_similarity(d, q) 
    for d,q in zip(relevance_test['DocumentId'], relevance_test['QueryId'])
]

# fill nan values with some random pair score
nan_pos = np.isnan(relevance_test['score'])
relevance_test.loc[nan_pos,'score'] = np.random.choice(relevance_test['score'].dropna(), size = np.sum(nan_pos))

CPU times: user 1.32 s, sys: 23.5 ms, total: 1.34 s
Wall time: 1.36 s


In [21]:
# Preview the test pairs with their new score column.
relevance_test.head(3)

Unnamed: 0,QueryId,DocumentId,score
0,126,974,0.006894
1,126,1326,0.002019
2,126,187,0.003095


In [22]:
# Order all test pairs from highest to lowest score.
# NOTE(review): this is one global ordering across all queries, not a separate
# ranking per QueryId — confirm that this is what the submission format expects.
test_sorted = relevance_test.sort_values(by='score',ascending=False)

In [23]:
# Preview the highest-scoring pairs.
test_sorted.head(4)

Unnamed: 0,QueryId,DocumentId,score
159,149,1131,0.1579
430,186,1309,0.137144
155,149,934,0.137144
153,149,930,0.121118


In [24]:
# Keep only the id columns, in ranked order, and write them as a
# comma-separated submission file without the DataFrame index.
output_test = test_sorted[["QueryId", "DocumentId"]]
output_test.to_csv("out1.csv", sep=',', index=False)

### Kaggle private score: 0.51045