# 0. Data Loading

In [1]:
import sys
import pandas as pd
import numpy as np
import pickle

if ".." not in sys.path:
    sys.path.insert(0, "..")


from DataHandling.db_reader import Reader
from DataHandling.train_data import load_data
from Ranker.bmtf import BM25
from Ranker.tfidf import TfIdf
from Ranker.pwsvm import RankSVM
from Ranker.feature_extractor import Features
from searching import searcher

In [2]:
# Pull the three parallel page fields out of the project's Reader and combine
# them into one frame, one row per page.
r = Reader()
titles, bodies, urls = r.get_titles(), r.get_bodies(), r.get_urls()

df = pd.DataFrame(dict(title=titles, body=bodies, url=urls))
# Preview only the first rows; the bodies are long text.
df.head()

Unnamed: 0,title,body,url
0,Home | University of Tübingen,Skip to main navigation Skip to content Skip t...,https://uni-tuebingen.de/en/
1,Uni A-Z | University of Tübingen,Skip to main navigation Skip to content Skip t...,https://uni-tuebingen.de/en/uni-a-z/
2,Contact | University of Tübingen,Skip to main navigation Skip to content Skip t...,https://uni-tuebingen.de/en/contact/
3,Excellence Strategy | University of Tübingen,Skip to main navigation Skip to content Skip t...,https://uni-tuebingen.de/en/excellence-strategy/
4,Prospective students | University of Tübingen,Skip to main navigation Skip to content Skip t...,https://uni-tuebingen.de/en/study/prospective-...


# 1. BM25

In [3]:
# Score every page body against the query with BM25 and print the URLs of the
# ten highest-scoring pages, best first.
bm25 = BM25()
scores = bm25.get_scores(query='food', docs=df['body'])
# np.argsort returns row *positions* in ascending score order; reverse it for
# best-first and select with .iloc so the lookup is positional. Plain
# df['url'][...] would look the positions up as index *labels*, which is only
# correct while df keeps its default 0..n-1 index.
top_positions = np.argsort(scores)[::-1][:10]
print(df['url'].iloc[top_positions].values)

['https://fit.uni-tuebingen.de/Project/Details?id=10364'
 'https://www.my-stuwe.de/en/refectory/allergens/'
 'http://cyber-valley.de/en/events/aixia-ai-conference-2020'
 'https://www.my-stuwe.de/en/refectory/'
 'https://uni-tuebingen.de/en/faculties/faculty-of-science/departments/psychology/research-groups/clinical-psychology-and-psychotherapy/outpatient-clinic/'
 'https://uni-tuebingen.de/en/research/centers-and-institutes/international-center-for-ethics-in-the-sciences-and-humanities/research/nature-and-sustainable-development/bioeconomy/'
 'https://uni-tuebingen.de/en/research/centers-and-institutes/international-center-for-ethics-in-the-sciences-and-humanities/teaching/studium-oecologicum/'
 'http://cyber-valley.de/en/news/cyber-valley-adds-five-shooting-stars-to-its-start-up-network'
 'https://uni-tuebingen.de/en/university/news-and-publications/press-releases/press-releases/article/ravens-were-attracted-to-humans-food-more-than-30000-years-ago/'
 'https://uni-tuebingen.de/en/rese

# 2. TF-IDF (Vector-Space-Model)

In [4]:
# Build the TF-IDF ranker on the page bodies, score every body against the
# query, and print the URLs of the ten highest-scoring pages, best first.
tfidf = TfIdf(corpus=df['body'])
scores = tfidf.get_scores(query='food', docs=df['body'])
# np.argsort returns row *positions* in ascending score order; reverse it for
# best-first and select with .iloc so the lookup is positional. Plain
# df['url'][...] would look the positions up as index *labels*, which is only
# correct while df keeps its default 0..n-1 index.
top_positions = np.argsort(scores)[::-1][:10]
print(df['url'].iloc[top_positions].values)

['https://fit.uni-tuebingen.de/Project/Details?id=10364'
 'https://www.my-stuwe.de/en/refectory/allergens/'
 'http://cyber-valley.de/en/events/aixia-ai-conference-2020'
 'https://uni-tuebingen.de/en/research/centers-and-institutes/international-center-for-ethics-in-the-sciences-and-humanities/research/nature-and-sustainable-development/bioeconomy/'
 'https://www.my-stuwe.de/en/refectory/guidelines/'
 'https://uni-tuebingen.de/en/university/news-and-publications/press-releases/press-releases/article/ravens-were-attracted-to-humans-food-more-than-30000-years-ago/'
 'http://cyber-valley.de/en/news/cyber-valley-adds-five-shooting-stars-to-its-start-up-network'
 'https://www.my-stuwe.de/en/refectory/'
 'https://www.my-stuwe.de/en/refectory/cafeteria-unibibliothek-tuebingen/'
 'https://uni-tuebingen.de/en/faculties/faculty-of-science/departments/psychology/research-groups/clinical-psychology-and-psychotherapy/outpatient-clinic/']


In [5]:
# Build the per-page ranking features for a single query from the three text
# fields of the crawl, then summarise each feature column.
features = Features(
    query='europe',
    url=df['url'],
    title=df['title'],
    body=df['body'],
)
features.get_features().describe()

Unnamed: 0,url_bm25,url_idf,url_vsm,title_bm25,title_idf,title_vsm,body_bm25,body_idf,body_vsm
count,5162.0,5162.0,5162.0,5162.0,5162.0,5162.0,5162.0,5162.0,5162.0
mean,0.0,7.757514,0.00031,0.0,7.469832,0.00046,0.001091,3.03308,0.002949
std,0.0,0.0,0.010104,0.0,2.664793e-15,0.012717,0.078367,8.882645e-16,0.011282
min,0.0,7.757514,0.0,0.0,7.469832,0.0,0.0,3.03308,0.0
25%,0.0,7.757514,0.0,0.0,7.469832,0.0,0.0,3.03308,0.0
50%,0.0,7.757514,0.0,0.0,7.469832,0.0,0.0,3.03308,0.0
75%,0.0,7.757514,0.0,0.0,7.469832,0.0,0.0,3.03308,0.0
max,0.0,7.757514,0.382406,0.0,7.469832,0.469613,5.630409,3.03308,0.214705


# Model Training

In [6]:
# Feature name -> column index. The values are handed to load_data as
# `feature_indices` (in this insertion order), so the order of the entries
# matters and must not be rearranged.
# NOTE(review): presumably 0-based MSLR-WEB10K feature columns - confirm.
dic = dict(
    # URL stream
    url_bm25=108,
    url_idf=18,
    url_vsm=103,
    url_covered_query_term_number=3,
    url_query_term_ratio=8,
    url_stream_length=13,
    url_n_slash=125,
    url_len_url=126,
    # Title stream
    title_bm25=107,
    title_idf=17,
    title_vsm=102,
    title_covered_query_term_number=2,
    title_query_term_ratio=7,
    title_stream_length=12,
    # Body stream
    body_bm25=105,
    body_idf=15,
    body_vsm=100,
    body_covered_query_term_number=0,
    body_query_term_ratio=5,
    body_stream_length=10,
)

In [7]:
# Load the first 1000 rows of the training fold, keeping only the feature
# columns selected in `dic` (in the dict's insertion order).
X_train, y_train = load_data(
    path='../../../data/MSLR-WEB10K/Fold1/train.txt',
    feature_indices=list(dic.values()),
    nrows=1000,
)

In [20]:
# Fit the pairwise ranking SVM on the loaded training rows and persist it.
# `pickle` is already imported in the notebook's first cell.
model = RankSVM()
model.fit(X_train, y_train)
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open(...) passed to pickle.dump is never closed.
with open('../models/svm.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)




In [4]:
# Run the end-to-end search with the pairwise-SVM ranker on the crawled pages.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'models/svm.pkl', while the training cell writes the model to
# '../models/svm.pkl'. Presumably `searcher` resolves the model path relative
# to the current working directory - confirm and align the two paths.
searcher(query='europe', df=df, ranker_str='pwsvm')

FileNotFoundError: [Errno 2] No such file or directory: 'models/svm.pkl'

In [None]:
# Score the training rows with the fitted model.
# NOTE: this rebinds `scores`, which the BM25 and TF-IDF cells above also
# assign; after this cell it holds the model's scores, not retrieval scores.
scores = model.get_scores(X_train)

In [13]:
# Distribution of the model's scores: distinct values and how often each occurs.
# NOTE(review): the recorded output is a single value 0 for all 1000 rows,
# i.e. the model does not discriminate between documents - investigate.
np.unique(scores, return_counts=True)

(array([0]), array([1000]))

# 3. Pairwise SVM

# 4. Neural Network