# Project 1 Task 1

### Contents
- Imports and flags
- Data
- Tokenization
- Vectorization
- Search with query

### Imports and flags

In [1]:
from nltk.stem import PorterStemmer
import nltk
from src.tokenization import *
from src.vectorization import *

In [2]:
F_reduced_dataset = False  # If true load only 1% of corpus and small portion of queries and train_set
F_do_tokenization = False  # If true tokenize corpus + queries, else load already tokenized documents

### Data

In [3]:
# Load
corpus, queries, train_set = load_task1_data(F_do_tokenization)

In [4]:
corpus.head()

Unnamed: 0_level_0,text
_id,Unnamed: 1_level_1
1867825,"After the invention of the cotton gin, cotton ..."
419610,"Timer has separate night and day outlets, whic..."
4614226,The rose-buying public still encounters a wide...
4108603,Map of Wendover (Aut) Airport. A detailed map ...
3744854,And as the poems Reapers and Cotton Song indic...


In [5]:
queries.head()

Unnamed: 0_level_0,text
_id,Unnamed: 1_level_1
1185869,)what was the immediate impact of the success ...
1185868,_________ justice is designed to repair the ha...
597651,what color is amber urine
403613,is autoimmune hepatitis a bile acid synthesis ...
1183785,elegxo meaning


In [6]:
train_set.head()

Unnamed: 0_level_0,corpus-id
query-id,Unnamed: 1_level_1
1185869,0
1185868,16
597651,49
403613,60
1183785,389


### Tokenization

In [7]:
nltk.download('stopwords')
stemmer = PorterStemmer()
token_dir = "Data/tokens/"

# Tokenize corpus and queries and save it, OR load it
if F_do_tokenization:
    tokenize_corpus_queries(corpus, queries, stemmer)
    save_tokenized_corpus_queries(token_dir, corpus, queries)
else:
    load_tokenized_corpus_queries(token_dir, corpus, queries)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aducret/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Files loaded


### Vocabulary

In [8]:
# Generate vocab from corpus
vocab_path = f"{token_dir}vocabulary.txt"
reduced_corpus = corpus.iloc[0:5]
vocab = generate_vocab_from_corpus(reduced_corpus)
save_vocab(vocab,vocab_path)
vocab

Unnamed: 0,word,method
0,10,Both
1,12,Custom
2,15,Both
3,1500,Custom
4,1790,Both
...,...,...
175,tools,TFID
176,underscored,TFID
177,understanding,TFID
178,variety,TFID


### 3. Vectorization

In [9]:
vectorized_corpus, vectorized_queries = vectorize_corpus_queries(corpus, queries)

### 4. Search with query

In [10]:
train_set

Unnamed: 0_level_0,corpus-id
query-id,Unnamed: 1_level_1
1185869,0
1185868,16
597651,49
403613,60
1183785,389
...,...
19285,8841362
558837,4989159
559149,8841547
706678,8841643


In [11]:
from sklearn.metrics.pairwise import linear_kernel
from operator import itemgetter

# Retrieval oracle 
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
features = tf.fit_transform(corpus.text.to_list()[:10000])
npm_tfidf = features.todense()

# Return all document ids that that have cosine similarity with the query larger than a threshold
def search_vec_sklearn(query, features, threshold=0.1):
    new_features = tf.transform([query])
    cosine_similarities = linear_kernel(new_features, features).flatten()
    related_docs_indices, cos_sim_sorted = zip(*sorted(enumerate(cosine_similarities), key=itemgetter(1), 
                                                       reverse=True))
    doc_ids = []
    for i, cos_sim in enumerate(cos_sim_sorted):
        if cos_sim < threshold:
            break
        doc_ids.append(related_docs_indices[i])
    return doc_ids

In [12]:
search_vec_sklearn(queries.iloc[0]["text"], features, threshold=0.1)[0]
k_search(vectorized_queries[0], vectorized_corpus, corpus.index, 1)

[7624917]

In [16]:

# Find every relevant doc for 10 first queries
for (q_id, row), query_vector in zip(train_set.iterrows(), vectorized_queries[:10]):
    docs_id = k_search(query_vector, vectorized_corpus, corpus.index, 1)
    train_set.at[q_id, "custom_id"] = int(docs_id[0])
    

In [17]:
train_set

Unnamed: 0_level_0,corpus-id,custom_id
query-id,Unnamed: 1_level_1,Unnamed: 2_level_1
1185869,0,7624917.0
1185868,16,3020376.0
597651,49,4944584.0
403613,60,5995440.0
1183785,389,389.0
...,...,...
19285,8841362,
558837,4989159,
559149,8841547,
706678,8841643,
