# Project 1 Task 1

### Contents
1. Imports and flags
2. Tokenization
3. Vectorization
4. Search with query

### 1. Imports and flags

#### 1.1 Librairies

In [2]:
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
import numpy as np
import pickle
import os
from operator import itemgetter
from collections import Counter, defaultdict
nltk.download('stopwords')
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /home/atappy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


#### 1.2 Flags <span style="color:red"> !!! CHECK BEFORE RUNING EACH CELLS !!! </span>

In [21]:
F_reduced_dataset = True  # If true load only 1% of corpus and small portion of queries and train_set
F_do_tokenization = True # If true tokenize corpus + queries, else load already tokenized documents

#### 1.3 Data

In [29]:
corpus = pd.read_json('Data/corpus.jsonl', lines=True).set_index(['_id'])
queries = pd.read_json('Data/queries.jsonl', lines=True)[['_id', 'text']].set_index(['_id'])
train_set = pd.read_table("Data/task1_train.tsv")[["query-id","corpus-id"]].set_index(['query-id'])

if F_reduced_dataset:
    corpus = corpus.head(15000)
    queries = queries.head(15000)
    train_set = train_set.head(5000)


In [23]:
# Display data
print("corpus", "\n", corpus.head(3), "\n")
print("queries", "\n", queries.head(3), "\n")
print("train_set", "\n", train_set.head(3) )


corpus 
                                                       text
_id                                                       
1867825  After the invention of the cotton gin, cotton ...
419610   Timer has separate night and day outlets, whic...
4614226  The rose-buying public still encounters a wide... 

queries 
                                                       text
_id                                                       
1185869  )what was the immediate impact of the success ...
1185868  _________ justice is designed to repair the ha...
597651                           what color is amber urine 

train_set 
           corpus-id
query-id           
1185869           0
1185868          16
597651           49


### 2. Tokenization

#### 2.1 Tokenization tools

In [24]:
def remove_punctuation(text):
    """
    Remove any punctuation charachter from a text (str)
    """
    return "".join([ch for ch in text if ch not in string.punctuation])

def tokenize(text):
    """
    Transform a text (str) in a list of stemmed words (list[str])
    """
    tokens = nltk.word_tokenize(text)
    tokens = [remove_punctuation(token) for token in tokens]
    tokens = [stemmer.stem(word.lower()) for word in tokens if word not in stopwords.words('english') and len(word)>1]
    return tokens

#### 2.2 Tokenize corpus 

In [30]:
# Tokenize corpus and queries
if F_do_tokenization:
    corpus['tokens'] = corpus['text'].apply(lambda x: tokenize(x))
    queries['tokens'] = queries['text'].apply(lambda x: tokenize(x))

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/atappy/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [6]:
token_dir = "Data/tokens/"

# Save tokenized corpus by splitting into gittable/small files
if F_do_tokenization:
    n_sample_per_file = 70000
    n_sample = len(corpus)
    n_files =  n_sample//n_sample_per_file + 1

    for i in range(0, n_files):
        first = i*n_sample_per_file
        last = min(first + n_sample_per_file, n_sample)
        corpus_tokens_part = corpus["tokens"].iloc[first:last]
        
        filename = f"{token_dir}corpus_tokens_{i:02d}.pkl"
        corpus_tokens_part.to_pickle(filename)
    
    queries["tokens"].to_pickle(f"{token_dir}queries_tokens.pkl")
    print("Files saved")

In [12]:
# Load tokenized corpus as a dataframe
if not F_do_tokenization:
    corpus = pd.DataFrame()
    queries = pd.DataFrame()
    files_paths=os.listdir(token_dir)
    corpus_tokens_paths= [f"{token_dir+path}" for path in files_paths if 'corpus_tokens' in path and ".pkl" in path]
    corpus_tokens_paths.sort()
    dfs = [pd.read_pickle(path) for path in corpus_tokens_paths]
    corpus["tokens"] = pd.concat(dfs)
    queries["tokens"] = pd.read_pickle(f"{token_dir}queries_tokens.pkl")
    print("Files loaded")

Files loaded


#### 2.3 Explore vocabulary

#### 2.3.0 Check vocab by document

In [18]:
vocab_index = 4
tokens = corpus.iloc[0:vocab_index]['tokens']
vocab_custom = sorted(list(set([ x for y in tokens for x in y])))
if not F_do_tokenization:
    texts = corpus.iloc[0:vocab_index]['tokens'].apply(lambda x: ' '.join(x))
else:
    texts = corpus.iloc[0:vocab_index]['text']

tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
tf.fit_transform(texts)
vocab_cheat = sorted(list(tf.vocabulary_.keys()))

In [19]:
print("exclusive to custom vocab")
for v in vocab_custom :
    if v not in vocab_cheat:
        print(v)

exclusive to custom vocab
after
americaâ
around
by
detail
for
in
may
one
per
still
the
you


In [20]:
print("exclusive to cheat vocab")
for v in vocab_cheat :
    if v not in vocab_custom :
        print(v)

exclusive to cheat vocab
americaâ


##### 2.3.1 Custom vocab

In [24]:
vocabulary = list(set([ x for y in corpus['tokens'] for x in y]))
vocabulary.sort()

##### 2.3.2 Cheat vocab

In [26]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
tf.fit_transform(corpus['text'])
vocabulary_cheat = sorted(list(tf.vocabulary_.keys()))

#### 2.4 Save vocabularies

In [25]:
with open(r'Data/vocabulary.txt', 'w') as fp:
    for item in vocabulary:
        # write each item on a new line
        fp.write("%s\n" % item)

with open(r'Data/vocabulary_cheat.txt', 'w') as fp:
    for item in vocabulary_cheat:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')

Done


### 3. Vectorization

#### 3.1 Define vectorization functions 

##### 3.1.1 Inverse Document Frequency - IDF

In [None]:
def compute_idf(documents):
    """
    Compute Inverse Document Frequency (IDF) for each term in all documents 
    
    documents : numpy array of tokens
    """
    
    total_documents = len(documents)
    word_document_count = defaultdict(int)

    average_number_words = 0 #global variable to compute average number of distinct words per document

    for document in documents:
        unique_words = set(document)
        average_number_words += len(unique_words)
        for word in unique_words:
            word_document_count[word] += 1
    average_number_words = average_number_words / total_documents

    idf = {}
    for word, count in word_document_count.items():
        idf[word] = math.log(total_documents / (count))

    return average_number_words, defaultdict(float,idf)

##### 3.1.2 Term frequency - TF

In [None]:
def compute_tf(document, average_number_words, s = 0.2):
    """
    Compute Term Frequency (TF) for each term in a document and normalize it using the pivoted unique query normalization

    document : 
    average_number_words :
    s : normalization parameter

    """
    word_counts = Counter(document)
    unique_words_count = len(set(document)) # TODO : Use already computed unique_words_count in IDF ?
    tf = {word: (count / max(word_counts.values())) / ((1.0-s)*average_number_words + s*unique_words_count) for word, count in word_counts.items()}
    return tf

##### 3.1.3 Vectorize a document

In [None]:
def vectorize(tokens, idf, average_number_words):
    """
    Compute TF-IDF weights for each term in all documents -> vectorize each document
    """
    vector = {}
    tf = compute_tf(tokens, average_number_words)
    vector.update({word: tf[word] * idf[word] for word in tf.keys()})
    return vector

#### 3.2 Vectorize corpus and queries

In [None]:
corpus_tokens_array = corpus['tokens'].to_numpy()
queries_tokens_array = queries['tokens'].to_numpy()
average_number_words, idf = compute_idf(corpus_tokens_array)
vectorized_corpus = [vectorize(document_tokens, idf, average_number_words) for document_tokens in corpus_tokens_array]
vectorized_queries = [vectorize(query_tokens, idf, average_number_words) for query_tokens in queries_tokens_array]

### 4. Search with query

#### 4.1 Define searching function

##### 4.1.1 Cosine similarity function

In [None]:
# Function to compute cosine similarity
def cosine_similarity(v1,v2):
    sumxx, sumxy, sumyy = 0, 0, 0
# y to default dict ?
    v2 = defaultdict(float, v2)
    for i in v1:
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    if sumxy == 0:
            result = 0
    else:
            result = sumxy/math.sqrt(sumxx*sumyy)
    return  result 

##### 4.1.3 Find k most relevant document for query

In [None]:
def k_search(query_vector, vectorized_corpus, corpus_ids, k=10):
    similarities = np.array([cosine_similarity(query_vector, doc_vec) for doc_vec in vectorized_corpus])
    corpus_ids = corpus_ids[similarities.argsort()[::-1]]

    return (corpus_ids[:k].to_list())

##### 4.2 Example : Search for 1st query

In [None]:
query_vector = vectorized_queries[0]
print(k_search(query_vector, vectorized_corpus, corpus.index))

#### 4.3 Find most relevant documents for all queries 

In [None]:
for q_id, query_vector in zip(queries.index.to_list(), vectorized_queries):
    docs_id = k_search(query_vector, vectorized_corpus, corpus.index, 1)
    true_doc_id = train_set.loc[q_id]["corpus-id"]
    