In [None]:
#import necessary packages and modules
import ast
import os
import re

import numpy as np
import pandas as pd

from preprocessing_utils import *

# Load the contractions mapping, which is stored on disk as Python-literal text
with open('contractions.txt') as contractions_file:
    contractions = contractions_file.read()
print("Data type before reconstruction : ", type(contractions))

# Parse the text into a Python object. ast.literal_eval only accepts literals,
# so nothing from the file is executed; the file is expected to hold a dict.
# NOTE(review): contractions_dict is not passed to expand_contractions() in the
# cells below — confirm the helper obtains its mapping some other way.
contractions_dict = ast.literal_eval(contractions)

print("Data type after reconstruction : ", type(contractions_dict))

In [None]:
# Queries: training split first, then testing split.
train_queries, test_queries = (
    pd.read_csv(csv_path)
    for csv_path in ("train_queries.csv", "test_queries.csv")
)

# Document corpus: training split first, then testing split.
train_corpus, test_corpus = (
    pd.read_csv(csv_path)
    for csv_path in ("training_corpus.csv", "testing_corpus.csv")
)

In [None]:
# Step 1: lowercase the raw document body into a new 'cleaned' column.
for corpus_df in (train_corpus, test_corpus):
    corpus_df['cleaned'] = corpus_df['body'].apply(lambda body: body.lower())

# Step 2: expand contractions in the lowercased text
# (expand_contractions comes from preprocessing_utils).
for corpus_df in (train_corpus, test_corpus):
    corpus_df['cleaned'] = corpus_df['cleaned'].apply(expand_contractions)

In [None]:
# Run the clean_text helper (from preprocessing_utils) over every document,
# overwriting the 'cleaned' column in place: training corpus, then testing corpus.
for corpus_df in (train_corpus, test_corpus):
    corpus_df['cleaned'] = corpus_df['cleaned'].apply(clean_text)

In [None]:
# Stopword removal and lemmatization with spaCy
import spacy
import en_core_web_sm

# Load the small English pipeline with the 'ner' and 'parser' components disabled.
nlp = en_core_web_sm.load(disable=['ner','parser'])
# Raise the pipeline's maximum accepted text length.
nlp.max_length=5000000

from tqdm.notebook import tqdm
# Enables the .progress_apply method used below.
tqdm.pandas()


def _lemmatize_without_stopwords(text):
    """Return the space-joined lemmas of every token in ``text`` that is not a stopword."""
    kept_lemmas = []
    for token in nlp(text):
        if not token.is_stop:
            kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


# Build the 'lemmatized' column from 'cleaned': training corpus, then testing corpus.
for corpus_df in (train_corpus, test_corpus):
    corpus_df['lemmatized'] = corpus_df['cleaned'].progress_apply(_lemmatize_without_stopwords)

- **We have now pre-processed our documents. It’s time to pre-process our queries.**

In [None]:
# Each step below runs on the training queries first, then the testing queries,
# and writes its result to the 'cleaned' column.
query_frames = (train_queries, test_queries)

# Step 1: lowercase the raw query text.
for queries_df in query_frames:
    queries_df['cleaned'] = queries_df['query'].apply(lambda text: text.lower())

# Step 2: expand contractions.
for queries_df in query_frames:
    queries_df['cleaned'] = queries_df['cleaned'].apply(expand_contractions)

# Step 3: clean_text helper from preprocessing_utils.
for queries_df in query_frames:
    queries_df['cleaned'] = queries_df['cleaned'].apply(clean_text)

# Step 4: collapse every run of spaces into a single space.
for queries_df in query_frames:
    queries_df['cleaned'] = queries_df['cleaned'].apply(lambda text: re.sub(' +', ' ', text))

In [None]:
# Combine corpus and queries into one shuffled Series of training texts.
# The corpus contributes its lemmatized text, the queries their cleaned text;
# both are named 'text' so the concatenated Series keeps that name.
# Selecting the source columns directly (instead of renaming the whole frame and
# then picking 'text') avoids ambiguity if a frame already has a 'text' column.
SHUFFLE_SEED = 42  # fixed seed so the shuffle is identical on every Restart & Run All

combined_training = (
    pd.concat([
        train_corpus['lemmatized'].rename('text'),
        train_queries['cleaned'].rename('text'),
    ])
    .sample(frac=1, random_state=SHUFFLE_SEED)  # frac=1 -> full shuffle of all rows
    .reset_index(drop=True)
)

- We will train a word2vec model and use it to generate vectors for the documents and queries in the testing set for information retrieval. Before that, we need to prepare the dataset for training the word2vec model.
- Please note that we have already created the training set, but we want to use the same word2vec model to generate vectors for both documents and queries. Thus, we combine the documents and queries into a single pandas Series of texts (`combined_training`).

In [None]:
from gensim.models import Word2Vec

# Build the training data: one whitespace-split token list per text.
train_data = [text.split() for text in combined_training]

# Train a word2vec model on the training data: sg=1 selects skip-gram,
# vectors are 300-dimensional, and words seen fewer than 2 times are dropped.
w2v_model = Word2Vec(train_data, vector_size=300, min_count=2,window=5, sg=1,workers=4)

# Create the output directory first, so the save cannot fail on a missing
# folder after the (expensive) training step has already run.
os.makedirs("models", exist_ok=True)
w2v_model.save("models/word2vec.model")

# Vocabulary size
print('Vocabulary size:', len(w2v_model.wv.index_to_key))

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

# Compute one Word2Vec embedding per document with aggregate_embedding_w2v
# (from preprocessing_utils), which receives the trained model and the
# document's whitespace-split tokens. Testing corpus first, then training corpus.
for corpus_df in (test_corpus, train_corpus):
    corpus_df['embeddings'] = corpus_df['lemmatized'].progress_apply(
        lambda doc: aggregate_embedding_w2v(w2v_model, doc.split())
    )

In [None]:
# Create the output directory up front so np.save cannot fail on a missing folder.
os.makedirs('data/word2vec', exist_ok=True)

# Save the embeddings of the documents in the test set.
# The per-document arrays are joined along axis 0.
# NOTE(review): presumably each embedding is shaped (1, dim), giving an
# (n_docs, dim) matrix — confirm against aggregate_embedding_w2v.
test_embeddings = np.concatenate(test_corpus.embeddings.values, axis=0)
np.save('data/word2vec/test_embeddings.npy', test_embeddings)

# Save the embeddings of the documents in the train set (same layout as above).
train_embeddings = np.concatenate(train_corpus.embeddings.values, axis=0)
np.save('data/word2vec/train_embeddings.npy', train_embeddings)