# Acting on test data, using the AKB 2020_09_27 model

In [2]:
# Folder the test CSV is read from.
DSET_FOLDER_PATH = "./dataset/quora/"
# Folder containing the GloVe embedding text file.
GLOVE_FOLDER_PATH = "./embeddings/glove/"
# Serialized model that is loaded for prediction later in the notebook.
MODEL_PATH = "pickledModels/2020_09_28_pickled_model.joblib"

## 1. Collecting NLTK and the dataset

In [3]:
import nltk
# nltk.download()

In [4]:
import numpy as np 
import matplotlib.pyplot as plt  
import pandas as pd 
import seaborn as sns 
import contractions

In [5]:
# Load test.csv from the dataset folder into a DataFrame.
test_data_df = pd.read_csv(DSET_FOLDER_PATH + 'test.csv')

In [6]:
# Show the first rows to check which columns were loaded.
test_data_df.head()

Unnamed: 0,qid,question_text
0,f56a9a31974dc66186e8,Is it a good idea to go through a phlebotomy c...
1,d957c3758060f45da303,How can I fix a lead into a camlin compass to ...
2,ad822d5abaedb9e247b9,How many animes are there in world?
3,4e979c23eeb6a4bd1f2e,How do I tell my family I cut?
4,333cc031262566b8da49,How do I save down my bitcoin image address fr...


## 2. Steps of pre-embedding preprocessing:


In [7]:
class Preprocessor_AKB:
    """Turn a raw question string into a list of cleaned, lemmatized tokens.

    Pipeline applied by ``preprocess``: lowercase -> expand contractions ->
    tokenize on ``\\w+`` runs -> drop stop words -> WordNet-lemmatize.

    Attributes:
        tokenizer: ``nltk`` regexp tokenizer that keeps runs of word characters.
        stopwords_corpus: set of stop words removed from every sentence.
        lemmatizer: ``nltk`` WordNet lemmatizer applied to surviving tokens.
    """

    def __init__(self):
        # Names imported inside ``__init__`` are local to it, so everything
        # ``preprocess`` needs later is stored on the instance instead of
        # relying on a notebook-global import being present.
        import nltk
        import contractions

        self._expand_contractions = contractions.fix
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        # NOTE(review): words() is called without a language, which appears to
        # load the stop words of every language NLTK ships, not just English.
        # Left unchanged because it has to match the preprocessing the pickled
        # model was trained with -- confirm before narrowing it.
        self.stopwords_corpus = set(nltk.corpus.stopwords.words())
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def preprocess(self, sentence):
        """Clean one sentence.

        Args:
            sentence: raw text as a ``str``.

        Returns:
            List of lowercase, lemmatized tokens with stop words removed
            (empty list when nothing survives).
        """
        expanded = self._expand_contractions(sentence.lower())
        tokens = self.tokenizer.tokenize(expanded)
        return [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stopwords_corpus
        ]

In [8]:
# One shared instance; constructing it builds the tokenizer, the stop-word set and the lemmatizer.
preprocessor = Preprocessor_AKB()

## 3. GloVe Embeddings

### 3.1. Importing the embeddings

In [9]:
class Glove_Embedder:
    """Sentence embedder backed by a GloVe-style text file.

    Each non-blank line of the file is ``<token> <float> <float> ...``. All
    vectors are kept in memory in ``glove_embeddings_dict`` (token -> 1-D
    ``numpy`` array) and ``embedding_vector_size`` is the length of the first
    vector read (0 if the file contains no vectors).
    """

    def __init__(self, PATH_TO_TEXTFILE):
        """Parse the embeddings file.

        Args:
            PATH_TO_TEXTFILE: path of the whitespace-separated embeddings file.
        """
        self.glove_embeddings_dict = {}
        # Defined up front so the attribute exists even for an empty file.
        self.embedding_vector_size = 0
        # ``with`` closes the file even if a line fails to parse; the encoding
        # is pinned so parsing does not depend on the platform default
        # (GloVe text files are expected to be UTF-8).
        with open(PATH_TO_TEXTFILE, 'r', encoding='utf-8') as glove_embeddings_file:
            for line in glove_embeddings_file:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline): nothing to parse.
                    continue
                vector = np.array(fields[1:], dtype=float)
                if not self.glove_embeddings_dict:
                    # First vector read fixes the embedding dimensionality.
                    self.embedding_vector_size = vector.size
                self.glove_embeddings_dict[fields[0]] = vector

    def get_embedding_for_sentence(self, sentence_list):
        '''
        Average the embeddings of the words in a sentence.

        The sentence should be lowercased and free of special characters and
        numbers. Ideally, it should be lemmatized, too. The sentence should be
        a list of words.

        Words missing from the vocabulary contribute a zero vector but still
        count towards the divisor (the sum is divided by the total number of
        words), which is the behaviour the existing pipeline relies on.

        Returns a plain list of floats of length ``embedding_vector_size``;
        an empty sentence yields an all-zero list.
        '''
        number_of_words = len(sentence_list)
        embedding = np.zeros((self.embedding_vector_size, ))
        for word in sentence_list:
            vector = self.glove_embeddings_dict.get(word)
            if vector is not None:
                embedding += vector
        if number_of_words:
            embedding /= number_of_words
        # Always a list, so callers get one return type for every input.
        return embedding.tolist()
            
        

In [10]:
# Parse the GloVe text file into an in-memory word -> vector lookup
# (the file name suggests 50-dimensional vectors -- confirm).
embedder = Glove_Embedder(GLOVE_FOLDER_PATH + "glove.6B.50d.txt")

## 4. Putting it all together to obtain ndarrays 

In [11]:
from tqdm import tqdm 
# Registers the progress_apply method that the Series calls below rely on.
tqdm.pandas()

In [12]:
# Replace each raw question string with its list of cleaned tokens (progress bar shown).
test_data_df["question_text"] = test_data_df["question_text"].progress_apply(preprocessor.preprocess)

100%|██████████| 522449/522449 [00:19<00:00, 26710.31it/s]


In [13]:
# Replace each token list with its averaged GloVe sentence embedding (progress bar shown).
test_data_df["question_text"] = test_data_df["question_text"].progress_apply(embedder.get_embedding_for_sentence)

100%|██████████| 522449/522449 [00:07<00:00, 73387.68it/s]


In [14]:
# Stack the per-question embedding lists into one NumPy array for the model.
test_X = np.array(test_data_df["question_text"].tolist())

## 5. Loading the pickled model and predicting on the test set

In [15]:
import pickle
import joblib

In [16]:
# Two-step load: joblib.load returns the stored object, which is then passed
# to pickle.loads to rebuild the model.
# SECURITY: unpickling can execute arbitrary code -- only load model files
# that come from a trusted source.
pickled_model = joblib.load(MODEL_PATH)
log_reg = pickle.loads(pickled_model)

In [17]:
import sklearn.linear_model 
import sklearn.metrics

In [18]:
# Predict a label for every embedded test question.
test_Yhat = log_reg.predict(test_X)

In [19]:
# Display the predictions.
test_Yhat

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
# Overwrite the embedding column with the model's predictions.
test_data_df["question_text"] = test_Yhat

In [26]:
# Rename the column that now holds the predictions to "target" on
# test_data_df, the frame this notebook loads and exports, so the CSV
# header reads qid,target.
test_data_df = test_data_df.rename(columns={"question_text": "target"})

In [28]:
# Export the frame to CSV; index=False leaves the row index out of the file.
test_data_df.to_csv("2020_09_28_testset_output.csv", index=False)
