## Imports, classdefs, stopword removal

In [1]:
class Glove_Embedder:
    """Look up GloVe word vectors and average them into sentence embeddings.

    Instance attributes:
      glove_embeddings_dict -- maps each token to its 1-D numpy vector.
      embedding_vector_size -- length of the first vector read from the
                               file (0 if the file contained no vectors).
    """

    def __init__(self, PATH_TO_TEXTFILE):
        """Load the embeddings stored in the text file PATH_TO_TEXTFILE.

        Every non-blank line is split on whitespace: the first field is the
        token, the remaining fields are parsed as floats.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a vector component is not a valid float.
        """
        self.glove_embeddings_dict = {}
        # Defined up front so the attribute exists even for an empty file.
        self.embedding_vector_size = 0
        # The context manager closes the file even if parsing raises; the
        # explicit encoding keeps non-ASCII tokens readable on every platform.
        with open(PATH_TO_TEXTFILE, 'r', encoding='utf-8') as glove_embeddings_file:
            for line in glove_embeddings_file:
                splitted = line.split()
                if not splitted:
                    # Blank line (e.g. trailing newline): nothing to parse.
                    continue
                value = np.array([float(i) for i in splitted[1:]])
                if not self.glove_embeddings_dict:
                    # The first vector fixes the embedding dimensionality.
                    self.embedding_vector_size = value.size
                self.glove_embeddings_dict[splitted[0]] = value

    def get_embedding_for_sentence(self, sentence_list):
        '''
        Return the averaged embedding of a sentence as a list of floats.

        The sentence should be lowercased and free of special characters and
        numbers. Ideally, it should be lemmatized, too. The sentence should
        be a list of words.

        Words missing from the vocabulary contribute a zero vector but still
        count towards the divisor. An empty sentence yields a list of zeros.
        '''
        number_of_words = len(sentence_list)
        embedding = np.zeros((self.embedding_vector_size, ))
        for word in sentence_list:
            vector = self.glove_embeddings_dict.get(word)
            if vector is not None:
                embedding += vector
        if number_of_words:
            embedding /= number_of_words
        # Always a list, including the empty-sentence case.
        return embedding.tolist()

    def get_embedding_for_word(self, word):
        '''
        Return the embedding of word as a list of floats; a list of zeros
        if the word is not in the vocabulary.
        '''
        if word in self.glove_embeddings_dict:
            embedding = self.glove_embeddings_dict[word]
        else:
            embedding = np.zeros((self.embedding_vector_size, ))
        return embedding.tolist()

import time
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Load the preprocessed (no stemming, no stopword removal) train/test splits.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text (NaN) with empty strings. The result is assigned back
# instead of calling fillna(inplace=True) on the column selection: that
# chained in-place form is deprecated in pandas 2.x and does not update the
# DataFrame once Copy-on-Write is enabled.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# Bag-of-words counts: the vocabulary is learned on the training text only
# and then applied unchanged to the test text.
vectorizer = CountVectorizer()
original_sparse_train_x = vectorizer.fit_transform(train_dset_df["preprocessed_joined"])
original_sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])

# Labels of the training rows as a numpy array.
train_dset_y = train_dset_df["target"].to_numpy()

def summarize(y, yhat):
    '''
    Print class counts, the F1 score and both kinds of misclassification.

    y (true labels) and yhat (predicted labels) are 1-dimensional ndarrays
    of the same size whose entries are all either 0 or 1.
    '''
    # Each entry pairs a printed label with a deferred computation, so the
    # values are evaluated and printed one line at a time, in this order.
    report = (
        ("Number of zeros in y:", lambda: np.sum((y == 0).astype(int))),
        (" Number of ones in y:", lambda: np.sum((y == 1).astype(int))),
        ("            F1 score:", lambda: f1_score(y, yhat)),
        # True 0 predicted as 1.
        (" # of zeros wrong yh:", lambda: np.sum(np.logical_and(y == 0, yhat == 1).astype(int))),
        # True 1 predicted as 0.
        ("  # of ones wrong yh:", lambda: np.sum(np.logical_and(y == 1, yhat == 0).astype(int))),
    )
    for label, compute in report:
        print(label, compute())

# Shared embedder instance backed by the 50-dimensional GloVe 6B vectors file.
embedder = Glove_Embedder(
    PATH_TO_TEXTFILE="./embeddings/glove/glove.6B.50d.txt",
)
import nltk
class Stopword_Remover:
    """Drop stopwords from space-separated sentences."""

    def __init__(self):
        # NOTE(review): words() is called without a language/fileid, so it
        # presumably merges the stopword lists of every language NLTK ships
        # -- confirm this is intended.
        all_stopwords = nltk.corpus.stopwords.words()
        # A set gives constant-time membership tests.
        self.stopwordCorpus = set(all_stopwords)

    def stopword_removed(self, sentence_str):
        """Return sentence_str without the tokens found in stopwordCorpus.

        Tokens are obtained by splitting on single spaces and the survivors
        are re-joined with single spaces.
        """
        kept_tokens = []
        for token in sentence_str.split(" "):
            if token in self.stopwordCorpus:
                continue
            kept_tokens.append(token)
        return " ".join(kept_tokens)
# Build the remover once, then store for every training row the list of
# tokens that remain after stopword removal (with a tqdm progress bar).
srem = Stopword_Remover()
train_dset_df["preprocessed_stopword_removed"] = train_dset_df["preprocessed_joined"].progress_apply(
    lambda sentence: srem.stopword_removed(sentence).split()
)

100%|██████████| 783673/783673 [00:03<00:00, 248351.92it/s]


In [2]:
# Column of per-row token lists produced by the stopword-removal step.
preprocessed_stopword_removed = train_dset_df["preprocessed_stopword_removed"]

In [3]:
# Plain Python list of the sentences, for cheap positional access below.
list_of_sentences = preprocessed_stopword_removed.tolist()

In [5]:
# Number of sentences that will be embedded.
N_SENTENCES = len(list_of_sentences)
# Take the dimensionality from the loaded embedder instead of hard-coding 50,
# so it cannot drift from the GloVe file that was actually read.
EMBEDDING_SIZE = embedder.embedding_vector_size

In [6]:
# One row per sentence, one column per embedding dimension; filled in below.
embeddings = np.zeros(shape=(N_SENTENCES, EMBEDDING_SIZE), dtype=np.float64)

In [7]:
# Fill each row with the averaged GloVe vector of the matching sentence.
for i, sentence in enumerate(tqdm(list_of_sentences)):
    embeddings[i, :] = embedder.get_embedding_for_sentence(sentence)

100%|██████████| 783673/783673 [00:20<00:00, 38788.55it/s]
