## Imports and embedding
See 2020_10_28_clustering_and_visualization_glove.ipynb

In [1]:
class Glove_Embedder:
    '''
    In-memory lookup table of word vectors read from a GloVe-style text file.

    Every non-blank line of the file is split on whitespace: the first field is
    the token, the remaining fields are parsed as the float components of its vector.
    '''
    def __init__(self, PATH_TO_TEXTFILE):
        '''
        Parse the embeddings file at PATH_TO_TEXTFILE.

        Raises whatever open() raises if the file cannot be read, and ValueError
        if a vector component cannot be parsed as a float.
        '''
        # Maps token -> 1-D float ndarray.
        self.glove_embeddings_dict = {}
        # Length of the first vector read; stays 0 when the file holds no vectors,
        # so the lookup methods still work (they return empty lists).
        self.embedding_vector_size = 0
        # `with` closes the handle even if a line fails to parse; the explicit
        # encoding keeps the result independent of the platform default.
        with open(PATH_TO_TEXTFILE, 'r', encoding='utf-8') as glove_embeddings_file:
            for line in glove_embeddings_file:
                splitted = line.split()
                if not splitted:
                    # Blank line: nothing to index (previously an IndexError).
                    continue
                key = splitted[0]
                value = np.array([float(i) for i in splitted[1:]])
                if not self.glove_embeddings_dict:
                    # First vector seen fixes the size used for zero vectors.
                    self.embedding_vector_size = value.size
                self.glove_embeddings_dict[key] = value
    def get_embedding_for_sentence(self, sentence_list):
        '''
        Return the averaged embedding of a sentence as a list of floats.

        The sentence should be lowercased and free of special characters and numbers. Ideally, it should be lemmatized, too. The sentence should be a list of words.

        The sum of the known words' vectors is divided by the total number of
        words, so words without a vector count as zero vectors. An empty
        sentence yields an all-zero list.
        '''
        number_of_words = len(sentence_list)
        embedding = np.zeros((self.embedding_vector_size, ))
        if number_of_words == 0:
            # Same return type as the non-empty path (a list, not an ndarray).
            return embedding.tolist()
        for word in sentence_list:
            if word in self.glove_embeddings_dict:
                embedding += self.glove_embeddings_dict[word]
        embedding /= number_of_words
        return embedding.tolist()
    def get_embedding_for_word(self, word):
        '''
        Return the vector for `word` as a list of floats, or an all-zero list
        of length embedding_vector_size when the word is not in the table.
        '''
        if word in self.glove_embeddings_dict:
            embedding = self.glove_embeddings_dict[word]
        else:
            embedding = np.zeros((self.embedding_vector_size, ))
        return embedding.tolist()

In [2]:
import time
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Load the preprocessed train and test splits.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
# Replace missing text with the empty string. The filled column is assigned
# back instead of calling fillna(inplace=True) on the selected column: that
# chained in-place form is deprecated in recent pandas and does not modify
# the parent frame under copy-on-write.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# Bag-of-words features: the vocabulary is fitted on the training text only,
# then applied to both splits.
vectorizer = CountVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
original_sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
original_sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()

def summarize(y, yhat):
    '''
    Print a short report comparing true labels with predicted labels.

    y    -- 1-dimensional ndarray of true labels; every entry is 0 or 1.
    yhat -- 1-dimensional ndarray of predictions; same size as y, every entry 0 or 1.

    Five lines are printed: the number of zeros and ones in y, the F1 score,
    the number of true zeros predicted as one, and the number of true ones
    predicted as zero. Nothing is returned.
    '''
    truly_zero = (y == 0)
    print("Number of zeros in y:", np.count_nonzero(truly_zero))
    truly_one = (y == 1)
    print(" Number of ones in y:", np.count_nonzero(truly_one))
    print("            F1 score:", f1_score(y, yhat))
    # True zeros that were predicted as one.
    print(" # of zeros wrong yh:", np.count_nonzero(truly_zero & (yhat == 1)))
    # True ones that were predicted as zero.
    print("  # of ones wrong yh:", np.count_nonzero(truly_one & (yhat == 0)))

In [3]:
import nltk
class Stopword_Remover:
    '''
    Removes stopwords from sentences whose words are separated by single spaces.
    '''
    def __init__(self):
        # Set for constant-time membership checks.
        # NOTE(review): words() is called without a language argument; presumably
        # this loads the stopword lists of every language NLTK ships. Confirm
        # that is intended rather than English only.
        self.stopwordCorpus = set(nltk.corpus.stopwords.words())
    def stopword_removed(self, sentence_str):
        '''
        Return sentence_str with every token found in the stopword set dropped.

        Tokens are obtained by splitting on a single space and the survivors
        are re-joined with a single space, so empty tokens produced by
        consecutive spaces are kept unless the empty string is itself a stopword.
        '''
        kept_tokens = []
        for token in sentence_str.split(" "):
            if token in self.stopwordCorpus:
                continue
            kept_tokens.append(token)
        return " ".join(kept_tokens)
# Strip stopwords from the training text, overwriting the column; progress_apply
# shows a tqdm progress bar while the function runs over each row.
# NOTE(review): only train_dset_df is updated here. test_dset_df and the
# vectorizer fitted earlier still reflect the text before stopword removal.
srem = Stopword_Remover()
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].progress_apply(srem.stopword_removed)


100%|██████████| 783673/783673 [00:02<00:00, 357938.65it/s]


In [4]:
# Vocabulary of the training text: all sentences are joined into one
# space-separated string, split back into tokens, de-duplicated and sorted.
unique_words_list = sorted(
    set(" ".join(train_dset_df["preprocessed_joined"].tolist()).split(" "))
)

In [5]:
# Vocabulary size before filtering down to words that have an embedding.
len(unique_words_list)

54264

In [6]:
# Load the GloVe vectors from the text file into memory.
# NOTE(review): the path is hard-coded relative to the working directory.
embedder = Glove_Embedder("./embeddings/glove/glove.6B.50d.txt")

In [7]:
# Keep only the vocabulary words that have a usable embedding.
# get_embedding_for_word returns an all-zero list for unknown words, so a
# vector with no non-zero component is treated as "no embedding" and skipped.
# The two lists stay index-aligned: embeddings_list[i] belongs to embedded_words_list[i].
embedded_words_list, embeddings_list = [], []
for word in unique_words_list:
    curr_embedding = embedder.get_embedding_for_word(word)
    has_vector = any(component != 0 for component in curr_embedding)
    if not has_vector:
        continue
    embedded_words_list.append(word)
    embeddings_list.append(curr_embedding)

In [8]:
# Number of vocabulary words that have an embedding.
len(embedded_words_list)

52857

In [9]:
# Sanity check: should equal len(embedded_words_list), since both lists are appended to together.
len(embeddings_list)

52857

## Clustering

In [10]:
# Stack the per-word vectors (lists of floats) into one ndarray, one row per
# word in embedded_words_list.
embeddings_nd = np.array(embeddings_list)

In [11]:
from sklearn.cluster import AgglomerativeClustering

In [12]:
# Ward-linkage agglomerative clustering into 10 clusters.
# The explicit affinity="euclidean" argument is omitted: the `affinity`
# keyword was deprecated and later removed from scikit-learn in favour of
# `metric`. Euclidean distance is the default and the only distance ward
# linkage accepts, so leaving it out gives the same model on old and new versions.
aggcls = AgglomerativeClustering(n_clusters=10, linkage="ward")

In [13]:
# Fit the clustering and get one cluster label per row of embeddings_nd
# (i.e. per word in embedded_words_list).
# Memory-heavy step: the note in this notebook reports about 26GB for this run.
labels = aggcls.fit_predict(embeddings_nd)

In [14]:
# Quick look at the assigned cluster labels.
labels

array([0, 5, 7, ..., 3, 3, 2])

Well, that worked, even though it took 26 GB to handle.

In [15]:
# Column-oriented mapping pairing each embedded word with its cluster label,
# ready to be turned into a DataFrame.
groupable_dict = dict(words=embedded_words_list, labels=labels.tolist())

In [16]:
import pandas as pd 

In [17]:
# One row per embedded word, with columns "words" and "labels".
labels_df = pd.DataFrame(groupable_dict)

In [18]:
# Preview the first rows of the word/label table.
labels_df.head()

Unnamed: 0,words,labels
0,aaa,0
1,aaaa,5
2,aab,7
3,aac,3
4,aach,2


In [19]:
# Group the words by their cluster label so each cluster can be handled separately.
grouped_labels_df = labels_df.groupby(by="labels")

In [20]:
# Write one text file per cluster, one word per line, named after the cluster label.
for name, group in grouped_labels_df:
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding, which could fail on tokens containing non-ASCII characters.
    with open("2020_11_5_agglo_group_"+str(name)+".txt", "w", encoding="utf-8") as file_handle:
        words_str = "\n".join(group["words"].to_list())
        # Trailing newline so the file ends with a complete line.
        file_handle.write(words_str + '\n')