## Imports and data loading

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import gensim.downloader as gensim_api 

from tqdm import tqdm 
tqdm.pandas()
import gc

In [2]:
# Load the preprocessed train/test splits (file names suggest no stemming and
# no stop-word removal were applied — confirm against the preprocessing step).
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing texts with "" so that downstream `.split()` calls work
# (presumably empty strings were read back from the CSV as NaN).
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is a chained in-place operation, which is deprecated and
# leaves the DataFrame untouched when pandas Copy-on-Write is enabled.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# Labels for the training split as a plain NumPy array.
train_dset_y = train_dset_df["target"].to_numpy()

## Embedder function

In [3]:
def embedding_entire_dset(dset_df_rows, embedding_vectors, num_embedding_columns=None):
    '''
    Build a fixed-width feature matrix by pooling the word vectors of each text.

    For every entry, the vectors of its in-vocabulary words are pooled four ways
    (mean, sum, element-wise min, element-wise max) and the four pooled vectors are
    placed side by side in that order. Entries without any in-vocabulary word are
    left as all zeros.

    dset_df_rows: A pandas Series (or any iterable) of strings. Each entry is a string of
        space-separated words without numbers and special characters. Entries are read
        in positional order, so the Series index does not need to be 0..n-1.
    embedding_vectors: A non-trainable embedding vectors instance from Gensim. It must
        support `word in embedding_vectors` and `embedding_vectors[list_of_words]`
        returning one row per word.
    num_embedding_columns: The number of columns of a single word vector. When None
        (the default) it is read from `embedding_vectors.vector_size`, falling back
        to 300 if that attribute is absent.

    Returns: A float64 NumPy array of shape (len(dset_df_rows), 4 * num_embedding_columns).
    '''
    if num_embedding_columns is None:
        num_embedding_columns = getattr(embedding_vectors, "vector_size", 300)

    # Materialise the values positionally: `series[i]` is a label lookup and
    # fails (or picks the wrong row) when the index is not 0..n-1.
    texts = list(dset_df_rows)
    num_embedding_rows = len(texts)

    # Allocate the final matrix once and fill its four column groups in place,
    # instead of building four full-size arrays and concatenating them.
    embedding_X = np.zeros((num_embedding_rows, 4 * num_embedding_columns))
    mean_cols = slice(0, num_embedding_columns)
    sum_cols = slice(num_embedding_columns, 2 * num_embedding_columns)
    min_cols = slice(2 * num_embedding_columns, 3 * num_embedding_columns)
    max_cols = slice(3 * num_embedding_columns, 4 * num_embedding_columns)

    for row_index, text in enumerate(tqdm(texts)):
        # Out-of-vocabulary words are silently dropped.
        words = [word for word in text.split() if word in embedding_vectors]
        if words:
            sentence_embedding_matrix = embedding_vectors[words]
            embedding_X[row_index, mean_cols] = np.mean(sentence_embedding_matrix, axis=0)
            embedding_X[row_index, sum_cols] = np.sum(sentence_embedding_matrix, axis=0)
            embedding_X[row_index, min_cols] = np.min(sentence_embedding_matrix, axis=0)
            embedding_X[row_index, max_cols] = np.max(sentence_embedding_matrix, axis=0)
    return embedding_X


## Embedding and Standardization

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
# Fetch the pretrained vectors registered under this name in gensim's downloader.
word2vec = gensim_api.load("word2vec-google-news-300")

# Pool the word vectors of every training text, then standardize each feature
# column with statistics fitted on the training set.
scaler = StandardScaler()
train_embedded_X = scaler.fit_transform(
    embedding_entire_dset(train_dset_df["preprocessed_joined"], word2vec)
)

100%|██████████| 783673/783673 [01:13<00:00, 10712.28it/s]


In [6]:
# Drop the references to the embedding model and the scaler so that the
# objects they pointed to become eligible for garbage collection.
word2vec = scaler = None

In [7]:
# Run a full garbage-collection pass now that the references above were dropped;
# the displayed value is the number of unreachable objects found.
gc.collect()

0

## Trying clustering

In [8]:
from hdbscan import HDBSCAN

In [9]:
# HDBSCAN clusterer using Manhattan distance; every other hyperparameter
# is left at the library default.
clusterer = HDBSCAN(metric="manhattan")

In [10]:
# Fit the clusterer on the standardized training embeddings.
# NOTE(review): the matrix has ~783k rows (per the progress bar above) and
# 4 * 300 columns; fitting on the full set may be very slow or memory-hungry —
# consider trying a subsample first.
clusterer.fit(train_embedded_X)