## Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import gensim.downloader as gensim_api 

from tqdm import tqdm 
tqdm.pandas()
import gc

In [2]:
# Load the preprocessed train/test splits from the working directory.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text with "" so that `.split()` works on every row.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df[col]`: that form operates on a temporary
# Series, warns on pandas 2.x and leaves the DataFrame untouched under
# Copy-on-Write.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# Labels of the training split as a NumPy array.
train_dset_y = train_dset_df["target"].to_numpy()

In [3]:
# Number of rows in the training split.
NUM_TRAIN_EXAMPLES = train_dset_df.shape[0]

## Embedder function

In [4]:
def embedding_entire_dset(dset_df_rows, embedding_vectors, num_embedding_columns=None):
    '''
    Turn every text into one fixed-size feature row by pooling its word vectors.

    For each row, the vectors of the words that exist in `embedding_vectors`
    are pooled four ways (mean, sum, element-wise min, element-wise max) and
    the four pooled vectors are placed side by side.

    dset_df_rows: A pandas Series (or any sized iterable). Each entry is a string of
        space-separated words without numbers and special characters. Rows are read
        by position, so the Series index does not have to be 0..n-1. Entries that
        are not strings (e.g. NaN) are treated as empty text.
    embedding_vectors: A non-trainable embedding vectors instance from Gensim. It must
        support `word in embedding_vectors` and `embedding_vectors[list_of_words]`.
    num_embedding_columns: The number of columns of a single word vector. When omitted,
        `embedding_vectors.vector_size` is used if present, otherwise 300.

    Returns: a float array of shape (len(dset_df_rows), 4 * num_embedding_columns) with
        columns ordered [mean | sum | min | max]. Rows without any in-vocabulary
        word stay all zeros.
    '''
    if num_embedding_columns is None:
        num_embedding_columns = getattr(embedding_vectors, "vector_size", 300)
    dim = num_embedding_columns
    num_embedding_rows = len(dset_df_rows)

    # One output buffer written in place; building four separate arrays and
    # concatenating them would briefly hold the whole feature matrix twice.
    embedding_X = np.zeros((num_embedding_rows, 4 * dim))

    # Iterating the Series yields its values in positional order, so a
    # filtered / shuffled index cannot cause a KeyError or a misaligned row.
    for row_index, text in enumerate(tqdm(dset_df_rows)):
        if not isinstance(text, str):
            continue  # missing text -> keep the zero row
        words = [word for word in text.split() if word in embedding_vectors]
        if not words:
            continue  # no known word -> keep the zero row
        sentence_embedding_matrix = embedding_vectors[words]
        embedding_X[row_index, 0 * dim:1 * dim] = np.mean(sentence_embedding_matrix, axis=0)
        embedding_X[row_index, 1 * dim:2 * dim] = np.sum(sentence_embedding_matrix, axis=0)
        embedding_X[row_index, 2 * dim:3 * dim] = np.min(sentence_embedding_matrix, axis=0)
        embedding_X[row_index, 3 * dim:4 * dim] = np.max(sentence_embedding_matrix, axis=0)
    return embedding_X


## Embedding and Standardization

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Load the "word2vec-google-news-300" vectors through the Gensim downloader,
# pool them into one feature row per training text, then standardize the
# features with a scaler fitted on the training split.
word2vec = gensim_api.load("word2vec-google-news-300")
scaler = StandardScaler()
train_embedded_X = scaler.fit_transform(
    embedding_entire_dset(train_dset_df["preprocessed_joined"], word2vec)
)

100%|██████████| 783673/783673 [01:11<00:00, 10892.28it/s]


In [9]:
# Drop the references to the embedding model and the scaler so their memory
# can be reclaimed.
word2vec = scaler = None

In [10]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

0

## PCA on embeddings

In [11]:
# from sklearn.decomposition import PCA

In [12]:
# pca_er = PCA(n_components=2)
# train_pca_ed_X = pca_er.fit_transform(train_embedded_X)
# train_pca_ed_X_zero = train_pca_ed_X[train_dset_y == 0,:]
# train_pca_ed_X_one = train_pca_ed_X[train_dset_y == 1,:]

In [13]:
# plt.figure(figsize=(32,18))
# plt.scatter(train_pca_ed_X_zero[:,0], train_pca_ed_X_zero[:,1], s=1, color="blue")
# plt.scatter(train_pca_ed_X_one[:,0], train_pca_ed_X_one[:,1], s=1, color="red")
# plt.savefig("2020_11_14_pca.png")  # save before show(); after show() the figure may already be cleared and the file would be blank
# plt.show()


In [14]:
# pca_er = None 
# train_pca_ed_X = None 
# train_pca_ed_X_one = None 
# train_pca_ed_X_zero = None 


In [15]:
# gc.collect()

## Could the L-X norm be a feature for this?

Answer: Maybe, if we were using nonlinear models, or as a last-ditch attempt.

In [16]:
# gc.collect()

In [17]:
# norms = np.linalg.norm(train_embedded_X, axis=1, ord=2)

In [18]:
# norms_zero = norms[train_dset_y == 0]
# norms_one = norms[train_dset_y == 1]

In [19]:
# import seaborn as sns 

In [20]:
# sns.displot(norms_zero, color="blue")
# sns.displot(norms_one, color="red")
# plt.show()

In [21]:
# sns.histplot(norms_one)

In [22]:
# np.mean(norms_zero)

In [23]:
# np.mean(norms_one)

In [24]:
# np.min(norms_zero)

In [25]:
# np.min(norms_one)

In [26]:
# np.max(norms_zero)

In [27]:
# np.max(norms_one)

In [28]:
# np.median(norms_zero)

In [29]:
# np.median(norms_one)

In [30]:
# np.var(norms_zero)

In [31]:
# np.var(norms_one)

In [32]:
# sns.boxplot(norms_zero, color="blue")

In [33]:
# sns.boxplot(norms_one, color="red")

## Trying clustering

In [34]:
from sklearn.cluster import KMeans

In [35]:
# Fix the seed: k-means starts from randomly chosen centroids, so without
# `random_state` the cluster ids change on every kernel restart.
clusterer = KMeans(n_clusters=100, random_state=42)

In [36]:
# Fit k-means on the standardized embeddings and keep the cluster id that
# each training row was assigned to.
cluster_labels = clusterer.fit(train_embedded_X).labels_

In [38]:
# Display the cluster id assigned to each training row.
cluster_labels

array([58, 81, 32, ..., 54, 47, 49], dtype=int32)