# Embedding AND CountV -- SGDC

## Imports and preprocessing

In [1]:
import gc
import numpy as np 
import gensim.downloader as gensim_api 
from tqdm import tqdm 
tqdm.pandas()
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.metrics import f1_score, plot_confusion_matrix, confusion_matrix, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer


def summarize(y, yhat):
    '''
    Print classification metrics for a set of predictions and return the F1 score.

    y: Ground-truth labels.
    yhat: Predicted labels, aligned element-wise with y.

    Prints the F1 score, precision, recall and confusion matrix, in that order.
    Every scikit-learn metric is called with its default arguments.

    Returns the F1 score that was printed.
    '''
    # Compute F1 once and reuse it for both the report and the return value.
    f1 = f1_score(y, yhat)
    print("F1 score:", f1)
    print("Precision:", precision_score(y, yhat))
    print("Recall:", recall_score(y, yhat))
    print("Confusion matrix:")
    print(confusion_matrix(y, yhat))
    return f1

def embedding_entire_dset(dset_df_rows, embedding_vectors, num_embedding_columns=None):
    '''
    Turn each sentence into one fixed-size feature row by pooling its word embeddings.

    dset_df_rows: A pandas Series (any sized iterable of strings works). Each entry
        is a string of space-separated words. Rows are consumed in positional order,
        so the Series index does not need to be 0..n-1.
    embedding_vectors: A non-trainable embedding vectors instance from Gensim. It must
        support `word in embedding_vectors` and `embedding_vectors[list_of_words]`.
    num_embedding_columns: Dimensionality of a single word vector. When None it is
        read from `embedding_vectors.vector_size`, falling back to 300.

    Returns a float array of shape (len(dset_df_rows), 4 * num_embedding_columns)
    laid out as [mean | sum | min | max] of the word vectors of each row. Rows that
    contain no known word (or are not strings, e.g. NaN) are left as all zeros.
    '''
    if num_embedding_columns is None:
        num_embedding_columns = getattr(embedding_vectors, "vector_size", 300)
    width = num_embedding_columns
    num_embedding_rows = len(dset_df_rows)

    # Allocate the final matrix once and write each pooled vector straight into its
    # column band; building four separate matrices and concatenating them would
    # briefly need twice the memory.
    embedding_X = np.zeros((num_embedding_rows, 4 * width))

    # Iterate over the values rather than indexing with dset_df_rows[row_index]:
    # Series indexing by integer is label-based, so it would fail or misalign rows
    # whenever the index is not the default RangeIndex.
    for row_index, text in enumerate(tqdm(dset_df_rows, total=num_embedding_rows)):
        if not isinstance(text, str):
            # Missing values are treated like an empty sentence.
            continue
        words = [word for word in text.split() if word in embedding_vectors]
        if not words:
            continue
        sentence_embedding_matrix = embedding_vectors[words]
        embedding_X[row_index, 0 * width:1 * width] = np.mean(sentence_embedding_matrix, axis=0)
        embedding_X[row_index, 1 * width:2 * width] = np.sum(sentence_embedding_matrix, axis=0)
        embedding_X[row_index, 2 * width:3 * width] = np.min(sentence_embedding_matrix, axis=0)
        embedding_X[row_index, 3 * width:4 * width] = np.max(sentence_embedding_matrix, axis=0)
    return embedding_X

In [2]:
# Load the preprocessed training set and make sure every text entry is a string.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: the in-place form operates on a temporary under pandas Copy-on-Write
# and would leave the NaN values in the DataFrame.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")

# Dense features: pooled word2vec embeddings for every row.
word2vec = gensim_api.load("word2vec-google-news-300")
train_embedded_X = embedding_entire_dset(train_dset_df["preprocessed_joined"],word2vec)
# Drop the embedding model as soon as the features are built to free its memory.
word2vec = None 
[gc.collect() for i in range(3)]

# Sparse features: bag-of-words counts over the same text column.
vectorizer = CountVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
train_bow_X = vectorizer.transform(train_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()
# The DataFrame is no longer needed once features and labels are extracted.
train_dset_df = None 
# Kept as the cell's final expression so the notebook still displays its result.
[gc.collect() for i in range(3)]


100%|██████████| 783673/783673 [01:13<00:00, 10682.96it/s]


[0, 0, 0]

## Hstack and Conversion to CSR

In [3]:
# Number of rows in the dense embedding feature matrix.
NUM_DATA_POINTS = train_embedded_X.shape[0]
# Total column count when the embedding and bag-of-words features sit side by side.
FINAL_WIDTH = train_embedded_X.shape[1] + train_bow_X.shape[1]


In [4]:
# Maximum number of rows combined per chunk when the feature matrices are hstacked.
BLOCK_SIZE = 5000

In [5]:
import scipy.sparse

In [6]:
# Collects the per-chunk combined feature matrices before they are stacked vertically.
final_X = []

In [7]:
# Combine the dense embedding features with the sparse bag-of-words counts,
# BLOCK_SIZE rows at a time, appending each combined chunk to final_X as CSR.
#
# The chunks are addressed by row offset instead of repeatedly rebinding the inputs
# to their remaining tail: a basic slice of the dense array is only a view that
# keeps the whole buffer alive (so nothing was freed), and slicing the sparse
# matrix re-copied the entire remainder on every iteration. The temporaries created
# here are released by reference counting, so no explicit gc passes are needed.
num_rows = train_embedded_X.shape[0]
for block_number, row_start in enumerate(range(0, num_rows, BLOCK_SIZE)):
    if block_number % 10 == 0:
        # Progress report: first row index of every tenth chunk.
        print(row_start)
    row_stop = min(row_start + BLOCK_SIZE, num_rows)
    final_X.append(
        scipy.sparse.hstack(
            [train_embedded_X[row_start:row_stop, :], train_bow_X[row_start:row_stop, :]]
        ).tocsr()
    )
    

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000


In [8]:
# Drop the reference to the bag-of-words matrix; its rows were already copied into final_X.
train_bow_X = None

In [9]:
# Drop the reference to the dense embedding matrix; its rows were already copied into final_X.
train_embedded_X = None

In [10]:
# Collapse final_X into a single matrix by repeatedly merging neighbouring chunks.
# Each outer pass walks the list once, replacing element i with the vertical stack
# of elements i and i+1 and removing the absorbed neighbour, so the list roughly
# halves per pass until one matrix remains.
k = 0
while len(final_X) > 1:
    print("Outside", k)
    k = k + 1
    i = 0
    while i < len(final_X):
        print("Inside", i)
        # The last element of an odd-length pass has no partner and is left as is.
        if i + 1 < len(final_X):
            final_X[i] = scipy.sparse.vstack(final_X[i:i + 2])
            del final_X[i + 1]
            [gc.collect() for _ in range(3)]
        i = i + 1

Outside 0
Inside 0
Inside 1
Inside 2
Inside 3
Inside 4
Inside 5
Inside 6
Inside 7
Inside 8
Inside 9
Inside 10
Inside 11
Inside 12
Inside 13
Inside 14
Inside 15
Inside 16
Inside 17
Inside 18
Inside 19
Inside 20
Inside 21
Inside 22
Inside 23
Inside 24
Inside 25
Inside 26
Inside 27
Inside 28
Inside 29
Inside 30
Inside 31
Inside 32
Inside 33
Inside 34
Inside 35
Inside 36
Inside 37
Inside 38
Inside 39
Inside 40
Inside 41
Inside 42
Inside 43
Inside 44
Inside 45
Inside 46
Inside 47
Inside 48
Inside 49
Inside 50
Inside 51
Inside 52
Inside 53
Inside 54
Inside 55
Inside 56
Inside 57
Inside 58
Inside 59
Inside 60
Inside 61
Inside 62
Inside 63
Inside 64
Inside 65
Inside 66
Inside 67
Inside 68
Inside 69
Inside 70
Inside 71
Inside 72
Inside 73
Inside 74
Inside 75
Inside 76
Inside 77
Inside 78
Outside 1
Inside 0
Inside 1
Inside 2
Inside 3
Inside 4
Inside 5
Inside 6
Inside 7
Inside 8
Inside 9
Inside 10
Inside 11
Inside 12
Inside 13
Inside 14
Inside 15
Inside 16
Inside 17
Inside 18
Inside 19
Inside 20


In [11]:
# Sanity check: the merge loop should have left exactly one matrix in the list.
len(final_X)

1

In [12]:
# Display the list to inspect the shape, dtype and sparse format of the merged matrix.
final_X

[<783673x56172 sparse matrix of type '<class 'numpy.float64'>'
 	with 949351499 stored elements in Compressed Sparse Row format>]

In [13]:
# Unwrap the single merged matrix so final_X is the matrix itself rather than a list.
final_X=final_X[0]