In [None]:
import gensim.downloader as api
from sklearn.manifold import TSNE
import re
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score

import altair as alt

!pip install datasets
import datasets

In [None]:
wv = api.load('word2vec-google-news-300')

### Visualize the most frequent words


In [None]:
# take top 3000 words 
N = 5000
subset_words = wv.index2entity[:N]

In [None]:
vecs = np.zeros((N, 300))
for i, w in enumerate(subset_words):
  vecs[i,:] = wv[w]

In [None]:
tsne = TSNE(n_components=2, random_state=0, init='pca', learning_rate = 100)
Y = tsne.fit_transform(vecs)

In [None]:
df = pd.DataFrame(Y)
df.columns = ['x', 'y']
df["word"] = subset_words

In [None]:
chart = alt.Chart(df).mark_circle().encode(
    x='x',
    y='y',
    tooltip=['word']).interactive().properties(
    width=800,
    height=800
)

chart.save('word2vec.html')

## Text classification

In [61]:
# same as in tdidf notebook
dataset = datasets.load_dataset("tweet_eval", "irony")
df_train = dataset["train"].to_pandas()
df_val = dataset["validation"].to_pandas()
df_test = dataset["test"].to_pandas()

def transform_text(text):
  text = text.lower()
  text = re.sub('[^A-Za-z0-9 ]+', '', text)
  text = re.sub('\s+(a|is|be|will|the|was|were|have|has|are|been|s|ll)\s+', '', text)
  return text

def create_documents_list(l):
  temp_vocab = [i.split(' ') for i in l]
  documents_list = [[j for j in i if len(j)>0] for i in temp_vocab]
  return documents_list

df_train.text = df_train.text.apply(lambda x: transform_text(x))
df_test.text = df_test.text.apply(lambda x: transform_text(x))
df_val.text = df_val.text.apply(lambda x: transform_text(x))

documents_list_train = create_documents_list(df_train.text.to_list())
documents_list_val = create_documents_list(df_val.text.to_list())
documents_list_test = create_documents_list(df_test.text.to_list())

Reusing dataset tweet_eval (/root/.cache/huggingface/datasets/tweet_eval/irony/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [93]:
# create function that takes average embeding per document
def compute_embeddings_per_document(document, wv):
  n_dims = wv[wv.index2entity[0]].shape[0]
  result = np.zeros((len(document), n_dims))
  for i, w in enumerate(document):
    if w in wv:
      result[i, :] = wv[w]
  result = result.mean(axis=0)
  return result 

def get_w2v_features(documents_list, wv):
  n_dims = wv[wv.index2entity[0]].shape[0]
  result = np.zeros((len(documents_list), n_dims))
  for i, d in enumerate(documents_list):
    result[i, :] = compute_embeddings_per_document(d, wv)
  return result

In [94]:
X = get_w2v_features(documents_list_train, wv)
y = df_train.label

In [95]:
clf = LogisticRegression()
clf.fit(X, y )
y_hat = clf.predict(X)
f1_score(y,y_hat)

0.665031534688157

In [96]:
#validation
y_hat_val = clf.predict(get_w2v_features(documents_list_val, wv))
f1_score(df_val.label,y_hat_val)

0.6226622662266227

In [92]:
# test
y_hat_test = clf.predict(get_w2v_features(documents_list_test, wv))
f1_score(df_test.label,y_hat_test)