In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import nltk
nltk.download('stopwords')
nltk.download('punkt')


from word2vec_selection.functions import *

In [None]:
# if data not saved as csv, run this
import os
if not os.path.exists('data/merged_titles_labels.csv'):
    df1 = pd.read_csv('../eda/small1/labeled.csv')
    df2 = pd.read_csv('../eda/small2/labeled.csv')
    df3 = pd.read_csv('../eda/small3/labeled.csv')
    df = pd.concat([df1, df2, df3], ignore_index=True).reset_index(drop=True)
    df.to_csv('data/merged_titles_labels.csv', index=False)
    df.head()
else:
    df = pd.read_csv('data/merged_titles_labels.csv')
df.head()

In [None]:
%run -i "word2vec_selection/functions.py"
if not os.path.exists('data/preprocessed_titles_labels.pkl'):
    df = preprocess_title(df, verbose=True)
    df.to_pickle('data/preprocessed_titles_labels.pkl') 

else:
    df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
df.head()

In [None]:
df['title'][0]

In [34]:
df['sample'].value_counts()

sample
train    66664
test      7407
Name: count, dtype: int64

In [35]:
df_train = df[df['sample']=='train'].reset_index(drop=True)

In [36]:
df_train['sample'].value_counts()

sample
train    66664
Name: count, dtype: int64

In [38]:
from tqdm.notebook import tqdm
import time

EPOCHS = 500
WORKERS = 16
MIN_COUNT = 1


# train many word2vec models with diferent VECTOR_SIZE and WINDOW

VECTOR_SIZEs = [
    # 10,
    # 20,
    # 50,
    100, 
    250, 
    # 500, 
    # 1000, 
   
]

WINDOWs = [
    3, 
    4, 
    5, 
    6, 
    7, 
    8
]
SGs = [0, 1]


##################################################
# # --uncomment for sample model training--
# EPOCHS = 200
# VECTOR_SIZEs = [500]
# WINDOWs = [4]
# SGs = [0]
##################################################


print('Start training')
# sleep 200 ms
time.sleep(0.2)

for VECTOR_SIZE in tqdm(VECTOR_SIZEs):
    print(f'Current VECTOR_SIZE: {VECTOR_SIZE}')
    for WINDOW in tqdm(WINDOWs, desc=f'WINDOW'):
        for sg in tqdm(SGs, desc=f'SG'):
            # check if model already trained
            if os.path.exists(f'word2vec_models/word2vec_vs{VECTOR_SIZE}_win{WINDOW}_sg{sg}.model'):
                print(f'word2vec_vs{VECTOR_SIZE}_win{WINDOW}_sg{sg}.model already exists')
            else:
                model = Word2Vec(df_train['title'], vector_size=VECTOR_SIZE, window=WINDOW, min_count=MIN_COUNT, workers=WORKERS, sg=sg)
                model.train(df_train['title'], total_examples=len(df_train['title']), epochs=EPOCHS)
                model.save(f'word2vec_models/word2vec_vs{VECTOR_SIZE}_win{WINDOW}_sg{sg}.model')

Start training


  0%|          | 0/3 [00:00<?, ?it/s]

Current VECTOR_SIZE: 10


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 20


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 50


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

In [39]:
model.wv.most_similar('trump')

[('donald', 0.927785336971283),
 ('borderer', 0.8330757021903992),
 ('mcgahn', 0.8120999336242676),
 ('gigapixel', 0.771509051322937),
 ('howd', 0.7592917084693909),
 ('warmongering', 0.7389071583747864),
 ('peacenik', 0.7371304035186768),
 ('unknowable', 0.7279512882232666),
 ('backslides', 0.7218949794769287),
 ('impulsive', 0.7153481245040894)]