In [10]:
# Core scientific stack plus gensim's Word2Vec, which the training cell below instantiates.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import nltk
# Fetch the NLTK stopword list and punkt tokenizer data (reported as already up to date when present).
nltk.download('stopwords')
nltk.download('punkt')


# Execute the shared project scripts through IPython's %run magic.
# NOTE(review): preprocess_title, used further down, is not defined in this notebook and
# presumably comes from one of these scripts — confirm.
%run '../functions.py'
%run '../classes.py'


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/adammajczyk/nltk_data..

In [11]:
# Load the merged titles/labels table. When the cached CSV is missing, build it
# from the three labelled subsets and cache it for subsequent runs.
import os

merged_csv_path = 'data/merged_titles_labels.csv'
if os.path.exists(merged_csv_path):
    df = pd.read_csv(merged_csv_path)
else:
    df1 = pd.read_csv('../eda/small1/labeled.csv')
    df2 = pd.read_csv('../eda/small2/labeled.csv')
    df3 = pd.read_csv('../eda/small3/labeled.csv')
    # ignore_index=True already produces a fresh 0..n-1 index, so no extra
    # reset_index() is needed afterwards.
    df = pd.concat([df1, df2, df3], ignore_index=True)
    # to_csv does not create missing directories; make sure the cache dir exists.
    os.makedirs(os.path.dirname(merged_csv_path), exist_ok=True)
    df.to_csv(merged_csv_path, index=False)
# Single preview as the cell's last expression (a head() call inside the
# branch would not be displayed).
df.head()

Unnamed: 0,title,is_clickbait,text,dataset,sample
0,7 Essays To Read: Sex Scenes That Are Actually...,1,,clickbait-dataset,train
1,The Horrible Truth About Bagels,1,,clickbait-dataset,train
2,Entire Dem/Lib Party is Infested with Rats! So...,1,"— Susan? (@GaetaSusan) October 27, 2016 WikiLe...",fake-news,train
3,"Beginning a New Life in America, With Mentors ...",0,"OAKLAND, Calif. — Pascal Serugendo was only...",fake-news,train
4,Imam pleads guilty in New York subway bomb plot,0,,clickbait-dataset,train


In [12]:
# Reuse the cached preprocessed frame when it exists; otherwise run the
# title preprocessing and cache its result as a pickle.
preprocessed_pkl_path = 'data/preprocessed_titles_labels.pkl'
if os.path.exists(preprocessed_pkl_path):
    # NOTE(review): unpickling can execute arbitrary code; only load pickles
    # that were produced by this project.
    df = pd.read_pickle(preprocessed_pkl_path)
else:
    df = preprocess_title(df, verbose=True)
    df.to_pickle(preprocessed_pkl_path)
df.head()

Unnamed: 0,title,is_clickbait,text,dataset,sample
0,"[seven, essay, read, sex, scene, actually, sex...",1,,clickbait-dataset,train
1,"[horrible, truth, bagel]",1,,clickbait-dataset,train
2,"[entire, demlib, party, infested, rat, much, c...",1,"— Susan? (@GaetaSusan) October 27, 2016 WikiLe...",fake-news,train
3,"[beginning, new, life, america, mentor, side, ...",0,"OAKLAND, Calif. — Pascal Serugendo was only...",fake-news,train
4,"[imam, pleads, guilty, new, york, subway, bomb...",0,,clickbait-dataset,train


In [13]:
# Show the token list stored for the row labelled 0 (single .loc lookup instead of chained indexing).
df.loc[0, 'title']

['seven',
 'essay',
 'read',
 'sex',
 'scene',
 'actually',
 'sexy',
 'confident',
 'black',
 'men',
 'debt']

In [14]:
# Number of rows in each value of the 'sample' column.
split_sizes = df['sample'].value_counts()
split_sizes

sample
train    46618
test      2590
val2      1296
val1      1295
Name: count, dtype: int64

In [15]:
# Keep only the rows whose 'sample' value is 'train', renumbered from 0.
is_train_row = df['sample'].eq('train')
df_train = df.loc[is_train_row].reset_index(drop=True)

In [16]:
# Sanity check: only the 'train' value should be left in df_train.
train_split_sizes = df_train['sample'].value_counts()
train_split_sizes

sample
train    46618
Name: count, dtype: int64

In [17]:
from tqdm.notebook import tqdm
import time

# Hyper-parameters shared by every model in the grid.
EPOCHS = 500
WORKERS = 6
MIN_COUNT = 1
# Directory that receives one saved model per (VECTOR_SIZE, WINDOW, sg) combination.
MODEL_DIR = 'word2vec_models'


# train many word2vec models with different VECTOR_SIZE and WINDOW

VECTOR_SIZEs = [
    10,
    20,
    50,
    100,
    250,
    # 500,
    # 1000,
]

WINDOWs = [
    3,
    4,
    5,
    6,
    7,
    8,
]
# Passed to Word2Vec as `sg`; both 0 and 1 are trained.
SGs = [0, 1]


#################################################
# --uncomment for sample model training--
# EPOCHS = 100
# VECTOR_SIZEs = [10]
# WINDOWs = [5]
# SGs = [1]
#################################################


# Materialise the corpus once as a plain list of token lists instead of
# handing the pandas Series to gensim on every iteration.
train_sentences = df_train['title'].tolist()

# model.save() does not create missing directories; create the target up
# front so a long training run cannot fail at the final save step.
os.makedirs(MODEL_DIR, exist_ok=True)

print('Start training')
# sleep 200 ms
# NOTE(review): presumably gives the print above time to flush before the
# progress-bar widgets render — confirm.
time.sleep(0.2)

for VECTOR_SIZE in tqdm(VECTOR_SIZEs):
    print(f'Current VECTOR_SIZE: {VECTOR_SIZE}')
    for WINDOW in tqdm(WINDOWs, desc='WINDOW'):
        for sg in tqdm(SGs, desc='SG'):
            model_name = f'word2vec_vs{VECTOR_SIZE}_win{WINDOW}_sg{sg}.model'
            model_path = os.path.join(MODEL_DIR, model_name)

            # Skip combinations that were already trained and saved.
            if os.path.exists(model_path):
                print(f'{model_name} already exists')
                continue

            # Passing the corpus to the constructor builds the vocabulary AND
            # trains the model, so the epoch count is given here. A separate
            # model.train() call afterwards would train a second time on top
            # of the constructor's default-epoch pass.
            model = Word2Vec(
                sentences=train_sentences,
                vector_size=VECTOR_SIZE,
                window=WINDOW,
                min_count=MIN_COUNT,
                workers=WORKERS,
                sg=sg,
                epochs=EPOCHS,
            )
            model.save(model_path)

Start training


  0%|          | 0/5 [00:00<?, ?it/s]

Current VECTOR_SIZE: 10


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 20


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 50


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 100


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 250


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Inspect one trained model and export its vocabulary.
# Load the last model of the hyper-parameter grid from disk instead of relying
# on the `model` variable left over from the training loop: that variable is
# never assigned when every model already exists and training is skipped.
model = Word2Vec.load(
    f'word2vec_models/word2vec_vs{VECTOR_SIZEs[-1]}_win{WINDOWs[-1]}_sg{SGs[-1]}.model'
)

# save vocabulary
vocab = list(model.wv.index_to_key)

# Explicit encoding so the file content does not depend on the platform default.
with open('data/vocab.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{word}\n' for word in vocab)

# Kept as the cell's last expression so the nearest neighbours are displayed.
model.wv.most_similar('trump')