In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages this notebook relies on: the stop-word corpus
# (read via stopwords.words further down) and 'punkt' (presumably the data
# word_tokenize needs -- confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:
# Read the three labelled samples, one DataFrame per source file.
df1, df2, df3 = map(pd.read_csv, (
    'eda/small1/labeled.csv',
    'eda/small2/labeled.csv',
    'eda/small3/labeled.csv',
))

# Stack them into a single frame; ignore_index=True already yields a fresh
# 0..n-1 index, so no separate reset is required.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Persist the merged table (without the index) and preview the first rows.
df.to_csv('data/merged_titles_labels.csv', index=False)
df.head()

Unnamed: 0,title,is_clickbait
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [51]:
# remove punctuation
import string

# ASCII punctuation characters; the default set stripped by remove_punct.
punct = string.punctuation

# Typographic (curly) double and single quotation marks, opening and closing.
all_quoatation = ['“', '”', '‘', '’']

def remove_punct(text, punct=punct):
    """Strip punctuation from ``text``, turning ellipses into a space.

    Parameters
    ----------
    text : str
        The string to clean.
    punct : iterable of str, optional
        Substrings to delete from ``text``. Defaults to the module-level
        ``punct`` (``string.punctuation``).

    Returns
    -------
    str
        ``text`` with every ellipsis (``'...'`` or ``'…'``) replaced by a
        single space and every entry of ``punct`` removed.
    """
    # Ellipses have to be handled first: '.' is itself part of the default
    # punctuation set, so once it has been stripped '...' can never match and
    # the words on either side of it would be glued together.
    for ellipsis in ('...', '…'):
        text = text.replace(ellipsis, ' ')
    for mark in punct:
        text = text.replace(mark, '')
    return text

def remove_possesive_s(text, quotes=None):
    """Remove possessive markers such as ``Comey’s`` or ``James’`` from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.
    quotes : iterable of str, optional
        Characters treated as apostrophes. Defaults to the module-level
        ``all_quoatation`` list.

    Returns
    -------
    str
        ``text`` where ``<word><quote>s`` loses the quote and the ``s``
        (``Comey’s`` -> ``Comey``) and ``<word ending in s><quote>`` loses only
        the quote (``James’`` -> ``James``).

    Notes
    -----
    Matches are anchored to word context. A plain substring replacement would
    also eat the first letter of a quoted word starting with ``s``
    (``“she`` -> ``he``) and the last letter of any word ending in ``s`` that
    is followed by a closing quote (``‘yes’`` -> ``‘ye``).
    """
    if quotes is None:
        quotes = all_quoatation
    marks = [q for q in quotes if q]
    if not marks:
        return text
    mark = '(?:' + '|'.join(re.escape(q) for q in marks) + ')'

    # Singular possessive: the quote must follow a word character and the
    # trailing 's' must end the word.
    text = re.sub(rf'(?<=\w){mark}s\b', '', text)
    # Plural possessive / word ending in 's': drop only the quote so the word
    # itself stays intact.
    text = re.sub(rf'(?<=[sS]){mark}(?!\w)', '', text)
    return text

def replace_short_version(text, quotes=None):
    """Expand English contractions written with the given apostrophe characters.

    Parameters
    ----------
    text : str
        The string to expand.
    quotes : iterable of str, optional
        Characters treated as apostrophes. Defaults to the module-level
        ``all_quoatation`` list.

    Returns
    -------
    str
        ``text`` with ``’re``, ``’ve``, ``’ll``, ``’m``, ``’d`` and ``n’t``
        expanded to `` are``, `` have``, `` will``, `` am``, `` would`` and
        `` not``. ``can’t``, ``won’t`` and ``shan’t`` become ``can not``,
        ``will not`` and ``shall not``.

    Notes
    -----
    A contraction is only recognised when the apostrophe sits inside a word and
    the suffix ends the word. Without that anchoring an opening quote followed
    by a matching letter is rewritten too (``‘my`` -> `` amy``,
    ``‘deal’`` -> `` wouldeal’``).
    """
    if quotes is None:
        quotes = all_quoatation
    marks = [q for q in quotes if q]
    if not marks:
        return text
    mark = '(?:' + '|'.join(map(re.escape, marks)) + ')'

    # Irregular negations first: the generic n’t rule would leave the stems
    # "ca", "wo" and "sha" behind.
    irregular = {'can': 'can not', 'won': 'will not', 'shan': 'shall not'}
    text = re.sub(
        rf'\b(can|won|shan){mark}t\b',
        lambda match: irregular[match.group(1).lower()],
        text,
        flags=re.IGNORECASE,
    )
    # Regular negation: don’t -> do not, isn’t -> is not, ...
    text = re.sub(rf'(?<=\w)n{mark}t\b', ' not', text)

    # Remaining suffix contractions.
    expansions = {'re': ' are', 've': ' have', 'll': ' will', 'm': ' am', 'd': ' would'}
    text = re.sub(
        rf'(?<=\w){mark}(re|ve|ll|m|d)\b',
        lambda match: expansions[match.group(1)],
        text,
    )
    return text



import inflect
import re 

def replace_numbers_with_words(text, engine=None):
    """Spell out every standalone integer in ``text`` as English words.

    Parameters
    ----------
    text : str
        The string to process.
    engine : object, optional
        Anything exposing ``number_to_words(str) -> str``. When omitted, an
        ``inflect.engine()`` is created the first time a number is found.

    Returns
    -------
    str
        ``text`` with each run of digits delimited by word boundaries replaced
        by the engine's spelling of it.

    Notes
    -----
    The substitution is done per match. Replacing the matched digits
    everywhere in the string would also rewrite them inside longer numbers
    (``"5 of 15"`` -> ``"five of 1five"``).
    """
    def spell_out(match):
        nonlocal engine
        # Build the engine lazily so titles without digits pay nothing, and at
        # most once per call rather than once per match.
        if engine is None:
            engine = inflect.engine()
        return engine.number_to_words(match.group())

    return re.sub(r'\b\d+\b', spell_out, text)

In [52]:
# English stop words from the NLTK corpus, kept in a set for membership tests.
stop_words = set(stopwords.words('english'))

def tokenize(text):
    """Lower-case ``text``, split it with ``word_tokenize`` and drop stop words.

    Returns the remaining tokens as a list, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def preprocess_title(df):
    """Clean and tokenize the ``title`` column of ``df`` in place.

    Steps, in order: normalise in-word straight apostrophes, spell out
    numbers, strip ASCII punctuation, remove possessives, expand contractions,
    strip leftover typographic quotes, then tokenize.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column of strings. The column is overwritten
        with lists of tokens.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    # remove_punct deletes the straight apostrophe before the possessive and
    # contraction helpers run, and those helpers only know the curly quotes in
    # all_quoatation. Converting in-word apostrophes up front lets "don't" and
    # "Trump's" be handled like "don’t" and "Trump’s" instead of collapsing to
    # "dont" and "Trumps". Quotes at word edges are left for remove_punct.
    titles = df['title'].str.replace(r"(?<=\w)'(?=\w)", '’', regex=True)

    # remove punctuation and other stuff
    for step in (
        replace_numbers_with_words,
        remove_punct,
        remove_possesive_s,
        replace_short_version,
    ):
        titles = titles.apply(step)

    # Quotation marks that were not part of a possessive or contraction are
    # still present at this point; drop them so they do not end up as tokens.
    titles = titles.apply(lambda title: remove_punct(title, punct=all_quoatation))

    # tokenize
    df['title'] = titles.apply(tokenize)
    return df

In [53]:
from tqdm.notebook import tqdm
import os
import time

# Settings shared by every model in the sweep.
EPOCHS = 500
WORKERS = 10
MIN_COUNT = 1
MODEL_DIR = 'word2vec_models'

# Train one Word2Vec model per combination of vector size, window and
# architecture (sg=0 / sg=1).
VECTOR_SIZEs = [100, 250, 500, 1000, 1500, 2000, 2500, 3000]
WINDOWs = [3, 4, 5, 6, 7, 8]
SGs = [0, 1]

# Word2Vec expects each sentence to be a list of tokens. A raw string would be
# iterated character by character, so make sure the titles have been through
# preprocess_title. This works on a copy, which keeps `df` untouched and makes
# the cell safe to re-run.
sentences = df['title']
if sentences.map(lambda title: isinstance(title, str)).any():
    sentences = preprocess_title(df[['title']].copy())['title']
sentences = sentences.tolist()

# model.save does not create missing directories.
os.makedirs(MODEL_DIR, exist_ok=True)

print('Start training')
# sleep 200 ms
time.sleep(0.2)

for VECTOR_SIZE in tqdm(VECTOR_SIZEs):
    print(f'Current VECTOR_SIZE: {VECTOR_SIZE}')
    for WINDOW in tqdm(WINDOWs, desc='WINDOW'):
        for sg in tqdm(SGs, desc='SG'):
            # Passing the corpus to the constructor builds the vocabulary and
            # trains straight away, so the epoch count goes here. A further
            # model.train() call would start a second training run on top of
            # the first one.
            model = Word2Vec(
                sentences,
                vector_size=VECTOR_SIZE,
                window=WINDOW,
                min_count=MIN_COUNT,
                workers=WORKERS,
                sg=sg,
                epochs=EPOCHS,
            )
            model.save(f'{MODEL_DIR}/word2vec_vs{VECTOR_SIZE}_win{WINDOW}_sg{sg}.model')

Start training


  0%|          | 0/8 [00:00<?, ?it/s]

Current VECTOR_SIZE: 100


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 250


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 500


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 1000


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 1500


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 2000


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

Current VECTOR_SIZE: 2500


WINDOW:   0%|          | 0/6 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]

SG:   0%|          | 0/2 [00:00<?, ?it/s]