# 3. Word Embeddings

## 3.0 Setup and loading data

In [93]:
import time
import nltk
import pickle
import gensim
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
from gensim import models
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# NOTE(review): newer matplotlib releases renamed this style to
# 'seaborn-v0_8-whitegrid' -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')

In [3]:
# Load the raw headlines and drop the 'Unnamed: 0' column
# (presumably a row index written by an earlier export -- TODO confirm).
df = pd.read_csv('dataframe_raw.temp.csv').drop(columns = ['Unnamed: 0'])
# Keep only ASCII letters and spaces. `regex=True` must be explicit: since
# pandas 2.0 `str.replace` treats the pattern as a literal string by default,
# which would silently leave punctuation and digits in place.
df['headline'] = df['headline'].str.replace(r'[^a-zA-Z ]', '', regex=True)
df['headline'] = df['headline'].str.lower()
# Replace missing values with a single space so later join/split calls
# never encounter NaN.
df = df.fillna(' ')

## 3.1 Embedding Headline Words

### 3.1.1 Constructing Model

In [5]:
# Tokenize each headline on whitespace. The result is kept as a plain list of
# token lists: headlines differ in length, and NumPy >= 1.24 raises ValueError
# when asked to build an array from such ragged input. Every consumer below
# only iterates or indexes this object, which a list supports equally well.
tokenized_headlines = [sentence.split() for sentence in df.headline]

# Train 300-dimensional word vectors with a context window of 4 tokens.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 calls it
# `vector_size`) -- confirm the installed gensim version.
headlines_model = gensim.models.Word2Vec(
    tokenized_headlines, size = 300, window = 4
)

In [7]:
# Forward lookup: vocabulary word -> its embedding vector.
headlines_word_to_vector = {
    word: vector
    for word, vector in zip(
        headlines_model.wv.index2word, headlines_model.wv.vectors
    )
}

# Reverse lookup: embedding (as a hashable tuple of floats) -> vocabulary word.
headlines_vector_to_word = {
    tuple(components): word
    for components, word in zip(
        headlines_model.wv.vectors.tolist(), headlines_model.wv.index2word
    )
}

### 3.1.2 Intuitive Testing

In [26]:
# Hand-picked word pairs: the first group is expected to score high, the
# second group low.
similar_words = [('trump', 'obama'), ('cat', 'dog'), ('peace', 'war')]
dissimilar_words = [('trump', 'dog'), ('cat', 'war'), ('peace', 'obama')]

# Print one tab-separated row per pair with the model's similarity score.
print('Similar words:\nword 1\t\tword 2\t\tsimilarity')
for word_a, word__b in similar_words:
    print('%r\t\t%r\t\t%.2f' % (word_a, word__b, headlines_model.wv.similarity(word_a, word__b)))

print('\nDissimilar words:\nword 1\t\tword 2\t\tsimilarity')
for word_a, word__b in dissimilar_words:
    print('%r\t\t%r\t\t%.2f' % (word_a, word__b, headlines_model.wv.similarity(word_a, word__b)))

Similar words:
word 1		word 2		similarity
'trump'		'obama'		0.78
'cat'		'dog'		0.88
'peace'		'war'		0.58

Dissimilar words:
word 1		word 2		similarity
'trump'		'dog'		0.25
'cat'		'war'		-0.04
'peace'		'obama'		0.39


In [29]:
# For a few probe words, print the ten entries returned by `most_similar`,
# keeping only the first element of each returned tuple (the word itself).
random_words_list = ['trump', 'cat', 'war', 'question']
for word in random_words_list:
    print([tup[0] for tup in headlines_model.wv.most_similar(positive=[word], topn=10)])

['trumps', 'obama', 'trumpus', 'glover', 'ted', 'congress', 'sanders', 'gop', 'marco', 'roy']
['toddler', 'dog', 'puppy', 'grandma', 'robot', 'baby', 'butt', 'pants', 'twins', 'selfie']
['iraq', 'racism', 'terrorism', 'freedom', 'movement', 'democracy', 'rights', 'humanity', 'leadership', 'politics']
['lie', 'word', 'tweet', 'answer', 'joke', 'truth', 'theory', 'thing', 'thinking', 'lies']


In [34]:
# Each list groups related words plus one intended outlier (the last item).
similar_words_except_one = [
    ['data', 'science', 'maths', 'statistics', 'head'],
    ['obama', 'trump', 'hillary', 'sanders', 'food'],
    ['cat', 'dog', 'pet', 'eat']
]

# Ask the model which word it considers the odd one out in every list.
for word_list in similar_words_except_one:
    print(headlines_model.wv.doesnt_match(word_list), '\tdoes not fit in\t', word_list)

head 	does not fit in	 ['data', 'science', 'maths', 'statistics', 'head']
food 	does not fit in	 ['obama', 'trump', 'hillary', 'sanders', 'food']
eat 	does not fit in	 ['cat', 'dog', 'pet', 'eat']


## 3.2 Calculating TF-IDF

In [137]:
def calculate_tf_idf(data=None):
    """Compute per-category TF-IDF weights for every word in the headlines.

    Each category is treated as one "document" made of all its headlines.

    * TF(word, category)  = occurrences of word in the category
                            / total number of words in the category
    * IDF(word)           = log(number of categories
                            / number of categories containing the word)

    Headlines are tokenized with ``str.split()`` (any run of whitespace), the
    same rule used to build the Word2Vec training tokens, so empty strings
    produced by consecutive spaces are never counted as words.

    Args:
        data: DataFrame with string columns ``'headline'`` and ``'category'``.
            Defaults to the notebook-level ``df`` when omitted.

    Returns:
        A ``(tfidf, all_words_list)`` tuple where ``tfidf`` maps each category
        to a list of weights and ``all_words_list`` is the sorted vocabulary;
        ``tfidf[category][i]`` is the weight of ``all_words_list[i]``.
    """
    if data is None:
        data = df

    # One word counter per category; groupby iterates categories in sorted order.
    category_counters = {}
    for category, headlines in data.groupby('category')['headline']:
        counter = Counter()
        for headline in headlines:
            counter.update(headline.split())
        category_counters[category] = counter
    categories = list(category_counters)

    # Sorting makes the word -> column mapping reproducible between runs
    # (plain set iteration order depends on string hash randomization).
    all_words_list = sorted(set().union(*category_counters.values()))
    all_words_dict = {word: column for column, word in enumerate(all_words_list)}

    # counts[row, column] = occurrences of word `column` in category `row`.
    counts = np.zeros((len(categories), len(all_words_list)))
    for row, category in enumerate(categories):
        for word, count in category_counters[category].items():
            counts[row, all_words_dict[word]] = count

    # A category without any word has an all-zero row; dividing by 1 instead
    # of 0 keeps its term frequencies at 0 rather than NaN.
    totals = counts.sum(axis=1, keepdims=True)
    term_frequency = counts / np.where(totals == 0, 1, totals)

    # Every vocabulary word occurs in at least one category, so the
    # denominator is never zero.
    inverse_document_frequency = np.log(
        len(categories) / np.count_nonzero(counts, axis=0)
    )

    weights = term_frequency * inverse_document_frequency
    tfidf = {
        category: list(weights[row]) for row, category in enumerate(categories)
    }

    return tfidf, all_words_list

In [138]:
# TF-IDF weights per category, plus the word list that indexes those weights.
headlines_tf_idf, headlines_word_list = calculate_tf_idf()

## 3.3 Headlines embedding

In [140]:
def wordToVector(word, model):
    """Return the embedding of ``word``, or ``None`` if the model lacks it.

    Args:
        word: Token to look up.
        model: Object exposing trained vectors as ``model.wv`` (supports
            ``in`` and ``[]``), e.g. a gensim Word2Vec model.

    Returns:
        ``model.wv[word]`` when the word is known, otherwise ``None``.
    """
    # Ask the keyed-vectors object itself for membership instead of scanning
    # the `index2word` list, which costs O(vocabulary size) for every token.
    if word in model.wv:
        return model.wv[word]
    return None

def vectorToWord(vector, model, dictionary):
    """Return the word whose embedding equals ``vector``, or ``None``.

    Args:
        vector: Embedding as a tuple, list or NumPy array of floats.
        model: Unused; kept so existing call sites keep working. The lookup
            relies entirely on ``dictionary``.
        dictionary: Mapping from an embedding (tuple of floats) to its word.

    Returns:
        The matching word, or ``None`` when ``vector`` is not a key of
        ``dictionary``.
    """
    # Previously the full vocabulary was converted to tuples on every call and
    # scanned linearly; that also raised ValueError for array input (ambiguous
    # truth value) and never matched list input. A single hash lookup on a
    # normalized tuple key covers all three input kinds.
    if hasattr(vector, 'tolist'):
        key = tuple(vector.tolist())
    else:
        key = tuple(vector)
    return dictionary.get(key)

In [141]:
# Embedding of every token of every headline; a token that wordToVector cannot
# resolve is stored as None, so each inner list stays aligned with its tokens.
vectorized_headlines = [
    [wordToVector(w, headlines_model) for w in h] for h in tokenized_headlines
]

### 3.3.1 Not weighting words

In [142]:
def get_mean_vector(model, tokens):
    """Average the word vectors of each tokenized text.

    Args:
        model: Embedding model handed to ``wordToVector``.
        tokens: Iterable of token lists, one per text.

    Returns:
        A list with one entry per text: the element-wise mean of the vectors
        of its tokens, ignoring tokens for which ``wordToVector`` returns
        ``None``. Whatever ``np.mean`` yields for an empty list is stored for
        texts without any known token.
    """
    vectors_means = []
    for text_tokens in tokens:
        known_vectors = []
        for token in text_tokens:
            vector = wordToVector(token, model)
            if vector is not None:
                known_vectors.append(vector)
        vectors_means.append(np.mean(known_vectors, axis=0))

    return vectors_means

In [None]:
# Unweighted embedding of each headline: the plain mean of its word vectors.
headline_vectors_means = get_mean_vector(headlines_model, tokenized_headlines)

### 3.3.2 Weighting words

In [146]:
def weigh_words_with_tf_idf(vectorized_words, tfidf, all_words_list, tokenized_texts,
                            categories=None):
    """Scale every word vector by the TF-IDF weight of its word.

    The weight used for a word is the one of the category its text belongs to.

    Args:
        vectorized_words: One list per text holding a NumPy vector per token,
            or ``None`` for tokens without an embedding.
        tfidf: Mapping category -> sequence of weights, indexed like
            ``all_words_list``.
        all_words_list: Vocabulary giving the position of each word in the
            ``tfidf`` sequences.
        tokenized_texts: One token list per text, aligned with
            ``vectorized_words``.
        categories: Category of each text, in the same order as the texts.
            Defaults to the ``category`` column of the notebook-level ``df``.

    Returns:
        A new list of lists shaped like ``vectorized_words`` with every NumPy
        vector multiplied by its weight; other entries (``None``) are passed
        through. The input is left untouched.

    Raises:
        KeyError: If a token that has a vector is missing from
            ``all_words_list``, or a category is missing from ``tfidf``.
    """
    if categories is None:
        # Read the column once instead of issuing one `df.loc` call per text.
        categories = df['category'].tolist()

    all_words_dict = {word: position for position, word in enumerate(all_words_list)}

    # Build fresh inner lists: copying only the outer list would share the
    # inner ones with the caller and overwrite its unweighted vectors.
    weighted = []
    for vectors, tokens, category in zip(vectorized_words, tokenized_texts, categories):
        category_weights = tfidf[category]
        weighted.append([
            vector * category_weights[all_words_dict[token]]
            if isinstance(vector, np.ndarray) else vector
            for vector, token in zip(vectors, tokens)
        ])

    return weighted

In [147]:
# TF-IDF-weighted word vectors for every headline.
# Note: list(...) copies only the outer list; the inner per-headline lists are
# the same objects as in vectorized_headlines.
vectorized_headlines_weighted = weigh_words_with_tf_idf(
    list(vectorized_headlines), headlines_tf_idf, headlines_word_list, tokenized_headlines
)

In [148]:
# Weighted embedding of each headline: mean of its weighted word vectors,
# skipping tokens without an embedding (stored as None).
headline_vectors_means_weighted = [
    np.mean([
        vector for vector in headline if vector is not None
    ], axis=0) for headline in vectorized_headlines_weighted
]

## 3.4 Formatting data

### 3.4.1 Handling null values

In [150]:
# Category label of every headline, in DataFrame row order; read by clean().
categories_mean = df.category.tolist()

In [151]:
def clean(text_vectors_means, categories=None):
    """Drop texts whose mean vector is NaN, together with their categories.

    A text without any embedded word has no usable mean vector (its mean is
    NaN); such entries and the category labels at the same positions are
    removed.

    Args:
        text_vectors_means: Sequence with one mean vector (NumPy array) per
            text, or a NaN scalar for texts without a usable vector.
        categories: Category label of each text, aligned with
            ``text_vectors_means``. Defaults to the notebook-level
            ``categories_mean``.

    Returns:
        A ``(means_clean, categories_clean)`` tuple: a 1-D object array whose
        elements are the kept vectors as Python lists, and a 1-D array with
        the matching category labels.
    """
    if categories is None:
        categories = categories_mean

    # True for entries to keep, i.e. those free of NaN.
    keep = np.array(
        [not np.any(np.isnan(vector)) for vector in text_vectors_means],
        dtype=bool,
    )

    kept_means = [
        np.asarray(vector).tolist()
        for vector, keep_row in zip(text_vectors_means, keep)
        if keep_row
    ]

    # Fill an object array element by element so each entry stays one list.
    # np.asarray on the mixed input fails on NumPy >= 1.24, and np.delete
    # without `axis` would flatten a regular 2-D array into single floats.
    means_clean = np.empty(len(kept_means), dtype=object)
    for position, mean in enumerate(kept_means):
        means_clean[position] = mean

    categories_clean = np.asarray(categories)[keep]
    return means_clean, categories_clean

In [152]:
# Remove headlines without a usable mean vector from the unweighted set ...
headlines_mean_clean, headlines_categories_clean = clean(headline_vectors_means)

# ... and from the TF-IDF-weighted set.
headlines_weighted_mean_clean, headlines_w_categories_clean = clean(
    headline_vectors_means_weighted
)

### 3.4.2 Separating predictor and response

In [153]:
def split(name, X_Y):
    """Turn (vectors, categories) into a predictor frame and a response series.

    Args:
        name: Prefix of the temporary ``<name>_vectors`` column.
        X_Y: Pair whose first item holds one vector per text and whose second
            item holds the matching category labels.

    Returns:
        ``(X, Y)``: ``X`` is a DataFrame with one column per vector component,
        ``Y`` is the ``categories`` Series.
    """
    vectors_column = name + '_vectors'

    # Wrap the vectors in a one-column frame, then expand them so that every
    # vector component gets a column of its own.
    wrapped = pd.DataFrame({vectors_column: X_Y[0]})
    combined = pd.DataFrame(wrapped[vectors_column].values.tolist())
    combined['categories'] = X_Y[1]

    response = combined.categories
    predictors = combined.drop(columns=['categories'], inplace=False)

    return predictors, response

In [154]:
# Predictors / response for the unweighted headline embeddings.
headlines_X, headlines_Y = split(
    'headlines', (headlines_mean_clean, headlines_categories_clean)
)

# Predictors / response for the TF-IDF-weighted headline embeddings.
headlines_weighted_X, headlines_weighted_Y = split(
    'headlines_weighted', (headlines_weighted_mean_clean, headlines_w_categories_clean)
)

### 3.4.3 Separating train and test sets

In [155]:
# Hold out 20% of the unweighted data for testing; random_state is fixed so
# the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    headlines_X, headlines_Y, test_size = 0.2, random_state = 0
)

# Same test size and random_state for the TF-IDF-weighted data.
X_train_weighted, X_test_weighted, Y_train_weighted, Y_test_weighted = train_test_split(
    headlines_weighted_X, headlines_weighted_Y, test_size = 0.2, random_state = 0
)

### 3.4.4 Saving train and test sets

In [156]:
# [file stem, data] pairs for every train/test object that gets written to disk.
all_data_sets = [
    ['X_train', X_train], ['X_test', X_test],
    ['Y_train', Y_train], ['Y_test', Y_test],
    ['X_train_weighted', X_train_weighted], ['X_test_weighted', X_test_weighted],
    ['Y_train_weighted', Y_train_weighted], ['Y_test_weighted', Y_test_weighted],
]

In [157]:
import os

# to_csv does not create missing directories, so make sure the output folder
# exists before the first file is written.
os.makedirs('train_and_test_sets', exist_ok=True)

# Write every data set to train_and_test_sets/<file stem>.csv without the index.
for data_set in all_data_sets:
    data_set[1].to_csv('train_and_test_sets/' + data_set[0] +  '.csv', index=False)