# Laboratory work #3 (text vectorization)

In [1]:
import re
import os
from collections import defaultdict, Counter
import string

from pathlib import Path
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


import matplotlib.pyplot as plt
from pandas.errors import EmptyDataError
import numpy as np
from math import log1p
import gensim

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_files(root_dir, n=None):
    file_paths = []
    
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.tsv'):
                file_paths.append(os.path.join(subdir, file))

    data = []
    for i, file_path in enumerate(file_paths):
        if n is not None and i >= n:
            break
        try:
            d = pd.read_csv(file_path, sep='\t', header=None)
            d.columns = ['Token', 'Stem', 'Lemma']
        except EmptyDataError as e:
            print(i, file_path, e)
        data.append(d.dropna())
        
    
    ids = [os.path.splitext(os.path.basename(path))[0] for path in file_paths]
    return ids, data

In [3]:
train_ids, train = read_files('../assets/annotated-corpus/train', 
                  #  1000
                   )
# val_ids, val = read_files('../assets/annotated-corpus/val', 
#                 #  100
#                  )
test_ids, test = read_files('../assets/annotated-corpus/test', 
                #   100
                  )

9520 ../assets/annotated-corpus/train/fake/21080.tsv No columns to parse from file
12465 ../assets/annotated-corpus/train/fake/31165.tsv No columns to parse from file
2445 ../assets/annotated-corpus/test/true/22518.tsv No columns to parse from file


In [4]:
train[0].head()

Unnamed: 0,Token,Stem,Lemma
0,USA,usa,USA
1,Today,today,Today
2,published,publish,published
3,an,an,an
4,article,articl,article


In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
def is_valid_token(token, token_frequencies, min_frequency=2):
    if token in string.punctuation:
        return False
    if token.lower() in stop_words:
        return False
    if token_frequencies[token] < min_frequency:
        return False
    return True


def get_freqs(dfs):
    token_frequencies = Counter()
    term_document_matrix = defaultdict(lambda: defaultdict(int))

    for doc_id, df in enumerate(dfs):
        tokens = df['Token'].tolist()
        token_frequencies.update(tokens)

        for token in tokens:
            if is_valid_token(token, token_frequencies):
                term_document_matrix[doc_id][token] += 1
                
    # filter all tokens that return is_valid_token False
    token_frequencies = Counter(dict({(token, freq) for (token, freq) in token_frequencies.items() if is_valid_token(token, token_frequencies)}))

    for doc_id, terms in term_document_matrix.items():
        term_document_matrix[doc_id] = {token: freq for token, freq in terms.items() if is_valid_token(token, token_frequencies)}

    return token_frequencies, term_document_matrix

In [7]:
token_frequencies, term_document_matrix = get_freqs(train)
token_frequencies.most_common(20)

[('said', 98241),
 ('Trump', 96486),
 ('U', 39632),
 ('would', 39288),
 ('people', 27540),
 ('President', 24731),
 ('Reuters', 23085),
 ('one', 21932),
 ('also', 21062),
 ('Donald', 20538),
 ('Republican', 19150),
 ('government', 18758),
 ('House', 18317),
 ('Clinton', 18097),
 ('Obama', 18029),
 ('could', 16798),
 ('told', 16567),
 ('United', 16544),
 ('campaign', 15198),
 ('state', 15043)]

In [8]:
term_document_matrix[0]

{'USA': 15,
 'Today': 13,
 'published': 2,
 'article': 4,
 'today': 2,
 'every': 4,
 'hotel': 2,
 'publication': 2,
 'guests': 2,
 'read': 2,
 'free': 3,
 'one': 3,
 'anti': 2,
 '-Trump': 2,
 'find': 2,
 'like': 2,
 'morning': 2,
 'whose': 2,
 'president': 2,
 'family': 9,
 'almost': 4,
 'news': 3,
 'across': 2,
 'American': 5,
 'without': 2,
 'situation': 2,
 'President': 6,
 'Trump': 7,
 'unprecedented': 2,
 'number': 3,
 'White': 3,
 'House': 3,
 'protectees': 2,
 'Secret': 13,
 'Service': 13,
 'Director': 5,
 'Randolph': 4,
 'Tex': 4,
 'Alles': 7,
 'statement': 3,
 'said': 5,
 'pay': 5,
 'hundreds': 2,
 'agents': 6,
 'needs': 3,
 'mission': 3,
 'large': 2,
 'due': 2,
 'funding': 2,
 'meet': 2,
 'current': 2,
 'requirements': 4,
 'year': 5,
 'employees': 4,
 'overtime': 4,
 'statutory': 3,
 'caps': 4,
 'TODAY': 2,
 '1': 2,
 '000': 3,
 'already': 5,
 'salary': 2,
 'entire': 3,
 '2016': 2,
 'Barack': 3,
 'Obama': 4,
 'work': 4,
 'hours': 3,
 'calendar': 2,
 '2017': 2,
 'agency': 4,
 '

In [9]:
data_dir = Path('../assets/data/')
data_dir.mkdir(parents=True, exist_ok=True)
with open(data_dir / 'token_frequencies.tsv', 'w', encoding='utf-8') as file:
    for token, freq in token_frequencies.items():
        if is_valid_token(token, token_frequencies):
            file.write(f'{token}\t{freq}\n')

with open(data_dir / 'term_document_matrix.tsv', 'w', encoding='utf-8') as file:
    for doc_id, terms in term_document_matrix.items():
        for token, freq in terms.items():
            file.write(f'{doc_id}\t{token}\t{freq}\n')

In [10]:
data_dir = Path('../assets/data/')

token_frequencies = {}
with open(data_dir / 'token_frequencies.tsv', 'r', encoding='utf-8') as file:
    for line in file:
        token, freq = line.strip().split('\t')
        token_frequencies[token] = int(freq)

term_document_matrix = {}
with open(data_dir / 'term_document_matrix.tsv', 'r', encoding='utf-8') as file:
    for line in file:
        doc_id, token, freq = line.strip().split('\t')
        doc_id = int(doc_id)
        freq = int(freq)
        if doc_id not in term_document_matrix:
            term_document_matrix[doc_id] = {}
        term_document_matrix[doc_id][token] = freq

In [11]:
term_document_matrix[0]

{'USA': 15,
 'Today': 13,
 'published': 2,
 'article': 4,
 'today': 2,
 'every': 4,
 'hotel': 2,
 'publication': 2,
 'guests': 2,
 'read': 2,
 'free': 3,
 'one': 3,
 'anti': 2,
 '-Trump': 2,
 'find': 2,
 'like': 2,
 'morning': 2,
 'whose': 2,
 'president': 2,
 'family': 9,
 'almost': 4,
 'news': 3,
 'across': 2,
 'American': 5,
 'without': 2,
 'situation': 2,
 'President': 6,
 'Trump': 7,
 'unprecedented': 2,
 'number': 3,
 'White': 3,
 'House': 3,
 'protectees': 2,
 'Secret': 13,
 'Service': 13,
 'Director': 5,
 'Randolph': 4,
 'Tex': 4,
 'Alles': 7,
 'statement': 3,
 'said': 5,
 'pay': 5,
 'hundreds': 2,
 'agents': 6,
 'needs': 3,
 'mission': 3,
 'large': 2,
 'due': 2,
 'funding': 2,
 'meet': 2,
 'current': 2,
 'requirements': 4,
 'year': 5,
 'employees': 4,
 'overtime': 4,
 'statutory': 3,
 'caps': 4,
 'TODAY': 2,
 '1': 2,
 '000': 3,
 'already': 5,
 'salary': 2,
 'entire': 3,
 '2016': 2,
 'Barack': 3,
 'Obama': 4,
 'work': 4,
 'hours': 3,
 'calendar': 2,
 '2017': 2,
 'agency': 4,
 '

In [12]:
def get_term_document_vector(token, term_document_matrix):
    vector = []
    for k, v in term_document_matrix.items():
        freq = v.get(token, 0)
        vector.append(freq)
    return vector

In [13]:
get_term_document_vector('Reuters', term_document_matrix)[:5]

[0, 0, 0, 0, 0]

In [14]:
get_term_document_vector('cat', term_document_matrix)[:5]

[0, 0, 0, 0, 0]

In [15]:
def preprocess_text(text):
    def split_into_sentences(text):
        # so the website will not split into two separate sentences by comma:
        sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
        sentences = sentence_endings.split(text)
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences
    
    def split_into_words(sentences):
        # regular expression to match complex URLs, simple URLs, hashtags, Twitter handles, and words
        word_pattern = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|-?\w+\'?\w*')
        tokenized_sentences = []
        for sentence in sentences:
            words = word_pattern.findall(sentence)
            tokenized_sentences.append(words)
        return tokenized_sentences

    sentences = split_into_sentences(text)
    tokenized = split_into_words(sentences)
    return tokenized

In [16]:
def compute_tf(sentence_tokens, token):
    return sentence_tokens.count(token) / len(sentence_tokens)


def compute_idf(token, term_document_matrix, total_documents):
    doc_count = sum(1 for doc in term_document_matrix if token in term_document_matrix[doc])
    return log1p(total_documents / (1 + doc_count))


def process_text_and_create_matrices(text, token_frequencies, term_document_matrix):
    tokenized_sentences = preprocess_text(text)
    total_documents = len(term_document_matrix)
    vocabulary = sorted(token_frequencies.keys())

    max_sentence_length = max(len(sentence) for sentence in tokenized_sentences)

    frequency_matrix = []
    tfidf_matrix = []

    for sentence in tokenized_sentences:
        sentence_freq_vector = [0] * max_sentence_length
        sentence_tfidf_vector = [0] * max_sentence_length

        for i, token in enumerate(sentence):
            if token in vocabulary:
                tf = compute_tf(sentence, token)
                idf = compute_idf(token, term_document_matrix, total_documents)

                sentence_freq_vector[i] = tf
                sentence_tfidf_vector[i] = tf * idf

        frequency_matrix.append(sentence_freq_vector)
        tfidf_matrix.append(sentence_tfidf_vector)

    frequency_matrix = np.array(frequency_matrix)
    tfidf_matrix = np.array(tfidf_matrix)

    document_vector_freq = np.mean(frequency_matrix, axis=0)
    document_vector_tfidf = np.mean(tfidf_matrix, axis=0)

    return document_vector_freq, document_vector_tfidf

In [17]:
text = 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '
print(text)

Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit 


In [18]:
document_vector_freq, document_vector_tfidf = process_text_and_create_matrices(text, token_frequencies, term_document_matrix)
document_vector_freq.shape, document_vector_tfidf.shape

((53,), (53,))

In [19]:
document_vector_freq

array([0.05390836, 0.01554857, 0.04761905, 0.        , 0.05687831,
       0.01554857, 0.        , 0.        , 0.02812718, 0.02812718,
       0.00628931, 0.01257862, 0.00925926, 0.        , 0.01554857,
       0.01554857, 0.00925926, 0.        , 0.00925926, 0.01554857,
       0.00925926, 0.01554857, 0.01554857, 0.01554857, 0.00925926,
       0.01257862, 0.02183788, 0.        , 0.00628931, 0.00925926,
       0.00925926, 0.        , 0.02812718, 0.02812718, 0.01554857,
       0.00925926, 0.01257862, 0.00628931, 0.00628931, 0.        ,
       0.        , 0.        , 0.01257862, 0.01257862, 0.00628931,
       0.00628931, 0.00628931, 0.        , 0.01886792, 0.01886792,
       0.00628931, 0.00628931, 0.00628931])

In [20]:
document_vector_tfidf

array([0.51415463, 0.07121161, 0.27656899, 0.        , 0.34779873,
       0.07254365, 0.        , 0.        , 0.08302616, 0.09588796,
       0.0290603 , 0.04229549, 0.01115623, 0.        , 0.07728029,
       0.05543456, 0.02773684, 0.        , 0.0420934 , 0.03519483,
       0.02184315, 0.03266809, 0.04616814, 0.03875599, 0.01840298,
       0.02506917, 0.06191121, 0.        , 0.04842098, 0.05248307,
       0.04153007, 0.        , 0.06205771, 0.07746551, 0.0615242 ,
       0.01006755, 0.04229549, 0.03267674, 0.05374827, 0.        ,
       0.        , 0.        , 0.02506917, 0.02500028, 0.04017597,
       0.03639457, 0.05810708, 0.        , 0.05138533, 0.04699861,
       0.05810708, 0.03293664, 0.03336833])

In [21]:
train_texts = [[token for token in ds['Token'].to_list() if token in token_frequencies.keys() and is_valid_token(token, token_frequencies)] for ds in train]

In [22]:
train_texts[0][:10]

['USA',
 'Today',
 'published',
 'article',
 'today',
 'egregiously',
 'misleading',
 'every',
 'hotel',
 'leaves']

In [23]:
model = gensim.models.Word2Vec(sentences=train_texts, vector_size=30, window=5, min_count=2, workers=4)

In [24]:
Path('../models/').mkdir(parents=True, exist_ok=True)
model_path = '../models/word2vec.model'
model.save(model_path)

In [25]:
print('Word:', token_frequencies['Monday'])
print('Close:', token_frequencies['Tuesday'], token_frequencies['Wednesday'], token_frequencies['Thursday'])
print('Same area', token_frequencies['weekend'], token_frequencies['day'], token_frequencies['week'])
print('Other semantic', token_frequencies['funds'], token_frequencies['town'], token_frequencies['territory'])

Word: 6980
Close: 7964 7500 7296
Same area 1193 5844 9020
Other semantic 1550 1575 994


In [26]:
print('Word:', token_frequencies['north'])
print('Close:', token_frequencies['south'], token_frequencies['west'], token_frequencies['east'])
print('Same area', token_frequencies['world'], token_frequencies['side'], token_frequencies['direction'])
print('Other semantic', token_frequencies['party'], token_frequencies['senator'], token_frequencies['husband'])

Word: 615
Close: 569 377 419
Same area 6777 2008 559
Other semantic 9548 1384 1331


In [27]:
print('Word:', token_frequencies['Spain'])
print('Close:', token_frequencies['Madrid'], token_frequencies['Catalonia'], token_frequencies['Europe'])
print('Same area', token_frequencies['Brexit'], token_frequencies['kingdom'], token_frequencies['EU'])
print('Other semantic', token_frequencies['Trump'], token_frequencies['Twitter'], token_frequencies['Korea'])

Word: 955
Close: 437 771 2270
Same area 1492 291 4043
Other semantic 96486 5347 6240


In [28]:
def cosine_similarity(vec_a, vec_b):
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot_product / (norm_a * norm_b)


words_to_analyze = ['Monday', 'north', 'Spain']
similar_words = {
    'Monday': ['Tuesday', 'Wednesday', 'Thursday'], 
    'north': ['south', 'west', 'east'],
    'Spain': ['Madrid', 'Catalonia', 'Europe']
}

related_words = {
    'Monday': ['weekend', 'day', 'week'], 
    'north': ['world', 'side', 'direction'],
    'Spain': ['Brexit', 'kingdom', 'EU']
}

unrelated_words = {
    'Monday': ['funds', 'town', 'territory'], 
    'north': ['party', 'senator', 'husband'],
    'Spain': ['Trump', 'Twitter', 'Korea']
}

for word in words_to_analyze:
    word_vec = model.wv[word]
    print(f'Cosine distances for "{word}":')
    for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
        distances = {target_word: cosine_similarity(word_vec, model.wv[target_word]) for target_word in words[word]}
        print(f'\t{group}: {distances}')

Cosine distances for "Monday":
	Similar: {'Tuesday': 0.9737339, 'Wednesday': 0.9858618, 'Thursday': 0.982888}
	Related: {'weekend': 0.56576025, 'day': 0.46691594, 'week': 0.5616831}
	Unrelated: {'funds': 0.049275674, 'town': 0.33820704, 'territory': -0.00015503431}
Cosine distances for "north":
	Similar: {'south': 0.94738543, 'west': 0.95018405, 'east': 0.9475872}
	Related: {'world': 0.2151862, 'side': 0.24820313, 'direction': -0.025277833}
	Unrelated: {'party': 0.033860423, 'senator': -0.18111715, 'husband': -0.1852833}
Cosine distances for "Spain":
	Similar: {'Madrid': 0.8731296, 'Catalonia': 0.9340388, 'Europe': 0.5712906}
	Related: {'Brexit': 0.5921776, 'kingdom': 0.27127314, 'EU': 0.59468395}
	Unrelated: {'Trump': -0.14517818, 'Twitter': -0.21532726, 'Korea': 0.03403758}


In [29]:
for word in words_to_analyze:
    word_vec = get_term_document_vector(word, term_document_matrix)
    print(f'Cosine distances for "{word}":')
    for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
        distances = {target_word: cosine_similarity(word_vec, get_term_document_vector(target_word, term_document_matrix)) for target_word in words[word]}
        print(f'\t{group}: {distances}')

Cosine distances for "Monday":
	Similar: {'Tuesday': 0.21058447533038976, 'Wednesday': 0.09644222814763143, 'Thursday': 0.0839318871089539}
	Related: {'weekend': 0.13976112455496112, 'day': 0.1461953071514957, 'week': 0.23562990612492185}
	Unrelated: {'funds': 0.05497001853953496, 'town': 0.05789731268175128, 'territory': 0.07635427079184337}
Cosine distances for "north":
	Similar: {'south': 0.22760009183170773, 'west': 0.17512123312849956, 'east': 0.10182192622997722}
	Related: {'world': 0.03645647388968974, 'side': 0.07851380774439148, 'direction': 0.03168753817050964}
	Unrelated: {'party': 0.030857041733275928, 'senator': 0.005178081383343333, 'husband': 0.014101017404050889}
Cosine distances for "Spain":


	Similar: {'Madrid': 0.6871109854186562, 'Catalonia': 0.7871924013529985, 'Europe': 0.05470590470976215}
	Related: {'Brexit': 0.004884680873595118, 'kingdom': 0.0031832043942822408, 'EU': 0.07276382992608514}
	Unrelated: {'Trump': 0.004025144580344135, 'Twitter': 0.01138766491923137, 'Korea': 0.004247065514542696}


In [30]:
term_document_df = np.zeros((len(token_frequencies), len(term_document_matrix)))

In [31]:
term_document_df.shape

(78759, 30870)

In [32]:
for i, term in enumerate(token_frequencies.keys()):
    if i % 1000 == 0:
        print(i)
    term_document_df[i, :] = np.array(get_term_document_vector(term, term_document_matrix), dtype=np.float16)

0


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000


In [33]:
# term_document_df_ = term_document_df[:5000, :]

In [34]:
from sklearn.decomposition import PCA


n_components = 30

pca = PCA(n_components=n_components)
reduced_tfidf_vectors = pca.fit_transform(term_document_df)
reduced_tfidf_vectors.shape

(78759, 30)

In [35]:
np.save('../assets/reduced_tfidf_vectors.npy', reduced_tfidf_vectors)

In [36]:
reduced_tfidf_vectors = pd.DataFrame.from_records(reduced_tfidf_vectors)

In [37]:
reduced_tfidf_vectors.index = list(token_frequencies.keys())

In [38]:
reduced_tfidf_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
bleach,-0.530313,0.193183,0.133454,0.080183,0.06521,-0.00483,0.048303,0.029062,-0.03411,0.022189,...,0.012084,0.005808,-0.000485,-0.010206,0.007498,-0.008831,0.018373,-0.012391,-0.011868,-0.016482
councilor,-0.524775,0.043473,0.194606,0.103258,0.041672,-0.026342,0.0284,-0.006103,-0.045053,-0.035502,...,-0.003385,-0.04653,-0.053155,0.044384,-0.013902,0.012952,0.087045,0.063328,-0.031316,0.022185
beachgoers,-0.596811,0.135316,0.182437,0.107843,0.032547,0.001504,0.013508,0.022382,-0.028621,0.017486,...,-0.013817,-0.000462,-0.011339,-0.003229,0.003759,-0.003257,0.000793,0.006532,-0.015685,0.003926
complexities,-0.513003,0.047696,0.218139,0.108336,0.030107,0.007141,-0.049194,-0.0174,-0.051327,0.002307,...,-0.031288,0.028371,0.025444,0.07311,-0.023945,-0.053308,0.017267,-0.001586,0.023437,-0.036362
immaculate,-0.597218,0.136648,0.183605,0.108875,0.031585,0.006128,0.009378,0.02954,-0.027701,0.018066,...,-0.016245,0.000813,-0.019077,-0.009499,-0.002077,-0.016662,0.00105,0.014233,-0.020286,-0.001692


In [39]:
for word in words_to_analyze:
    try:
        word_vec = reduced_tfidf_vectors.loc[word]
        print(f'Cosine distances for "{word}":')
        for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
            distances = {target_word: cosine_similarity(word_vec, reduced_tfidf_vectors.loc[target_word]) for target_word in words[word]}
            print(f'\t{group}: {distances}')
    except:
        print('no words')

Cosine distances for "Monday":
	Similar: {'Tuesday': 0.9463421027556389, 'Wednesday': 0.95330938028016, 'Thursday': 0.9498681940518182}
	Related: {'weekend': 0.7917631504620743, 'day': 0.6689732843296009, 'week': 0.922784135446442}
	Unrelated: {'funds': 0.47365928786747735, 'town': 0.5540664941481236, 'territory': 0.39525033605907406}
Cosine distances for "north":
	Similar: {'south': 0.9521470690765715, 'west': 0.84738123213278, 'east': 0.8190188816827702}
	Related: {'world': 0.2527760210443455, 'side': 0.5479573714605677, 'direction': 0.2820184757323661}
	Unrelated: {'party': 0.17675241508013234, 'senator': 0.0188781305024143, 'husband': 0.1370231708485331}
Cosine distances for "Spain":
	Similar: {'Madrid': 0.9736261074047513, 'Catalonia': 0.9760077401932337, 'Europe': 0.4468716645848769}
	Related: {'Brexit': 0.2684848346240149, 'kingdom': 0.1617980761620471, 'EU': 0.2627454083935922}
	Unrelated: {'Trump': -0.0017539808014249952, 'Twitter': 0.05002026298625143, 'Korea': 0.020863465795

In [40]:
def vectorize_with_w2v(text, model):
    tokenized_sentences = preprocess_text(text)
    sentence_vectors = []
    
    for sentence in tokenized_sentences:
        word_vectors = []
        
        for word in sentence:
            if word in model.wv.key_to_index:
                word_vector = model.wv[word]
                word_vectors.append(word_vector)
                
        if word_vectors:
                sentence_vector = np.mean(word_vectors, axis=0)
                sentence_vectors.append(sentence_vector)

    if sentence_vectors:
        document_vector = np.mean(sentence_vectors, axis=0)
        return document_vector
    else:
        return np.zeros(model.vector_size)

In [41]:
text = ' '.join(train_texts[0])
print(text)
print(vectorize_with_w2v(text, model).shape)

(30,)


In [43]:
test_texts = [[token for token in ds['Token'].to_list() if token in token_frequencies.keys() and is_valid_token(token, token_frequencies)] for ds in test]

In [46]:
test_texts[0][:10]

['One',
 'major',
 'things',
 'Donald',
 'Trump',
 'speech',
 'NATO',
 'summit',
 'shocked',
 'everyone']

In [47]:
test_vectors = [vectorize_with_w2v(' '.join(text), model) for text in test_texts]

In [48]:
test_vectors[0].shape

(30,)

In [49]:
with open('../assets/annotated-corpus/test-embeddings.tsv', 'w') as file:
    for doc_id, vector in zip(test_ids, test_vectors):
        vector_str = '\t'.join(map(str, vector))
        file.write(f'{doc_id}\t{vector_str}\n')