# Laboratory work #3 (text vectorization)

In [1]:
import re
import os
from collections import defaultdict, Counter
import string

from pathlib import Path
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


import matplotlib.pyplot as plt
from pandas.errors import EmptyDataError
import numpy as np
from math import log1p
import gensim

[nltk_data] Downloading package punkt to /Users/aleksei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_files(root_dir, n=None):
    file_paths = []
    
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.tsv'):
                file_paths.append(os.path.join(subdir, file))

    data = []
    for i, file_path in enumerate(file_paths):
        if n is not None and i >= n:
            break
        try:
            d = pd.read_csv(file_path, sep='\t', header=None)
            d.columns = ['Token', 'Stem', 'Lemma']
        except EmptyDataError as e:
            print(i, file_path, e)
        data.append(d.dropna())
        
    
    ids = [os.path.splitext(os.path.basename(path))[0] for path in file_paths]
    return ids, data

In [4]:
train_ids, train = read_files('../assets/annotated-corpus/train', 
                   1000
                   )
# val_ids, val = read_files('../assets/annotated-corpus/val', 
#                 #  100
#                  )
test_ids, test = read_files('../assets/annotated-corpus/test', 
                #   100
                  )

2209 ../assets/annotated-corpus/test/fake/1382.tsv No columns to parse from file


In [5]:
train[0].head()

Unnamed: 0,Token,Stem,Lemma
0,WASHINGTON,washington,WASHINGTON
1,Reuters,reuter,Reuters
2,-,-,-
3,The,the,The
4,U,u,U


In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
def is_valid_token(token, token_frequencies, min_frequency=2):
    if token in string.punctuation:
        return False
    if token.lower() in stop_words:
        return False
    if token_frequencies[token] < min_frequency:
        return False
    return True


def get_freqs(dfs):
    token_frequencies = Counter()
    term_document_matrix = defaultdict(lambda: defaultdict(int))

    for doc_id, df in enumerate(dfs):
        tokens = df['Token'].tolist()
        token_frequencies.update(tokens)

        for token in tokens:
            if is_valid_token(token, token_frequencies):
                term_document_matrix[doc_id][token] += 1
                
    # filter all tokens that return is_valid_token False
    token_frequencies = Counter(dict({(token, freq) for (token, freq) in token_frequencies.items() if is_valid_token(token, token_frequencies)}))

    for doc_id, terms in term_document_matrix.items():
        term_document_matrix[doc_id] = {token: freq for token, freq in terms.items() if is_valid_token(token, token_frequencies)}

    return token_frequencies, term_document_matrix

In [8]:
token_frequencies, term_document_matrix = get_freqs(train)
token_frequencies.most_common(20)

[('said', 4498),
 ('Trump', 2440),
 ('U', 1938),
 ('would', 1439),
 ('Reuters', 1368),
 ('President', 882),
 ('government', 843),
 ('Republican', 751),
 ('also', 705),
 ('House', 697),
 ('United', 685),
 ('people', 658),
 ('told', 646),
 ('could', 644),
 ('state', 598),
 ('States', 587),
 ('percent', 543),
 ('year', 534),
 ('two', 520),
 ('last', 516)]

In [9]:
term_document_matrix[0]

{'Reuters': 3,
 'U': 3,
 'State': 3,
 'Department': 2,
 'certified': 2,
 'government': 3,
 'corruption': 3,
 'rights': 2,
 'Honduras': 8,
 'receive': 2,
 'millions': 2,
 'dollars': 2,
 'aid': 2,
 'document': 2,
 'seen': 3,
 'showed': 2,
 'Monday': 2,
 'election': 3,
 'violent': 2,
 'winner': 2,
 'week': 2,
 'certification': 2,
 'congressional': 2,
 'President': 4,
 'Trump': 2,
 'administration': 2,
 'taking': 3,
 'one': 3,
 'officials': 2,
 'requirements': 2,
 'Congress': 2,
 'governments': 2,
 'including': 2,
 'former': 4,
 'opposition': 2,
 'Salvador': 2,
 'victory': 2,
 'Hernandez': 4,
 'Nasralla': 3}

In [9]:
data_dir = Path('../assets/data/')
data_dir.mkdir(parents=True, exist_ok=True)
with open(data_dir / 'token_frequencies.tsv', 'w', encoding='utf-8') as file:
    for token, freq in token_frequencies.items():
        if is_valid_token(token, token_frequencies):
            file.write(f'{token}\t{freq}\n')

with open(data_dir / 'term_document_matrix.tsv', 'w', encoding='utf-8') as file:
    for doc_id, terms in term_document_matrix.items():
        for token, freq in terms.items():
            file.write(f'{doc_id}\t{token}\t{freq}\n')

In [10]:
data_dir = Path('../assets/data/')

token_frequencies = {}
with open(data_dir / 'token_frequencies.tsv', 'r', encoding='utf-8') as file:
    for line in file:
        token, freq = line.strip().split('\t')
        token_frequencies[token] = int(freq)

term_document_matrix = {}
with open(data_dir / 'term_document_matrix.tsv', 'r', encoding='utf-8') as file:
    for line in file:
        doc_id, token, freq = line.strip().split('\t')
        doc_id = int(doc_id)
        freq = int(freq)
        if doc_id not in term_document_matrix:
            term_document_matrix[doc_id] = {}
        term_document_matrix[doc_id][token] = freq

In [10]:
term_document_matrix[0]

{'Reuters': 3,
 'U': 3,
 'State': 3,
 'Department': 2,
 'certified': 2,
 'government': 3,
 'corruption': 3,
 'rights': 2,
 'Honduras': 8,
 'receive': 2,
 'millions': 2,
 'dollars': 2,
 'aid': 2,
 'document': 2,
 'seen': 3,
 'showed': 2,
 'Monday': 2,
 'election': 3,
 'violent': 2,
 'winner': 2,
 'week': 2,
 'certification': 2,
 'congressional': 2,
 'President': 4,
 'Trump': 2,
 'administration': 2,
 'taking': 3,
 'one': 3,
 'officials': 2,
 'requirements': 2,
 'Congress': 2,
 'governments': 2,
 'including': 2,
 'former': 4,
 'opposition': 2,
 'Salvador': 2,
 'victory': 2,
 'Hernandez': 4,
 'Nasralla': 3}

In [11]:
def get_term_document_vector(token, term_document_matrix):
    vector = []
    for k, v in term_document_matrix.items():
        freq = v.get(token, 0)
        vector.append(freq)
    return vector

In [12]:
get_term_document_vector('Reuters', term_document_matrix)[:5]

[3, 1, 1, 1, 1]

In [13]:
get_term_document_vector('cat', term_document_matrix)[:5]

[0, 0, 0, 0, 0]

In [14]:
def preprocess_text(text):
    def split_into_sentences(text):
        # so the website will not split into two separate sentences by comma:
        sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
        sentences = sentence_endings.split(text)
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences
    
    def split_into_words(sentences):
        # regular expression to match complex URLs, simple URLs, hashtags, Twitter handles, and words
        word_pattern = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|-?\w+\'?\w*')
        tokenized_sentences = []
        for sentence in sentences:
            words = word_pattern.findall(sentence)
            tokenized_sentences.append(words)
        return tokenized_sentences

    sentences = split_into_sentences(text)
    tokenized = split_into_words(sentences)
    return tokenized

In [15]:
def compute_tf(sentence_tokens, token):
    return sentence_tokens.count(token) / len(sentence_tokens)


def compute_idf(token, term_document_matrix, total_documents):
    doc_count = sum(1 for doc in term_document_matrix if token in term_document_matrix[doc])
    return log1p(total_documents / (1 + doc_count))


def process_text_and_create_matrices(text, token_frequencies, term_document_matrix):
    tokenized_sentences = preprocess_text(text)
    total_documents = len(term_document_matrix)
    vocabulary = sorted(token_frequencies.keys())

    max_sentence_length = max(len(sentence) for sentence in tokenized_sentences)

    frequency_matrix = []
    tfidf_matrix = []

    for sentence in tokenized_sentences:
        sentence_freq_vector = [0] * max_sentence_length
        sentence_tfidf_vector = [0] * max_sentence_length

        for i, token in enumerate(sentence):
            if token in vocabulary:
                tf = compute_tf(sentence, token)
                idf = compute_idf(token, term_document_matrix, total_documents)

                sentence_freq_vector[i] = tf
                sentence_tfidf_vector[i] = tf * idf

        frequency_matrix.append(sentence_freq_vector)
        tfidf_matrix.append(sentence_tfidf_vector)

    frequency_matrix = np.array(frequency_matrix)
    tfidf_matrix = np.array(tfidf_matrix)

    document_vector_freq = np.mean(frequency_matrix, axis=0)
    document_vector_tfidf = np.mean(tfidf_matrix, axis=0)

    return document_vector_freq, document_vector_tfidf

In [16]:
text = 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '
print(text)

Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit 


In [17]:
document_vector_freq, document_vector_tfidf = process_text_and_create_matrices(text, token_frequencies, term_document_matrix)
document_vector_freq.shape, document_vector_tfidf.shape

((53,), (53,))

In [18]:
document_vector_freq

array([0.        , 0.00925926, 0.04761905, 0.        , 0.00925926,
       0.00628931, 0.        , 0.        , 0.02812718, 0.02812718,
       0.        , 0.01257862, 0.00925926, 0.        , 0.00925926,
       0.00925926, 0.00925926, 0.        , 0.00925926, 0.01554857,
       0.00925926, 0.01554857, 0.01554857, 0.01554857, 0.00925926,
       0.01257862, 0.02183788, 0.        , 0.        , 0.00925926,
       0.00925926, 0.        , 0.02812718, 0.02812718, 0.00925926,
       0.00925926, 0.01257862, 0.00628931, 0.        , 0.        ,
       0.        , 0.        , 0.01257862, 0.01257862, 0.        ,
       0.00628931, 0.        , 0.        , 0.01886792, 0.01886792,
       0.        , 0.        , 0.        ])

In [19]:
document_vector_tfidf

array([0.        , 0.03336265, 0.29602886, 0.        , 0.03304673,
       0.02064435, 0.        , 0.        , 0.08169652, 0.08886944,
       0.        , 0.04489367, 0.00876609, 0.        , 0.04185926,
       0.01586091, 0.02994782, 0.        , 0.04106275, 0.03311441,
       0.02228737, 0.03041956, 0.04741797, 0.04646397, 0.02155723,
       0.02993596, 0.0652484 , 0.        , 0.        , 0.04910468,
       0.03596311, 0.        , 0.06131343, 0.07238644, 0.01099263,
       0.01077756, 0.04489367, 0.03475128, 0.        , 0.        ,
       0.        , 0.        , 0.02993596, 0.02928529, 0.        ,
       0.03125044, 0.        , 0.        , 0.05196293, 0.04286184,
       0.        , 0.        , 0.        ])

In [20]:
train_texts = [[token for token in ds['Token'].to_list() if token in token_frequencies.keys() and is_valid_token(token, token_frequencies)] for ds in train]

In [21]:
train_texts[0][:10]

['WASHINGTON',
 'Reuters',
 'U',
 'State',
 'Department',
 'certified',
 'Honduran',
 'government',
 'fighting',
 'corruption']

In [22]:
model = gensim.models.Word2Vec(sentences=train_texts, vector_size=30, window=5, min_count=2, workers=4)

In [23]:
Path('../models/').mkdir(parents=True, exist_ok=True)
model_path = '../models/word2vec.model'
model.save(model_path)

In [24]:
print('Word:', token_frequencies['Monday'])
print('Close:', token_frequencies['Tuesday'], token_frequencies['Wednesday'], token_frequencies['Thursday'])
print('Same area', token_frequencies['weekend'], token_frequencies['day'], token_frequencies['week'])
print('Other semantic', token_frequencies['funds'], token_frequencies['town'], token_frequencies['territory'])

Word: 351
Close: 366 340 369
Same area 32 136 355
Other semantic 54 54 51


In [25]:
print('Word:', token_frequencies['north'])
print('Close:', token_frequencies['south'], token_frequencies['west'], token_frequencies['east'])
print('Same area', token_frequencies['world'], token_frequencies['side'], token_frequencies['direction'])
print('Other semantic', token_frequencies['party'], token_frequencies['senator'], token_frequencies['husband'])

Word: 30
Close: 25 13 11
Same area 197 59 12
Other semantic 357 51 46


In [26]:
print('Word:', token_frequencies['Spain'])
print('Close:', token_frequencies['Madrid'], token_frequencies['Catalonia'], token_frequencies['Europe'])
print('Same area', token_frequencies['Brexit'], token_frequencies['kingdom'], token_frequencies['EU'])
print('Other semantic', token_frequencies['Trump'], token_frequencies['Twitter'], token_frequencies['Korea'])

Word: 53
Close: 19 49 90
Same area 81 12 190
Other semantic 2440 103 282


In [27]:
def cosine_similarity(vec_a, vec_b):
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot_product / (norm_a * norm_b)


words_to_analyze = ['Monday', 'north', 'Spain']
similar_words = {
    'Monday': ['Tuesday', 'Wednesday', 'Thursday'], 
    'north': ['south', 'west', 'east'],
    'Spain': ['Madrid', 'Catalonia', 'Europe']
}

related_words = {
    'Monday': ['weekend', 'day', 'week'], 
    'north': ['world', 'side', 'direction'],
    'Spain': ['Brexit', 'kingdom', 'EU']
}

unrelated_words = {
    'Monday': ['funds', 'town', 'territory'], 
    'north': ['party', 'senator', 'husband'],
    'Spain': ['Trump', 'Twitter', 'Korea']
}

for word in words_to_analyze:
    word_vec = model.wv[word]
    print(f'Cosine distances for "{word}":')
    for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
        distances = {target_word: cosine_similarity(word_vec, model.wv[target_word]) for target_word in words[word]}
        print(f'\t{group}: {distances}')

Cosine distances for "Monday":
	Similar: {'Tuesday': 0.98521554, 'Wednesday': 0.9956391, 'Thursday': 0.99442405}
	Related: {'weekend': 0.9199952, 'day': 0.9706025, 'week': 0.8846852}
	Unrelated: {'funds': 0.8974621, 'town': 0.9121232, 'territory': 0.9063075}
Cosine distances for "north":
	Similar: {'south': 0.9950781, 'west': 0.9908334, 'east': 0.9901164}
	Related: {'world': 0.9888181, 'side': 0.9933237, 'direction': 0.9749533}
	Unrelated: {'party': 0.96269864, 'senator': 0.9135714, 'husband': 0.9536501}
Cosine distances for "Spain":
	Similar: {'Madrid': 0.9944941, 'Catalonia': 0.99844, 'Europe': 0.9946334}
	Related: {'Brexit': 0.99536973, 'kingdom': 0.9904338, 'EU': 0.98713017}
	Unrelated: {'Trump': 0.7496816, 'Twitter': 0.9833913, 'Korea': 0.5986631}


In [28]:
for word in words_to_analyze:
    word_vec = get_term_document_vector(word, term_document_matrix)
    print(f'Cosine distances for "{word}":')
    for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
        distances = {target_word: cosine_similarity(word_vec, get_term_document_vector(target_word, term_document_matrix)) for target_word in words[word]}
        print(f'\t{group}: {distances}')

Cosine distances for "Monday":
	Similar: {'Tuesday': 0.24013497172980114, 'Wednesday': 0.09500680105219181, 'Thursday': 0.12213416677724975}
	Related: {'weekend': 0.20891954827216885, 'day': 0.20346547637930487, 'week': 0.29776708853401584}
	Unrelated: {'funds': 0.10768513954141816, 'town': 0.1068187511177002, 'territory': 0.09110917257960921}
Cosine distances for "north":
	Similar: {'south': 0.24262373970773643, 'west': 0.04075695729696112, 'east': 0.14467284665112365}
	Related: {'world': 0.022649412708342614, 'side': 0.04933303124557308, 'direction': 0.13794014696151088}
	Unrelated: {'party': 0.0571824055100569, 'senator': 0.0, 'husband': 0.0}
Cosine distances for "Spain":
	Similar: {'Madrid': 0.6707700464776317, 'Catalonia': 0.8567311328463756, 'Europe': 0.09002191295904333}
	Related: {'Brexit': 0.0, 'kingdom': 0.0, 'EU': 0.09194621406122841}
	Unrelated: {'Trump': 0.0019796542475018055, 'Twitter': 0.0, 'Korea': 0.0}


In [29]:
term_document_df = np.zeros((len(token_frequencies), len(term_document_matrix)))

In [30]:
term_document_df.shape

(13011, 1000)

In [31]:
for i, term in enumerate(token_frequencies.keys()):
    if i % 1000 == 0:
        print(i)
    term_document_df[i, :] = np.array(get_term_document_vector(term, term_document_matrix), dtype=np.float16)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000


In [32]:
# term_document_df_ = term_document_df[:5000, :]

In [33]:
from sklearn.decomposition import PCA


n_components = 30

pca = PCA(n_components=n_components)
reduced_tfidf_vectors = pca.fit_transform(term_document_df)
reduced_tfidf_vectors.shape

(13011, 30)

In [34]:
np.save('../assets/reduced_tfidf_vectors.npy', reduced_tfidf_vectors)

In [35]:
reduced_tfidf_vectors = pd.DataFrame.from_records(reduced_tfidf_vectors)

In [37]:
reduced_tfidf_vectors.index = list(token_frequencies.keys())

In [38]:
reduced_tfidf_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
cremation,-0.540339,0.035638,0.010455,-0.022096,-0.032935,0.03401,-0.002652,-0.023045,0.039716,-0.0173,...,0.146327,-0.020718,0.114045,-0.042284,-0.115863,-0.080562,0.086859,-0.021938,-0.055345,-0.150202
Neue,-0.544954,0.030545,-0.013619,-0.037163,-0.073567,0.012588,-0.030445,-0.00217,-0.073183,-0.046952,...,-0.084547,-0.10278,-0.073078,0.156571,0.011845,-0.014279,-0.044121,0.122493,0.055656,0.065843
precise,-0.432102,0.074185,-0.01365,-0.039179,-0.124273,-0.045746,-0.127163,-0.201595,-0.080385,-0.145168,...,0.198775,0.161539,0.009426,-0.088758,0.130933,-0.167473,0.098699,-0.08639,-0.023247,-0.138215
weather,-0.309921,-0.144929,-0.022938,-0.008633,-0.030558,0.052401,-0.186585,0.165663,0.002704,0.133029,...,-0.036045,0.052101,0.18194,0.026502,-0.119375,-0.097576,0.050044,0.142145,-0.136805,-0.06983
struggling,-0.199181,-0.119069,0.102443,0.049092,0.149471,0.020211,-0.056222,0.140698,-0.107173,-0.066946,...,-0.061467,-0.208787,0.604421,-0.092057,0.029561,-0.100921,-0.229308,-0.069991,0.157938,0.237455


In [39]:
for word in words_to_analyze:
    try:
        word_vec = reduced_tfidf_vectors.loc[word]
        print(f'Cosine distances for "{word}":')
        for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
            distances = {target_word: cosine_similarity(word_vec, reduced_tfidf_vectors.loc[target_word]) for target_word in words[word]}
            print(f'\t{group}: {distances}')
    except:
        print('no words')

Cosine distances for "Monday":
	Similar: {'Tuesday': 0.7412937971336343, 'Wednesday': 0.742750490343868, 'Thursday': 0.7709688174526034}
	Related: {'weekend': 0.447151308208404, 'day': 0.6517535306402553, 'week': 0.730666778204977}
	Unrelated: {'funds': 0.3396216055992591, 'town': 0.5495935383936134, 'territory': 0.32590710734520684}
Cosine distances for "north":
	Similar: {'south': 0.7228655236836695, 'west': 0.36476536750590405, 'east': 0.44368036203785716}
	Related: {'world': 0.15237492504146236, 'side': 0.506183825414011, 'direction': -0.07216604762507903}
	Unrelated: {'party': 0.03762348843509121, 'senator': -0.06554823839538687, 'husband': -0.08733275792774}
Cosine distances for "Spain":
	Similar: {'Madrid': 0.9092780714144346, 'Catalonia': 0.9903170930767478, 'Europe': 0.3029619019984378}
	Related: {'Brexit': 0.21044026434925178, 'kingdom': 0.022636910888994183, 'EU': 0.2812210798714555}
	Unrelated: {'Trump': -0.07953689601364596, 'Twitter': -0.14423247694889688, 'Korea': -0.035

In [40]:
def vectorize_with_w2v(text, model):
    tokenized_sentences = preprocess_text(text)
    sentence_vectors = []
    
    for sentence in tokenized_sentences:
        word_vectors = []
        
        for word in sentence:
            if word in model.wv.key_to_index:
                word_vector = model.wv[word]
                word_vectors.append(word_vector)
                
        if word_vectors:
                sentence_vector = np.mean(word_vectors, axis=0)
                sentence_vectors.append(sentence_vector)

    if sentence_vectors:
        document_vector = np.mean(sentence_vectors, axis=0)
        return document_vector
    else:
        return np.zeros(model.vector_size)


In [41]:
text = ' '.join(train_texts[0])
print(text)
print(vectorize_with_w2v(text, model).shape)

WASHINGTON Reuters U State Department certified Honduran government fighting corruption supporting human rights clearing way Honduras receive millions dollars U aid document seen Reuters showed document dated Nov 28 seen Reuters Monday showed Secretary State Rex Tillerson certified Honduras assistance two days controversial presidential election claimed ally Washington Honduras faced violent protests disputed results election still produced clear winner week vote ended decision issue certification prompted concern congressional Democrats Republican President Donald Trump administration could seen taking sides kind message send one congressional aide asked State Department officials immediate response questioned timing certification Honduras required fulfill dozen requirements order receive share million U Congress program assist Central American governments Among requirements combating corruption including investigating prosecuting current former government officials alleged corrupt pr

In [42]:
test_texts = [[token for token in ds['Token'].to_list() if is_valid_token(token, token_frequencies)] for ds in test]

In [43]:
test_vectors = [vectorize_with_w2v(' '.join(text), model) for text in test_texts]

In [44]:
test_vectors[0].shape

(30,)

In [285]:
with open('../assets/annotated-corpus/test-embeddings.tsv', 'w') as file:
    for doc_id, vector in zip(test_ids, test_vectors):
        vector_str = '\t'.join(map(str, vector))
        file.write(f'{doc_id}\t{vector_str}\n')