# Laboratory work #3 (text vectorization)

In [1]:
import re
import os
from collections import defaultdict, Counter
import string

from pathlib import Path
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


import matplotlib.pyplot as plt
from pandas.errors import EmptyDataError
import numpy as np
from math import log1p
import gensim

[nltk_data] Downloading package punkt to /Users/aleksei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def read_files(root_dir, n=None):
    file_paths = []
    
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.tsv'):
                file_paths.append(os.path.join(subdir, file))

    data = []
    for i, file_path in enumerate(file_paths):
        if n is not None and i >= n:
            break
        try:
            d = pd.read_csv(file_path, sep='\t', header=None)
            d.columns = ['Token', 'Stem', 'Lemma']
        except EmptyDataError as e:
            print(i, file_path, e)
        data.append(d.dropna())
    return data

In [6]:
train = read_files('../assets/annotated-corpus/train', 
                   1000
                   )
# val = read_files('../assets/annotated-corpus/val', 
#                 #  100
#                  )
# test = read_files('../assets/annotated-corpus/test', 
#                 #   100
#                   )

In [7]:
train[0].head()

Unnamed: 0,Token,Stem,Lemma
0,WASHINGTON,washington,WASHINGTON
1,Reuters,reuter,Reuters
2,-,-,-
3,The,the,The
4,U,u,U


In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [81]:
def is_valid_token(token, token_frequencies, min_frequency=2):
    if token in string.punctuation:
        return False
    if token.lower() in stop_words:
        return False
    if token_frequencies[token] < min_frequency:
        return False
    return True

def get_freqs(dfs):
    token_frequencies = Counter()
    term_document_matrix = defaultdict(lambda: defaultdict(int))

    for doc_id, df in enumerate(dfs):
        tokens = df['Token'].tolist()
        token_frequencies.update(tokens)

        for token in tokens:
            if is_valid_token(token, token_frequencies):
                term_document_matrix[doc_id][token] += 1
                
    # filter all tokens that return is_valid_token False
    token_frequencies = Counter(dict({(token, freq) for (token, freq) in token_frequencies.items() if is_valid_token(token, token_frequencies)}))

    for doc_id, terms in term_document_matrix.items():
        term_document_matrix[doc_id] = {token: freq for token, freq in terms.items() if is_valid_token(token, token_frequencies)}

    return token_frequencies, term_document_matrix

In [82]:
token_frequencies, term_document_matrix = get_freqs(train)
token_frequencies.most_common(20)

[('said', 4498),
 ('Trump', 2440),
 ('U', 1938),
 ('would', 1439),
 ('Reuters', 1368),
 ('President', 882),
 ('government', 843),
 ('Republican', 751),
 ('also', 705),
 ('House', 697),
 ('United', 685),
 ('people', 658),
 ('told', 646),
 ('could', 644),
 ('state', 598),
 ('States', 587),
 ('percent', 543),
 ('year', 534),
 ('two', 520),
 ('last', 516)]

In [83]:
term_document_matrix[0]

{'Reuters': 3,
 'U': 3,
 'State': 3,
 'Department': 2,
 'certified': 2,
 'government': 3,
 'corruption': 3,
 'rights': 2,
 'Honduras': 8,
 'receive': 2,
 'millions': 2,
 'dollars': 2,
 'aid': 2,
 'document': 2,
 'seen': 3,
 'showed': 2,
 'Monday': 2,
 'election': 3,
 'violent': 2,
 'winner': 2,
 'week': 2,
 'certification': 2,
 'congressional': 2,
 'President': 4,
 'Trump': 2,
 'administration': 2,
 'taking': 3,
 'one': 3,
 'officials': 2,
 'requirements': 2,
 'Congress': 2,
 'governments': 2,
 'including': 2,
 'former': 4,
 'opposition': 2,
 'Salvador': 2,
 'victory': 2,
 'Hernandez': 4,
 'Nasralla': 3}

In [84]:
data_dir = Path('../assets/data/')
data_dir.mkdir(parents=True, exist_ok=True)
with open(data_dir / 'token_frequencies.tsv', 'w', encoding='utf-8') as file:
    for token, freq in token_frequencies.items():
        if is_valid_token(token, token_frequencies):
            file.write(f'{token}\t{freq}\n')

with open(data_dir / 'term_document_matrix.tsv', 'w', encoding='utf-8') as file:
    for doc_id, terms in term_document_matrix.items():
        for token, freq in terms.items():
            file.write(f'{doc_id}\t{token}\t{freq}\n')

In [85]:
def preprocess_text(text):
    def split_into_sentences(text):
        # so the website will not split into two separate sentences by comma:
        sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
        sentences = sentence_endings.split(text)
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences
    
    def split_into_words(sentences):
        # regular expression to match complex URLs, simple URLs, hashtags, Twitter handles, and words
        word_pattern = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|-?\w+\'?\w*')
        tokenized_sentences = []
        for sentence in sentences:
            words = word_pattern.findall(sentence)
            tokenized_sentences.append(words)
        return tokenized_sentences

    sentences = split_into_sentences(text)
    tokenized = split_into_words(sentences)
    return tokenized

In [86]:
def compute_tf(sentence_tokens, token):
    return sentence_tokens.count(token) / len(sentence_tokens)


def compute_idf(token, term_document_matrix, total_documents):
    doc_count = sum(1 for doc in term_document_matrix if token in term_document_matrix[doc])
    return log1p(total_documents / (1 + doc_count))


def process_text_and_create_matrices(text, token_frequencies, term_document_matrix):
    tokenized_sentences = preprocess_text(text)
    total_documents = len(term_document_matrix)
    vocabulary = sorted(token_frequencies.keys())

    max_sentence_length = max(len(sentence) for sentence in tokenized_sentences)

    frequency_matrix = []
    tfidf_matrix = []

    for sentence in tokenized_sentences:
        sentence_freq_vector = [0] * max_sentence_length
        sentence_tfidf_vector = [0] * max_sentence_length

        for i, token in enumerate(sentence):
            if token in vocabulary:
                tf = compute_tf(sentence, token)
                idf = compute_idf(token, term_document_matrix, total_documents)

                sentence_freq_vector[i] = tf
                sentence_tfidf_vector[i] = tf * idf

        frequency_matrix.append(sentence_freq_vector)
        tfidf_matrix.append(sentence_tfidf_vector)

    frequency_matrix = np.array(frequency_matrix)
    tfidf_matrix = np.array(tfidf_matrix)

    document_vector_freq = np.mean(frequency_matrix, axis=0)
    document_vector_tfidf = np.mean(tfidf_matrix, axis=0)

    return document_vector_freq, document_vector_tfidf

In [87]:
text = 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '
print(text)

Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit 


In [139]:
document_vector_freq, document_vector_tfidf = process_text_and_create_matrices(text, token_frequencies, term_document_matrix)
document_vector_freq.shape, document_vector_tfidf.shape

((53,), (53,))

In [140]:
document_vector_freq

array([0.        , 0.00925926, 0.04761905, 0.        , 0.00925926,
       0.00628931, 0.        , 0.        , 0.02812718, 0.02812718,
       0.        , 0.01257862, 0.00925926, 0.        , 0.00925926,
       0.00925926, 0.00925926, 0.        , 0.00925926, 0.01554857,
       0.00925926, 0.01554857, 0.01554857, 0.01554857, 0.00925926,
       0.01257862, 0.02183788, 0.        , 0.        , 0.00925926,
       0.00925926, 0.        , 0.02812718, 0.02812718, 0.00925926,
       0.00925926, 0.01257862, 0.00628931, 0.        , 0.        ,
       0.        , 0.        , 0.01257862, 0.01257862, 0.        ,
       0.00628931, 0.        , 0.        , 0.01886792, 0.01886792,
       0.        , 0.        , 0.        ])

In [141]:
document_vector_tfidf

array([0.        , 0.03336265, 0.29602886, 0.        , 0.03304673,
       0.02064435, 0.        , 0.        , 0.08169652, 0.08886944,
       0.        , 0.04489367, 0.00876609, 0.        , 0.04185926,
       0.01586091, 0.02994782, 0.        , 0.04106275, 0.03311441,
       0.02228737, 0.03041956, 0.04741797, 0.04646397, 0.02155723,
       0.02993596, 0.0652484 , 0.        , 0.        , 0.04910468,
       0.03596311, 0.        , 0.06131343, 0.07238644, 0.01099263,
       0.01077756, 0.04489367, 0.03475128, 0.        , 0.        ,
       0.        , 0.        , 0.02993596, 0.02928529, 0.        ,
       0.03125044, 0.        , 0.        , 0.05196293, 0.04286184,
       0.        , 0.        , 0.        ])

In [92]:
train_texts = [[token for token in ds['Token'].to_list() if is_valid_token(token, token_frequencies)] for ds in train]

In [93]:
train_texts[0][:10]

['WASHINGTON',
 'Reuters',
 'U',
 'State',
 'Department',
 'certified',
 'Honduran',
 'government',
 'fighting',
 'corruption']

In [152]:
model = gensim.models.Word2Vec(sentences=train_texts, vector_size=30, window=5, min_count=5, workers=4)

In [153]:
Path('../models/').mkdir(parents=True, exist_ok=True)
model_path = '../models/word2vec.model'
model.save(model_path)

In [154]:
token_frequencies

Counter({'said': 4498,
         'Trump': 2440,
         'U': 1938,
         'would': 1439,
         'Reuters': 1368,
         'President': 882,
         'government': 843,
         'Republican': 751,
         'also': 705,
         'House': 697,
         'United': 685,
         'people': 658,
         'told': 646,
         'could': 644,
         'state': 598,
         'States': 587,
         'percent': 543,
         'year': 534,
         'two': 520,
         'last': 516,
         'one': 513,
         'election': 498,
         'former': 486,
         'Donald': 485,
         'president': 455,
         'campaign': 443,
         'new': 430,
         'China': 429,
         'Clinton': 428,
         'Obama': 426,
         'country': 419,
         'tax': 409,
         'Senate': 406,
         'White': 404,
         'years': 392,
         'military': 387,
         'including': 377,
         'officials': 372,
         'Thursday': 369,
         'presidential': 368,
         'Tuesday': 366,
        

In [155]:
print('Word:', token_frequencies['Monday'])
print('Close:', token_frequencies['Tuesday'], token_frequencies['Wednesday'], token_frequencies['Thursday'])
print('Same area', token_frequencies['weekend'], token_frequencies['day'], token_frequencies['week'])
print('Other semantic', token_frequencies['funds'], token_frequencies['town'], token_frequencies['territory'])

Word: 351
Close: 366 340 369
Same area 32 136 355
Other semantic 54 54 51


In [156]:
print('Word:', token_frequencies['north'])
print('Close:', token_frequencies['south'], token_frequencies['west'], token_frequencies['east'])
print('Same area', token_frequencies['world'], token_frequencies['side'], token_frequencies['direction'])
print('Other semantic', token_frequencies['party'], token_frequencies['senator'], token_frequencies['husband'])

Word: 30
Close: 25 13 11
Same area 197 59 12
Other semantic 357 51 46


In [157]:
print('Word:', token_frequencies['Spain'])
print('Close:', token_frequencies['Madrid'], token_frequencies['Catalonia'], token_frequencies['Europe'])
print('Same area', token_frequencies['Brexit'], token_frequencies['kingdom'], token_frequencies['EU'])
print('Other semantic', token_frequencies['Trump'], token_frequencies['Twitter'], token_frequencies['Korea'])

Word: 53
Close: 19 49 90
Same area 81 12 190
Other semantic 2440 103 282


In [158]:
def cosine_similarity(vec_a, vec_b):
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot_product / (norm_a * norm_b)


words_to_analyze = ['Monday', 'north', 'Spain']
similar_words = {
    'Monday': ['Tuesday', 'Wednesday', 'Thursday'], 
    'north': ['south', 'west', 'east'],
    'Spain': ['Madrid', 'Catalonia', 'Europe']
}

related_words = {
    'Monday': ['weekend', 'day', 'week'], 
    'north': ['world', 'side', 'direction'],
    'Spain': ['Brexit', 'kingdom', 'EU']
}

unrelated_words = {
    'Monday': ['funds', 'town', 'territory'], 
    'north': ['party', 'senator', 'husband'],
    'Spain': ['Trump', 'Twitter', 'Korea']
}

for word in words_to_analyze:
    word_vec = model.wv[word]
    print(f'Cosine distances for "{word}":')
    for group, words in [('Similar', similar_words), ('Related', related_words), ('Unrelated', unrelated_words)]:
        distances = {target_word: cosine_similarity(word_vec, model.wv[target_word]) for target_word in words[word]}
        print(f'\t{group}: {distances}')

Cosine distances for "Monday":
	Similar: {'Tuesday': 0.9846577, 'Wednesday': 0.9913203, 'Thursday': 0.9923759}
	Related: {'weekend': 0.8830235, 'day': 0.9369939, 'week': 0.75852627}
	Unrelated: {'funds': 0.7935365, 'town': 0.8225489, 'territory': 0.81213}
Cosine distances for "north":
	Similar: {'south': 0.9954677, 'west': 0.9939249, 'east': 0.9942235}
	Related: {'world': 0.975929, 'side': 0.98762476, 'direction': 0.9896573}
	Unrelated: {'party': 0.8913339, 'senator': 0.85640633, 'husband': 0.93527836}
Cosine distances for "Spain":
	Similar: {'Madrid': 0.9910841, 'Catalonia': 0.9976725, 'Europe': 0.9921143}
	Related: {'Brexit': 0.99399817, 'kingdom': 0.984493, 'EU': 0.9867774}
	Unrelated: {'Trump': 0.60371745, 'Twitter': 0.9568, 'Korea': 0.51787007}
