In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import (CountVectorizer)

In [2]:
df = pd.read_csv('../data/tokenized_extrovert.csv', engine='pyarrow')
df.head(10)

Unnamed: 0,author_id,label,tokens
0,t2_2hrxxs28,0,"['question', ',', 'doctor', ',', 'how', ""'d"", ..."
1,t2_2hrxxs28,0,"['butt', 'covid', '+', 'cycle', '.', 'i', ""'m""..."
2,t2_2hrxxs28,0,"['different', 'doctors', '.', 'situation', 'su..."
3,t2_4pxpgwz,0,"['thought', 'pebbleyeet', 'guy', 'autistic', '..."
4,t2_4pxpgwz,0,"['…', 'i', 'always', 'end', 'voting', 'wrong',..."
5,t2_4pxpgwz,0,"['made', 'feel', 'lot', 'better', '.', 'ooh', ..."
6,t2_4pxpgwz,0,"['mouth', ',', 'you', '’d', 'panic', 'attack',..."
7,t2_4pxpgwz,0,"['did', 'nt', 'read', 'top', 'half', 'bc', 'cr..."
8,t2_4pxpgwz,0,"['hot', '?', 'ca', 'n’t', 'much', ',', 'either..."
9,t2_4pxpgwz,0,"['otherwise', ',', 'though', ',', '“', 'needin..."


We will of course keep numbers for the model, but having only text in this notebook makes it easy to visualize

In [3]:
remove_int = False

def remove_integers(tokens):
    if any(char.isdigit() for char in tokens):
        return ''
    else:
        return tokens
if remove_int:
    df['tokenized'] = df['tokenized'].apply(remove_integers)
    df = df[df['tokenized'] != '']



In [21]:
df_1 = df[:1]
df_10 = df[:10]
df_100 = df[:100]
df_1000 = df[:1000]

In [22]:
tokens_1 = df_1['tokenized'].to_numpy()
tokens_10 = df_10['tokenized'].to_numpy()
tokens_100 = df_100['tokenized'].to_numpy()
tokens_1000 = df_1000['tokenized'].to_numpy()


## Vectorization
Mostly from: [the holy bible](https://neptune.ai/blog/vectorization-techniques-in-nlp-guide)

### Vectorized representation of first 10 rows using _bag-of-words_

In [23]:
cv = CountVectorizer(ngram_range=(2,2))
x = cv.fit_transform(tokens_10)
print(x.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]]


In [24]:
print(sorted(cv.vocabulary_.keys())[:10])

['00 extroverted', '000 people', '000 pounds', '10 covid', '10 days', '10 minutes', '10 ugh', '10 worse', '100 done', '100 ems']


### TF*iDF


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [26]:
x = tfidf.fit_transform(tokens_100)
vectors = x.T.todense()
print(sorted(zip(
    tfidf.get_feature_names_out(),
    vectors[0].tolist()),
    key=lambda x: x[1], reverse=False))


[('00', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.043517109952789526, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])]


### Word2Vec
[literature](https://arxiv.org/pdf/1301.3781.pdf)
[cool app](https://ronxin.github.io/wevi/)
[huge thanks to this paper](https://www.analyticsvidhya.com/blog/2023/07/step-by-step-guide-to-word2vec-with-gensim/)

In [27]:
from gensim.models import Word2Vec
from collections import defaultdict
from gensim.models.phrases import Phrases, Phraser

In [32]:
sent = [row.split() for row in tokens_1000]

phrases = Phrases(sent, min_count=30, progress_per=10)
bigram = Phraser(phrases)
sentences = bigram[sent]

In [33]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1

df_word_freq = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])
df_word_freq = df_word_freq.sort_values(by='Frequency', ascending=False)

df_word_freq

Unnamed: 0,Word,Frequency
14,"'.',",71174
1,"',',",62255
98,"'""',",10336
7,"'?',",9762
235,"')',",7197
...,...,...
26544,"'obligate',",1
26543,"'sheerly',",1
26540,"['spontaneity',",1
26539,'swing'],1


In [34]:
cores = os.cpu_count()

w2v_model = Word2Vec(
    sg=1,  # 1 for skip-gram, CBOW otherwise
    
    min_count=20,
    window=2,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    workers=cores-1
)

### Some EDA on tokenized words 

In [52]:
w2v_model.build_vocab(sentences)
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    total_words=len(sentences),
    # report_delay=1
    )

(12504545, 30158460)

In [65]:
for index, word in enumerate(w2v_model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(w2v_model.wv.index_to_key)} is {word}")

word #0/5101 is '.',
word #1/5101 is ',',
word #2/5101 is '"',
word #3/5101 is '?',
word #4/5101 is ')',
word #5/5101 is 'like',
word #6/5101 is '-',
word #7/5101 is '...',
word #8/5101 is '(',
word #9/5101 is 'people',


In [61]:
similar_words = w2v_model.wv.most_similar(positive=["like"])
for word, similarity in similar_words:
    print(f"{word}: {similarity}")


KeyError: "Key 'like' not present in vocabulary"