In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import (CountVectorizer)

In [2]:
df = pd.read_csv('../data/tokenized_extrovert.csv', engine='pyarrow')
df.head(10)

Unnamed: 0,author_id,label,tokens
0,t2_2hrxxs28,0,"['question', ',', 'doctor', ',', 'how', ""'d"", ..."
1,t2_2hrxxs28,0,"['butt', 'covid', '+', 'cycle', '.', 'i', ""'m""..."
2,t2_2hrxxs28,0,"['different', 'doctors', '.', 'situation', 'su..."
3,t2_4pxpgwz,0,"['thought', 'pebbleyeet', 'guy', 'autistic', '..."
4,t2_4pxpgwz,0,"['…', 'i', 'always', 'end', 'voting', 'wrong',..."
5,t2_4pxpgwz,0,"['made', 'feel', 'lot', 'better', '.', 'ooh', ..."
6,t2_4pxpgwz,0,"['mouth', ',', 'you', '’d', 'panic', 'attack',..."
7,t2_4pxpgwz,0,"['did', 'nt', 'read', 'top', 'half', 'bc', 'cr..."
8,t2_4pxpgwz,0,"['hot', '?', 'ca', 'n’t', 'much', ',', 'either..."
9,t2_4pxpgwz,0,"['otherwise', ',', 'though', ',', '“', 'needin..."


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40452 entries, 0 to 40451
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   author_id  40452 non-null  object
 1   label      40452 non-null  int64 
 2   tokens     40452 non-null  object
dtypes: int64(1), object(2)
memory usage: 948.2+ KB


We will of course keep numbers for the model, but having only text in this notebook makes it easy to visualize

In [3]:
remove_int = False

def remove_integers(tokens):
    if any(char.isdigit() for char in tokens):
        return ''
    else:
        return tokens
if remove_int:
    df['tokens'] = df['tokens'].apply(remove_integers)
    df = df[df['tokens'] != '']



In [4]:
df_1 = df[:1]
df_10 = df[:10]
df_100 = df[:100]
df_1000 = df[:1000]

In [6]:
tokens_1 = df_1['tokens'].to_numpy()
tokens_10 = df_10['tokens'].to_numpy()
tokens_100 = df_100['tokens'].to_numpy()
tokens_1000 = df_1000['tokens'].to_numpy()


## Vectorization
Mostly from: [the holy bible](https://neptune.ai/blog/vectorization-techniques-in-nlp-guide)

### Vectorized representation of first 10 rows using _bag-of-words_

In [7]:
cv = CountVectorizer(ngram_range=(2,2))
x = cv.fit_transform(tokens_10)
print(x.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]]


In [8]:
print(sorted(cv.vocabulary_.keys())[:10])

['00 extroverted', '000 people', '000 pounds', '10 covid', '10 days', '10 minutes', '10 ugh', '10 worse', '100 done', '100 ems']


### TF*iDF


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import dask.dataframe as dd
import dask.array as da
from dask_ml.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

ImportError: cannot import name 'TfidfVectorizer' from 'dask_ml.feature_extraction.text' (C:\Users\atabekis\AppData\Local\Programs\Python\Python311\Lib\site-packages\dask_ml\feature_extraction\text.py)

In [28]:
df_cleaned = pd.read_csv('../data/cleaned_extrovert.csv', engine='pyarrow')
ddf = dd.from_pandas(df_cleaned, npartitions=1000)
ddf.head(10)

Unnamed: 0,author_id,post,label
0,t2_2hrxxs28,"question, doctor, how'd get painkillers? otc p...",0
1,t2_2hrxxs28,butt covid + cycle. i'm sure what's going i've...,0
2,t2_2hrxxs28,different doctors. situation sucks relate peop...,0
3,t2_4pxpgwz,thought pebbleyeet guy autistic guy wants “fix...,0
4,t2_4pxpgwz,…i always end voting wrong even crewmate. hour...,0
5,t2_4pxpgwz,made feel lot better. ooh yikes half comments ...,0
6,t2_4pxpgwz,"mouth, you’d panic attack whenever tried eat n...",0
7,t2_4pxpgwz,"didnt read top half bc cropped off, thank much...",0
8,t2_4pxpgwz,"hot? can’t much, either strip nude rip skin gu...",0
9,t2_4pxpgwz,"otherwise, though, “needing” masturbation thin...",0


In [29]:
tfidf_matrix = tfidf.fit_transform(ddf['post'])
tfidf_array = da.compute(tfidf_matrix.toarray())

MemoryError: Unable to allocate 154. GiB for an array with shape (40452, 512176) and data type float64

In [None]:
tfidf_array

### Word2Vec
[literature](https://arxiv.org/pdf/1301.3781.pdf)
[cool app](https://ronxin.github.io/wevi/)
[huge thanks to this paper](https://www.analyticsvidhya.com/blog/2023/07/step-by-step-guide-to-word2vec-with-gensim/)

In [None]:
from gensim.models import Word2Vec
from collections import defaultdict
from gensim.models.phrases import Phrases, Phraser

In [None]:
sent = [row.split() for row in tokens_1000]

phrases = Phrases(sent, min_count=30, progress_per=10)
bigram = Phraser(phrases)
sentences = bigram[sent]

In [None]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1

df_word_freq = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])
df_word_freq = df_word_freq.sort_values(by='Frequency', ascending=False)

df_word_freq

In [None]:
cores = os.cpu_count()

w2v_model = Word2Vec(
    sg=1,  # 1 for skip-gram, CBOW otherwise
    
    min_count=20,
    window=2,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    workers=cores-1
)

### Some EDA on tokenized words 

In [None]:
w2v_model.build_vocab(sentences)
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    total_words=len(sentences),
    # report_delay=1
    )

In [None]:
for index, word in enumerate(w2v_model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(w2v_model.wv.index_to_key)} is {word}")

In [None]:
similar_words = w2v_model.wv.most_similar(positive=["like"])
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
