In [1]:
import plotly.express as px
import pandas as pd

from collections import defaultdict

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the preprocessed CSV; the "clean_text" and "score" columns are used below.
alldf = pd.read_csv("../data/prep/clean_ruddit.csv")

# Whitespace-tokenize each document and keep the scores as a parallel list.
tokenized_docs = alldf["clean_text"].str.split()
raw_X = list(tokenized_docs.values)
y = list(alldf["score"].values)

In [4]:
def _passthrough(tokens):
    """Return the input unchanged (used as a no-op preprocessor/tokenizer)."""
    return tokens


# raw_X already holds token lists, so the vectorizer is told not to preprocess
# or tokenize again: both hooks are no-ops and the token regex is disabled.
tfidf = TfidfVectorizer(
    analyzer="word",
    preprocessor=_passthrough,
    tokenizer=_passthrough,
    token_pattern=None,
)
X = tfidf.fit_transform(raw_X)

In [5]:
# Densify the TF-IDF matrix so it can be wrapped in a DataFrame.
# `toarray()` yields a plain ndarray; `todense()` would return a `numpy.matrix`,
# a class NumPy no longer recommends using.
A = X.toarray()
print(A.shape)

# One column per vocabulary index (integer labels), plus the target as "y".
df = pd.DataFrame(A)
df["y"] = y

(5708, 11416)


In [6]:
# Unigram counts: tally every whitespace-separated token across all documents.
unigram_count = defaultdict(int)
for document in alldf["clean_text"]:
    for word in document.split():
        unigram_count[word] += 1

# Sort ascending by count and then flip, so the most frequent words come first.
ranked_unigrams = sorted(unigram_count.items(), key=lambda pair: pair[1])
ranked_unigrams.reverse()
top20_unigrams = pd.DataFrame(ranked_unigrams, columns=["word", "count"]).head(20)
top20_unigrams.head()

Unnamed: 0,word,count
0,i,5748
1,not,4027
2,people,1064
3,like,915
4,think,727


In [None]:
# Horizontal bar chart of the top-20 unigram counts, every bar in one colour.
bar_color = "#0a7347"
unigrams_by_count = top20_unigrams.sort_values(by="count")

fig = px.bar(
    unigrams_by_count,
    x="count",
    y="word",
    orientation="h",
    # One colour string per row; "identity" tells plotly to use them literally.
    color=[bar_color] * len(top20_unigrams),
    color_discrete_map="identity",
)
fig.update_layout(title="Top 20 unigramas con mayor ocurrencia")
fig.show("svg")

In [7]:
# Look up the TF-IDF column index of each top word.
top20_vocab = {word: tfidf.vocabulary_[word] for word in top20_unigrams["word"]}

# Select those feature columns plus the target, then relabel index -> word.
index_to_word = {column: word for word, column in top20_vocab.items()}
selected_columns = list(top20_vocab.values()) + ["y"]
top20_X = df[selected_columns].rename(columns=index_to_word)

In [10]:
# Correlation of the first ten columns with "y", shown as a single row.
top20_X.corr()[["y"]].iloc[:10].T

Unnamed: 0,i,not,people,like,think,!,want,know,time,thing
y,-0.033363,0.066331,0.084843,0.052157,0.012892,-0.018407,0.069977,0.029491,-0.01766,0.001997


In [11]:
# Correlation of the remaining columns with "y" (the last entry is "y" itself).
top20_X.corr()[["y"]].iloc[10:].T

Unnamed: 0,good,use,woman,work,need,right,way,man,fuck,mean,y
y,-0.061653,0.017541,0.071359,0.011259,0.017207,0.030708,-0.010798,0.036538,0.429701,0.012508,1.0


In [12]:
from scipy.stats import pearsonr

# Significance test for the correlation between the "fuck" feature and "y".
# The statistic gets a real name rather than `_`: assigning to `_` in IPython
# shadows the automatic last-output variable for the rest of the session.
correlation, pvalue = pearsonr(top20_X["fuck"], top20_X["y"])

print(pvalue)  # below 5% in the recorded run -> statistically significant -> reject the null hypothesis

2.9238345091576956e-255
