In [19]:
import csv
import numpy as np
import pandas as pd

# Word2vec
from gensim.models import Word2Vec
from gensim.utils import tokenize
from unidecode import unidecode

In [20]:
train_data = pd.read_csv("data/train.csv")

In [21]:
# embedding dimension
n_dim = 256
pca_dim = 8

In [22]:
sentences = [list(tokenize(s, deacc=True, lower=True)) for s in train_data['text']]
print(sentences[:5])

[['rt', 'refarcir', 'macron', 'ans', 'nom', 'prepare'], ['populaire'], ['faut', 'degager', 'cingle'], ['enseignants', 'mettre', 'prescriptions', 'president', 'republique', 'faut', 'croire', 'president', 'donner', 'periode', 'compte', 'mesures', 'appliquees', 'grever'], ['mafieuse', 'oppressive', 'macron']]


In [23]:
model = Word2Vec(vector_size=n_dim, window=20, min_count=1, workers=7)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=5)

(15072681, 16375635)

In [24]:
# len(model.wv.vocab)

In [25]:
embeddings = np.zeros((train_data.shape[0], n_dim))

In [26]:
for idx in range(train_data.shape[0]):
    for s in train_data.loc[idx, "text"].split():
        try:
            embeddings[idx] += model.wv[unidecode(s).lower()]
        except:
            pass
            # print(idx, unidecode(s).lower())

In [27]:
from sklearn.decomposition import PCA

In [28]:
my_pca = PCA(n_components = pca_dim)
vecs_pca = my_pca.fit_transform(embeddings)
print(vecs_pca.shape)

(353969, 10)


In [29]:
df_new = pd.concat([train_data, pd.DataFrame(vecs_pca)], axis=1)

In [30]:
display(df_new)

Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,...,0,1,2,3,4,5,6,7,8,9
0,rt refarcir macron ans nom prépare,3,0,3682,453535,3628,[],[],0,[],...,-6.433186,-7.046125,-7.159528,-1.653331,-11.077375,-1.229355,0.705162,-0.166519,0.797443,-6.801293
1,populaire,0,0,86,1016,284,[],[],0,[],...,-26.898735,-2.887474,-2.951327,-7.416482,1.023093,6.670186,5.532475,-2.162911,-6.012593,-0.723121
2,faut dégager cinglé,3,1,1944,28234,1995,[],[],0,[],...,-19.077771,-0.589066,-10.228227,4.455620,-3.299817,3.510749,2.868523,-1.908120,1.751114,-0.606858
3,enseignants mettre prescriptions président rép...,0,0,1,1072,0,[],['https://t.co/rytlted08g'],0,[],...,34.445805,-18.268263,-12.143657,-15.818929,1.101041,-0.312006,-1.004276,-12.849099,20.581952,-3.561096
4,mafieuse oppressive macron,0,0,13957,25311,10841,[],[],0,[],...,-22.729922,-5.165031,-5.190862,1.973742,-10.053071,3.023001,1.149006,-1.654998,-2.808160,-2.617465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353964,gonflette tour raciste frustré,0,0,34,1509,55,[],['https://t.co/pma33zhslx'],0,[],...,-23.200567,9.168013,-1.354596,3.259399,4.829057,5.962724,4.328717,2.454035,-2.711641,-1.759709
353965,france caste crapuleuse encadrée gangsters irr...,0,0,89,11166,127,[],[],0,[],...,-16.762759,-8.291523,-4.836682,2.071300,-1.471003,1.080635,10.394096,-4.117511,-6.895246,-0.970865
353966,eric zemmour français berbère,3,0,1888,712,3086,[],[],0,[],...,-14.355055,5.672827,13.320000,9.826197,-5.599960,-7.178056,23.651970,2.343744,-1.274978,5.171941
353967,gauchistes dépression pq,0,0,139,486,320,[],[],0,[],...,-27.461337,-0.875409,-6.659606,1.953106,5.654577,1.341887,0.404597,0.196824,-1.858019,3.036971


In [31]:
df_new.to_csv("data/train_with_embeddings_pca.csv")

In [32]:
eval_data = pd.read_csv("data/evaluation.csv")

In [33]:
embeddings = np.zeros((eval_data.shape[0], n_dim))

In [34]:
for idx in range(eval_data.shape[0]):
    for s in eval_data.loc[idx, "text"].split():
        try:
            embeddings[idx] += model.wv[unidecode(s).lower()]
        except:
            pass
            # print(idx, unidecode(s).lower())

In [35]:
vecs_pca = my_pca.transform(embeddings)

In [36]:
df_new = pd.concat([eval_data, pd.DataFrame(vecs_pca)], axis=1)

In [37]:
display(df_new)

Unnamed: 0,text,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,...,0,1,2,3,4,5,6,7,8,9
0,descendue pourcentage vote,0,85,4442,327,[],[],0,"['presidentielle2022', 'hidalgo']",1647607994000,...,-25.106810,1.642175,-7.510906,-6.862657,-0.315281,9.106791,4.470821,1.024795,3.523822,-3.820713
1,photo demande arbre généalogique,0,427,33282,443,[],[],0,[],1647601275000,...,-24.642367,-9.445896,0.705949,0.956864,-0.405804,-0.002607,-4.994268,3.059718,3.005137,-2.925789
2,dossiers promis poste gars vraiment fiable,6,1127,13111,1596,[],[],0,['macron'],1646900684000,...,-15.327828,-1.313244,-5.665031,-3.315039,-1.087454,-6.084421,-9.854659,1.723075,6.546258,10.002547
3,lol cas faut second tour macron sortir définit...,2,1699,25760,2036,[],['https://t.co/7ox54d8dlm'],0,[],1647161294000,...,32.895415,38.564670,-12.181399,-9.166162,-9.917518,28.113929,1.505811,18.973443,19.835033,4.191530
4,zemmour putain discours propagandiste littéral...,0,249,20718,369,[],"['https://t.co/tsw9cg6v9j', 'https://t.co/t0j9...",0,[],1647724874000,...,21.163118,6.656253,-7.623919,28.592330,18.526472,-13.873669,-11.636706,8.963390,-2.977745,-10.613889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117985,balais inexcusable vote veut,0,4,194,64,[],[],0,[],1647103416000,...,-18.703672,1.368985,-9.432679,-3.394513,-1.689316,4.595281,5.907068,-0.183161,1.271645,-2.590341
117986,balayant rap borloo mâles blcs commis faute fo...,0,744,39489,894,[],['https://t.co/im5cs5ur1a'],0,[],1647490899000,...,8.424112,-6.251546,-1.385298,-2.229046,10.937421,0.669838,1.689008,-9.370370,8.551069,12.006647
117987,triste réalité covid tue malades macron veille...,12,924,2270,159,[],[],0,[],1647522384000,...,6.137757,-9.768452,-15.948746,11.770413,-6.099256,-4.453856,-0.184408,-2.623170,5.321670,-0.365128
117988,excellente idée pouvez réussir démarche crise ...,0,5,4,80,[],['https://t.co/xontgm06pb'],0,[],1647607262000,...,25.095558,-18.905119,5.670245,-2.468355,5.604290,4.139388,-1.510084,-13.805993,7.571481,6.076165


In [38]:
df_new.to_csv("data/evaluation_with_embeddings_pca.csv")