In [9]:
import csv
import numpy as np
import pandas as pd

# Word2vec
from gensim.models import Word2Vec
from gensim.utils import tokenize
from unidecode import unidecode

In [10]:
# Load the training set into a DataFrame.
train_csv_path = "data/train.csv"
train_data = pd.read_csv(train_csv_path)

In [11]:
# Size of each Word2Vec vector; also the number of embedding columns
# allocated per row further down.
n_dim: int = 20

In [12]:
# Turn every training text into a list of tokens using gensim's `tokenize`
# with accent removal (deacc=True) and lowercasing (lower=True).
sentences = []
for text in train_data['text']:
    tokens = list(tokenize(text, deacc=True, lower=True))
    sentences.append(tokens)

# Quick sanity check on the first few tokenized texts.
print(sentences[:5])

[['rt', 'refarcir', 'macron', 'ans', 'nom', 'prepare'], ['populaire'], ['faut', 'degager', 'cingle'], ['enseignants', 'mettre', 'prescriptions', 'president', 'republique', 'faut', 'croire', 'president', 'donner', 'periode', 'compte', 'mesures', 'appliquees', 'grever'], ['mafieuse', 'oppressive', 'macron']]


In [13]:
# Word2Vec hyperparameters, gathered in one place so they are easy to spot.
w2v_params = dict(vector_size=n_dim, window=20, min_count=1, workers=7)

# Create the model, build its vocabulary from the tokenized training texts,
# then train for 5 epochs on those same texts.
model = Word2Vec(**w2v_params)
model.build_vocab(sentences)
# Kept as the last expression so the value returned by `train` is shown as
# the cell output.
model.train(sentences, epochs=5, total_examples=model.corpus_count)

(15074052, 16375635)

In [14]:
# Disabled scratch check of the vocabulary size.
# NOTE(review): `model.wv.vocab` is presumably the pre-gensim-4 attribute; the
# `vector_size=` keyword used when building the model suggests gensim >= 4,
# where `len(model.wv.key_to_index)` would be the equivalent - confirm before
# re-enabling.
# len(model.wv.vocab)

In [15]:
# One zero-initialised row of length n_dim per training example; the rows are
# filled in by accumulating word vectors afterwards.
embeddings = np.zeros(shape=(len(train_data), n_dim))

In [16]:
# Fill `embeddings` row by row: each row is the SUM (not the mean) of the
# Word2Vec vectors of the whitespace-separated tokens of that row's text.
# Rows whose tokens are all out-of-vocabulary stay at zero.
#
# Iterating the column directly avoids one `.loc` scalar lookup per row; the
# position `idx` equals the row label for the default RangeIndex.
#
# NOTE(review): the vocabulary was built from gensim `tokenize(..., deacc=True,
# lower=True)` output, while lookups here use `str.split()` + `unidecode` +
# `lower()`. Tokens that the two normalisations render differently are skipped
# as out-of-vocabulary - confirm this is intended.
for idx, text in enumerate(train_data["text"]):
    for raw_token in text.split():
        token = unidecode(raw_token).lower()
        # Explicit membership test instead of a bare `except: pass`, so only
        # unknown tokens are skipped; genuine errors and KeyboardInterrupt
        # are no longer silently swallowed.
        if token in model.wv:
            embeddings[idx] += model.wv[token]

In [17]:
# Wrap the embedding matrix in a DataFrame (columns are 0..n_dim-1) and place
# it to the right of the original training columns.
embedding_frame = pd.DataFrame(embeddings)
df_new = pd.concat([train_data, embedding_frame], axis=1)

In [18]:
# Visual check of the training frame with the embedding columns appended.
display(df_new)

Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,...,10,11,12,13,14,15,16,17,18,19
0,rt refarcir macron ans nom prépare,3,0,3682,453535,3628,[],[],0,[],...,8.967005,-3.194563,4.728174,-1.347796,4.691279,-0.118773,9.410282,-1.328891,-1.215403,-11.987825
1,populaire,0,0,86,1016,284,[],[],0,[],...,-0.566212,1.024252,-5.599099,-3.118130,1.292250,-3.637425,-0.661477,-2.173531,1.460193,-5.258020
2,faut dégager cinglé,3,1,1944,28234,1995,[],[],0,[],...,6.843688,-2.267870,-1.734141,2.017070,4.783902,5.926421,5.771123,5.315465,-0.347224,-4.215528
3,enseignants mettre prescriptions président rép...,0,0,1,1072,0,[],['https://t.co/rytlted08g'],0,[],...,12.983458,-9.208629,-10.319180,3.815842,9.918651,5.279766,30.952323,-16.280253,-3.417956,-20.056804
4,mafieuse oppressive macron,0,0,13957,25311,10841,[],[],0,[],...,3.423352,-2.245737,1.796024,2.577588,1.309828,0.816804,1.159447,-0.105124,0.324456,-2.448992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353964,gonflette tour raciste frustré,0,0,34,1509,55,[],['https://t.co/pma33zhslx'],0,[],...,9.406568,0.976025,-6.433254,-9.610505,5.474191,6.246784,-2.700480,3.075080,-3.606831,-4.676166
353965,france caste crapuleuse encadrée gangsters irr...,0,0,89,11166,127,[],[],0,[],...,5.745199,-4.585106,0.103678,3.097386,7.551880,0.576369,-2.136248,-5.593320,-0.147425,-5.972623
353966,eric zemmour français berbère,3,0,1888,712,3086,[],[],0,[],...,8.833451,-7.896916,6.051399,-2.089946,11.787698,-3.638973,1.602436,8.457585,-4.419712,6.220350
353967,gauchistes dépression pq,0,0,139,486,320,[],[],0,[],...,4.354414,1.521424,-4.789271,-1.623836,4.442898,8.474722,1.036264,-4.212781,-2.904612,-3.216998


In [19]:
# Write the training frame, including its embedding columns, to CSV.
train_out_path = "data/train_with_embeddings.csv"
df_new.to_csv(train_out_path)

In [20]:
# Load the evaluation set into a DataFrame.
eval_csv_path = "data/evaluation.csv"
eval_data = pd.read_csv(eval_csv_path)

In [21]:
# Rebind `embeddings` to a fresh zero matrix with one row of length n_dim per
# evaluation example.
embeddings = np.zeros(shape=(len(eval_data), n_dim))

In [22]:
# Fill `embeddings` row by row for the evaluation set: each row is the SUM
# (not the mean) of the Word2Vec vectors of the whitespace-separated tokens of
# that row's text. Rows whose tokens are all out-of-vocabulary stay at zero.
#
# Iterating the column directly avoids one `.loc` scalar lookup per row; the
# position `idx` equals the row label for the default RangeIndex.
#
# NOTE(review): lookups use `str.split()` + `unidecode` + `lower()`, whereas the
# model vocabulary came from gensim `tokenize(..., deacc=True, lower=True)` on
# the training texts. Tokens the two normalisations render differently (and any
# word never seen in training) are skipped - confirm this is intended.
for idx, text in enumerate(eval_data["text"]):
    for raw_token in text.split():
        token = unidecode(raw_token).lower()
        # Explicit membership test instead of a bare `except: pass`, so only
        # unknown tokens are skipped; genuine errors and KeyboardInterrupt
        # are no longer silently swallowed.
        if token in model.wv:
            embeddings[idx] += model.wv[token]

In [23]:
# Wrap the evaluation embedding matrix in a DataFrame (columns are 0..n_dim-1)
# and place it to the right of the original evaluation columns.
embedding_frame = pd.DataFrame(embeddings)
df_new = pd.concat([eval_data, embedding_frame], axis=1)

In [24]:
# Visual check of the evaluation frame with the embedding columns appended.
display(df_new)

Unnamed: 0,text,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,...,10,11,12,13,14,15,16,17,18,19
0,descendue pourcentage vote,0,85,4442,327,[],[],0,"['presidentielle2022', 'hidalgo']",1647607994000,...,7.181217,0.309834,-6.332445,-0.038968,-1.221310,1.079026,6.333026,1.601279,-2.306370,-7.510679
1,photo demande arbre généalogique,0,427,33282,443,[],[],0,[],1647601275000,...,-4.077580,-2.140409,-0.264883,-2.373578,1.020362,6.271716,8.287946,3.359548,-0.159418,-2.505842
2,dossiers promis poste gars vraiment fiable,6,1127,13111,1596,[],[],0,['macron'],1646900684000,...,6.489185,-1.099348,-0.596989,-11.313568,2.302437,9.751243,13.379056,-6.036166,-0.021749,-6.416997
3,lol cas faut second tour macron sortir définit...,2,1699,25760,2036,[],['https://t.co/7ox54d8dlm'],0,[],1647161294000,...,39.986729,-17.924932,-14.377242,-4.665369,4.240298,-4.796152,32.309215,17.484720,-11.089073,-39.486376
4,zemmour putain discours propagandiste littéral...,0,249,20718,369,[],"['https://t.co/tsw9cg6v9j', 'https://t.co/t0j9...",0,[],1647724874000,...,18.627066,-13.655215,-8.194701,1.479975,28.361922,36.898717,17.018538,0.464847,-10.626442,-14.024799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117985,balais inexcusable vote veut,0,4,194,64,[],[],0,[],1647103416000,...,7.677557,-0.913958,-2.305262,0.254968,4.842157,2.764827,8.502166,4.209280,0.309728,-9.193544
117986,balayant rap borloo mâles blcs commis faute fo...,0,744,39489,894,[],['https://t.co/im5cs5ur1a'],0,[],1647490899000,...,18.543763,-8.398313,-3.675114,-14.704332,13.917856,1.275444,9.692873,-17.828914,-0.809310,-15.058056
117987,triste réalité covid tue malades macron veille...,12,924,2270,159,[],[],0,[],1647522384000,...,22.999557,-16.646468,4.989533,4.347392,14.047281,8.400789,6.236346,-9.535482,-16.788091,-11.952025
117988,excellente idée pouvez réussir démarche crise ...,0,5,4,80,[],['https://t.co/xontgm06pb'],0,[],1647607262000,...,8.774021,-10.674966,-3.177897,-1.720304,21.019253,5.053758,18.258945,-13.476447,-13.404828,-14.163605


In [25]:
# Write the evaluation frame, including its embedding columns, to CSV.
eval_out_path = "data/evaluation_with_embeddings.csv"
df_new.to_csv(eval_out_path)