In [186]:
import csv
import numpy as np
import pandas as pd

import data
# Word2vec
from gensim.models import Word2Vec
from gensim.utils import tokenize
from unidecode import unidecode

In [187]:
# Load the training set; its "text" column is used both to train Word2Vec
# and to build the per-row embeddings further down.
train_data = pd.read_csv("data/train.csv")

In [188]:
# Embedding dimension: passed to Word2Vec as vector_size and used as the
# column count of the per-row embedding matrices.
n_dim = 64

In [189]:
# Build the Word2Vec corpus: one token list per training text, tokenised by
# gensim with accents removed (deacc) and everything lowercased.
sentences = []
for text in train_data['text']:
    sentences.append(list(tokenize(text, deacc=True, lower=True)))
print(sentences[:5])

[['rt', 'refarcir', 'macron', 'ans', 'nom', 'prepare'], ['populaire'], ['faut', 'degager', 'cingle'], ['enseignants', 'mettre', 'prescriptions', 'president', 'republique', 'faut', 'croire', 'president', 'donner', 'periode', 'compte', 'mesures', 'appliquees', 'grever'], ['mafieuse', 'oppressive', 'macron']]


In [190]:
# Create an untrained Word2Vec model, then build its vocabulary and train it
# explicitly on the tokenised training texts.
model = Word2Vec(
    vector_size=n_dim,
    window=5,
    min_count=3,  # words seen fewer than 3 times get no vector
    workers=7,
)
model.build_vocab(corpus_iterable=sentences)
# Kept as the last expression so the cell still shows train()'s return value.
model.train(
    corpus_iterable=sentences,
    total_examples=model.corpus_count,
    epochs=30,
)

(88188946, 98253810)

In [191]:
# Accumulator with one row per training example and n_dim columns; word
# vectors are summed into it row by row.
embeddings = np.zeros((train_data.shape[0], n_dim))

In [192]:
# Bookkeeping for the embedding loop:
#   found     - tokens whose vector was added to a row
#   not_found - tokens whose vector lookup failed
#   usage     - per row, (number of tokens embedded, total number of tokens)
found, not_found = [], []
usage = []

In [193]:
# Sum the Word2Vec vectors of each training text's tokens into one row.
# The accumulators are (re)initialised here so that re-running this cell
# cannot add the vectors a second time or grow the bookkeeping lists.
embeddings = np.zeros((train_data.shape[0], n_dim))
found, not_found = [], []
usage = []

# Iterating the column directly avoids a .loc scalar lookup per row.
for idx, text in enumerate(train_data["text"]):
    tokens = text.split()
    curr_usage = 0
    for s in tokens:
        # Tokens shorter than 3 characters are ignored entirely.
        if len(s) < 3:
            continue
        try:
            vector = model.wv[unidecode(s).lower()]
        except KeyError:
            # Token is not in the Word2Vec vocabulary.
            not_found.append(s)
            continue
        embeddings[idx] += vector
        found.append(s)
        curr_usage += 1
    # (tokens that contributed a vector, total whitespace-separated tokens)
    usage.append((curr_usage, len(tokens)))

In [194]:
# Peek at the first 100 embedded tokens and the first 100 failed lookups.
print(found[:100], not_found[:100])

['macron', 'ans', 'nom', 'prépare', 'populaire', 'faut', 'dégager', 'cinglé', 'enseignants', 'mettre', 'prescriptions', 'président', 'république', 'faut', 'croire', 'président', 'donner', 'période', 'compte', 'mesures', 'appliquées', 'grever', 'mafieuse', 'oppressive', 'macron', 'équipe', 'campagne', 'macron', 'prie', 'soutien', 'sarko', 'bizarre', 'bizarre', 'cousin', 'sensés', 'dems', 'votent', 'pauvre', 'envoies', 'macron', 'chameau', 'cheval', 'dessiné', 'commission', 'emmanuel', 'macron', 'annule', 'venue', 'soirée', 'électorale', 'bfmtv', 'chaîne', 'prend', 'décision', 'radicale', 'violer', 'pauvre', 'ukrainienne', 'africains', 'düsseldorf', 'petites', 'minutes', 'temps', 'temps', 'bfm', 'heures', 'offre', 'marcheurs', 'roi', 'connard', 'dégoûte', 'poutou', 'continuer', 'battre', 'gonfler', 'intentions', 'vote', 'quinquennat', 'macron', 'série', 'ininterrompue', 'plans', 'com', 'apparence', 'passe', 'melenchon', 'jamais', 'insulter', 'macron', 'publiquement', 'plateau', 'remarque

In [195]:
# Token counts: first line is embedded tokens, second is failed lookups.
print(len(found))
print(len(not_found))

3094170
70814


In [196]:
# (embedded tokens, total tokens) for the first 100 training rows.
print(usage[:100])

[(4, 6), (1, 1), (3, 3), (14, 14), (3, 3), (13, 13), (6, 6), (11, 11), (5, 6), (14, 14), (3, 3), (7, 9), (2, 2), (17, 19), (6, 7), (10, 10), (4, 4), (2, 2), (8, 8), (8, 8), (4, 4), (4, 4), (4, 5), (16, 16), (18, 19), (13, 13), (18, 18), (6, 6), (10, 10), (19, 19), (4, 4), (2, 2), (9, 12), (12, 14), (8, 8), (11, 11), (13, 13), (13, 13), (8, 8), (6, 7), (7, 7), (18, 18), (4, 4), (7, 7), (14, 14), (14, 15), (7, 7), (12, 12), (8, 10), (3, 3), (5, 5), (19, 19), (11, 11), (6, 7), (1, 1), (16, 18), (3, 3), (9, 13), (4, 4), (12, 12), (12, 12), (6, 6), (6, 7), (8, 8), (1, 1), (1, 2), (2, 2), (3, 4), (12, 13), (13, 14), (4, 4), (5, 5), (4, 7), (10, 10), (5, 5), (8, 9), (5, 5), (17, 17), (8, 9), (11, 11), (8, 8), (3, 3), (4, 4), (3, 4), (18, 18), (5, 6), (9, 9), (6, 8), (4, 4), (13, 13), (6, 7), (6, 7), (7, 7), (14, 15), (4, 4), (16, 16), (13, 14), (4, 4), (2, 3), (6, 6)]


In [197]:
# Append the n_dim embedding columns to the right of the original training
# columns (axis=1).
df_new = pd.concat([train_data, pd.DataFrame(embeddings)], axis=1)

In [198]:
# Show the combined frame: original columns followed by the embedding columns.
display(df_new)

Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,...,54,55,56,57,58,59,60,61,62,63
0,rt refarcir macron ans nom prépare,3,0,3682,453535,3628,[],[],0,[],...,1.477375,-2.619478,2.379451,-2.100691,-2.053618,5.765649,4.045572,0.943415,0.226367,1.516840
1,populaire,0,0,86,1016,284,[],[],0,[],...,-1.561968,-1.053356,0.227053,1.417470,-0.322939,-0.043318,-0.622173,0.363898,-0.392036,0.406479
2,faut dégager cinglé,3,1,1944,28234,1995,[],[],0,[],...,-0.898627,1.418166,2.022078,3.905925,1.718834,-1.769465,1.552497,1.931623,-4.257294,2.926098
3,enseignants mettre prescriptions président rép...,0,0,1,1072,0,[],['https://t.co/rytlted08g'],0,[],...,-2.387572,-18.162987,-7.080737,6.985964,1.799002,8.261959,8.254217,10.174779,-10.607723,3.600731
4,mafieuse oppressive macron,0,0,13957,25311,10841,[],[],0,[],...,-0.193948,0.149649,-0.171341,-1.353654,-1.661343,0.647350,1.451610,0.256563,-1.216950,0.562472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353964,gonflette tour raciste frustré,0,0,34,1509,55,[],['https://t.co/pma33zhslx'],0,[],...,-3.957497,-5.173093,6.045044,2.488039,-0.561643,-3.089756,3.169811,-1.183444,-3.719602,3.344714
353965,france caste crapuleuse encadrée gangsters irr...,0,0,89,11166,127,[],[],0,[],...,3.630719,3.071452,-6.633868,-0.712121,-2.596358,3.378789,3.095215,2.306328,-2.475461,0.893007
353966,eric zemmour français berbère,3,0,1888,712,3086,[],[],0,[],...,-3.145975,-5.781182,-0.282523,-0.471046,1.176291,0.077825,4.831985,-2.863747,1.072655,-0.303862
353967,gauchistes dépression pq,0,0,139,486,320,[],[],0,[],...,-1.216523,2.407117,-0.331728,1.456821,2.062268,0.621411,-0.881871,0.934669,-0.486399,-0.165880


In [199]:
# Output path for the training set with embedding columns.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
file_path = "data/train_with_embeddings.csv"

In [200]:
# Write the training frame with its embedding columns to file_path.
# NOTE: to_csv defaults to index=True, so the row index is written as a
# leading unnamed column as well.
df_new.to_csv(file_path)

In [201]:
# Load the evaluation set; its "text" column is embedded with the Word2Vec
# model trained above.
eval_data = pd.read_csv("data/evaluation.csv")

In [202]:
# Fresh accumulator with one row per evaluation example. This rebinds
# `embeddings`, replacing the training matrix built earlier.
embeddings = np.zeros((eval_data.shape[0], n_dim))

In [203]:
# Sum the Word2Vec vectors of each evaluation text's tokens into one row.
# The accumulator is reset here so that re-running this cell cannot add the
# vectors a second time.
embeddings = np.zeros((eval_data.shape[0], n_dim))

# Iterating the column directly avoids a .loc scalar lookup per row.
for idx, text in enumerate(eval_data["text"]):
    for s in text.split():
        # Same length filter as the training loop (tokens shorter than 3
        # characters are ignored), so both sets get comparable features.
        if len(s) < 3:
            continue
        try:
            embeddings[idx] += model.wv[unidecode(s).lower()]
        except KeyError:
            # Token is not in the Word2Vec vocabulary: it contributes nothing.
            continue

In [204]:
# Append the n_dim embedding columns to the right of the evaluation columns.
# This rebinds df_new, replacing the training frame.
df_new = pd.concat([eval_data, pd.DataFrame(embeddings)], axis=1)

In [205]:
# Show the combined evaluation frame: original columns followed by the
# embedding columns.
display(df_new)

Unnamed: 0,text,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,...,54,55,56,57,58,59,60,61,62,63
0,descendue pourcentage vote,0,85,4442,327,[],[],0,"['presidentielle2022', 'hidalgo']",1647607994000,...,-4.936539,-2.252889,1.546269,2.336316,-4.728228,1.464929,2.274205,-0.609707,-1.210787,3.147254
1,photo demande arbre généalogique,0,427,33282,443,[],[],0,[],1647601275000,...,-2.244986,-0.859784,3.453660,-2.098631,-1.114366,-1.090945,-1.754593,-1.415446,4.714002,-0.078210
2,dossiers promis poste gars vraiment fiable,6,1127,13111,1596,[],[],0,['macron'],1646900684000,...,0.785001,-6.398176,2.500649,1.689417,1.548272,-2.272700,1.728426,2.205752,-4.645946,0.001023
3,lol cas faut second tour macron sortir définit...,2,1699,25760,2036,[],['https://t.co/7ox54d8dlm'],0,[],1647161294000,...,-19.472089,-9.797417,10.106725,10.896217,-23.997370,3.380084,3.852576,-2.348574,-1.046452,4.525420
4,zemmour putain discours propagandiste littéral...,0,249,20718,369,[],"['https://t.co/tsw9cg6v9j', 'https://t.co/t0j9...",0,[],1647724874000,...,2.101488,-3.720692,14.217625,3.902875,11.231026,4.524144,3.303629,5.939966,-12.227510,13.097947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117985,balais inexcusable vote veut,0,4,194,64,[],[],0,[],1647103416000,...,-3.272253,-0.594626,3.145165,1.599861,-1.518942,2.240286,0.610335,2.336141,-1.096964,6.024372
117986,balayant rap borloo mâles blcs commis faute fo...,0,744,39489,894,[],['https://t.co/im5cs5ur1a'],0,[],1647490899000,...,-2.681604,1.479559,-3.693645,0.116239,4.204256,-2.152952,2.661536,0.069991,-8.975336,9.960458
117987,triste réalité covid tue malades macron veille...,12,924,2270,159,[],[],0,[],1647522384000,...,0.425202,-2.737566,1.363717,-5.270310,-1.445753,10.096212,2.660169,-5.796501,-6.049362,-1.261608
117988,excellente idée pouvez réussir démarche crise ...,0,5,4,80,[],['https://t.co/xontgm06pb'],0,[],1647607262000,...,0.913784,7.479163,-13.498572,3.931908,-0.286913,-2.445809,-8.773067,-7.044116,-4.967737,-2.193959


In [206]:
# Output path for the evaluation set with embedding columns.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
file_path = "data/evaluation_with_embeddings.csv"

In [None]:
# Write the evaluation frame with its embedding columns to file_path.
# NOTE: to_csv defaults to index=True, so the row index is written as a
# leading unnamed column as well.
df_new.to_csv(file_path)