***
# Nettoyage des donn√©es
***

In [39]:
# ====== Importer les library ====== #
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [40]:
# ====== D√©terminer les path ====== #
import os
cwd = os.getcwd()
parent = os.path.dirname(cwd)

data_path = parent + '\\data\\'

In [41]:
# ====== Importer les df ====== #
comments = pd.read_csv(data_path + 'Comments.csv')
posts = pd.read_csv(data_path + 'Posts.csv')

comments.head(2)

Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,created_time,id,like_count,message,parent,postId
0,0.102216,0.651296,0.664565,0.350583,0.029933,0.588517,0,2023-06-02T16:06:59+0000,722477749883613_788168749338745,0,"Faux, ce ne sera jamais le temps pour lui car ...",,192978590727638_722477749883613
1,0.005476,0.167515,0.255499,0.006981,0.005647,0.176317,1,2023-06-02T16:21:43+0000,722477749883613_1593974021087773,1,Mon dieu!! On nous prend s√©rieusement pour des...,,192978590727638_722477749883613


In [95]:
# ====== Merge les df ====== #
df_merge = pd.merge(comments, posts, left_on='postId', right_on='id', indicator=True, suffixes=('_comments', '_posts'))
df_merge.head(2)


# Analyse du merge 
unique_list = []
for value in df_merge['_merge'].unique():
    unique_list.append(value)

if (len(unique_list) == 1) & (unique_list[0] == 'both'):
    print('Tous les lignes merge')
else :
    print('Certaines lignes ne sont pas merge')


df_merge = df_merge.drop(columns='_merge')

Tous les lignes merge


***
### Compl√©ter les valeurs manquantes
***

In [34]:
# ====== Visualisation des valeurs manquantes ====== #
print('Missing value')
print('*'*25)
print(f'Nombre de lignes : {len(df_merge)}')
for col in df_merge.columns.to_list():
    nb_na = df_merge[col].isna().sum()
    perc = round(df_merge[col].isna().sum()/len(df_merge)*100)
    print(f'{col.ljust(21)} : {str(nb_na).rjust(11)} ({perc}%)')

Missing value
*************************
Nombre de lignes : 935698
IDENTITY_ATTACK       :           0 (0%)
INSULT                :           0 (0%)
PROFANITY             :           0 (0%)
SEVERE_TOXICITY       :           0 (0%)
THREAT                :           0 (0%)
TOXICITY              :           0 (0%)
comment_count         :           0 (0%)
id_comments           :           0 (0%)
like_count            :           0 (0%)
message_comments      :           0 (0%)
parent                :      459661 (49%)
postId                :           0 (0%)
attachments.data      :          35 (0%)
id_posts              :           0 (0%)
mainTopic             :        8481 (1%)
message_posts         :       10158 (1%)
permalink_url         :           0 (0%)
secondTopic           :       67076 (7%)
shares                :           0 (0%)
title                 :        8457 (1%)
_merge                :           0 (0%)
comment_time          :           0 (0%)
post_time             :        

In [96]:
# ====== convertir le temps en format datetime ====== #
df_merge['comment_time'] = pd.to_datetime(df_merge['created_time_comments'])
df_merge['post_time'] = pd.to_datetime(df_merge['created_time_posts'])
df_merge = df_merge.drop(columns=['created_time_comments', 'created_time_posts'])
df_merge.head(2)

Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,id_comments,like_count,message_comments,parent,postId,attachments.data,id_posts,mainTopic,message_posts,permalink_url,secondTopic,shares,title,comment_time,post_time
0,0.102216,0.651296,0.664565,0.350583,0.029933,0.588517,0,722477749883613_788168749338745,0,"Faux, ce ne sera jamais le temps pour lui car ...",,192978590727638_722477749883613,"[{""title"":""Les incoh√©rences de l‚Äôing√©rence"",""u...",192978590727638_722477749883613,composer-preview,H√âL√àNE BUZZETTI / Jagmeet Singh a soutenu que ...,https://www.facebook.com/553956616735728/posts...,7FJ4TUHKEFEXFIZI6DY2WAQE4E,0,Les incoh√©rences de l‚Äôing√©rence,2023-06-02 16:06:59+00:00,2023-06-02 16:03:24+00:00
1,0.005476,0.167515,0.255499,0.006981,0.005647,0.176317,1,722477749883613_1593974021087773,1,Mon dieu!! On nous prend s√©rieusement pour des...,,192978590727638_722477749883613,"[{""title"":""Les incoh√©rences de l‚Äôing√©rence"",""u...",192978590727638_722477749883613,composer-preview,H√âL√àNE BUZZETTI / Jagmeet Singh a soutenu que ...,https://www.facebook.com/553956616735728/posts...,7FJ4TUHKEFEXFIZI6DY2WAQE4E,0,Les incoh√©rences de l‚Äôing√©rence,2023-06-02 16:21:43+00:00,2023-06-02 16:03:24+00:00


In [97]:
# ====== Ajout d'information en lien avec l'attribut parent ====== #

# L'attribut prent indique le id du commentaire auquel ce commentaire est r√©pondu.
# Les valeurs manquantes indiquent que le commentaire r√©pond √† la publication originale.
# Possibilit√© d'int√©grer de l'info du commentaire r√©pondu

# Ajout de colone binaire pour la r√©ponse √† un commentaire ou a la publication originale
df_merge['reponse_a_commentaire'] = np.where(df_merge['parent'].notna(), 1, 0)
df_merge['reponse_a_publication'] = np.where(df_merge['parent'].isna(), 1, 0)

# Ajout des indices du commentaire parent
cols = ['IDENTITY_ATTACK', 'INSULT', 'PROFANITY', 'SEVERE_TOXICITY', 'THREAT', 'TOXICITY', 'message_comments', 'id_comments']
df_merge = pd.merge(df_merge, df_merge[cols], left_on='parent', right_on='id_comments', suffixes=('', '_parent'))

df_merge.head(2)

Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,id_comments,like_count,message_comments,parent,postId,attachments.data,id_posts,mainTopic,message_posts,permalink_url,secondTopic,shares,title,comment_time,post_time,reponse_a_commentaire,reponse_a_publication,IDENTITY_ATTACK_parent,INSULT_parent,PROFANITY_parent,SEVERE_TOXICITY_parent,THREAT_parent,TOXICITY_parent,message_comments_parent,id_comments_parent
0,0.029144,0.269578,0.016753,0.006104,0.005864,0.246033,0,722477749883613_779062990381463,0,"Denise Morais ..Oui et avec raison ! (Car ""nou...",722477749883613_1593974021087773,192978590727638_722477749883613,"[{""title"":""Les incoh√©rences de l‚Äôing√©rence"",""u...",192978590727638_722477749883613,composer-preview,H√âL√àNE BUZZETTI / Jagmeet Singh a soutenu que ...,https://www.facebook.com/553956616735728/posts...,7FJ4TUHKEFEXFIZI6DY2WAQE4E,0,Les incoh√©rences de l‚Äôing√©rence,2023-06-02 17:29:58+00:00,2023-06-02 16:03:24+00:00,1,0,0.005476,0.167515,0.255499,0.006981,0.005647,0.176317,Mon dieu!! On nous prend s√©rieusement pour des...,722477749883613_1593974021087773
1,0.030433,0.367985,0.0359,0.0103,0.006227,0.28549,0,722430586554996_952366096187557,0,c'est √©trange √† quel point les illumin√© des r√©...,722430586554996_2731418863667914,192978590727638_722430586554996,"[{""title"":""Legault veut emp√™cher les antiavort...",192978590727638_722430586554996,actualites,"La ministre qu√©b√©coise du Tourisme, Caroline P...",https://www.facebook.com/553956616735728/posts...,2023,0,Legault veut emp√™cher les antiavortement de fa...,2023-06-02 16:53:40+00:00,2023-06-02 14:55:09+00:00,1,0,0.236189,0.259987,0.02905,0.017998,0.007314,0.284252,"Au Canada comme en Chine, l'avortement jusqu'√†...",722430586554996_2731418863667914


In [98]:
# ====== √âliminer les colonnes non-pertinantes ====== #
colones_non_utiles = ['id_comments', 'parent', 'postId', 'attachments.data', 'id_posts', 'id_comments_parent', 'permalink_url']
df_merge = df_merge.drop(columns=colones_non_utiles)
df_merge.head(2)

Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts,secondTopic,shares,title,comment_time,post_time,reponse_a_commentaire,reponse_a_publication,IDENTITY_ATTACK_parent,INSULT_parent,PROFANITY_parent,SEVERE_TOXICITY_parent,THREAT_parent,TOXICITY_parent,message_comments_parent
0,0.029144,0.269578,0.016753,0.006104,0.005864,0.246033,0,0,"Denise Morais ..Oui et avec raison ! (Car ""nou...",composer-preview,H√âL√àNE BUZZETTI / Jagmeet Singh a soutenu que ...,7FJ4TUHKEFEXFIZI6DY2WAQE4E,0,Les incoh√©rences de l‚Äôing√©rence,2023-06-02 17:29:58+00:00,2023-06-02 16:03:24+00:00,1,0,0.005476,0.167515,0.255499,0.006981,0.005647,0.176317,Mon dieu!! On nous prend s√©rieusement pour des...
1,0.030433,0.367985,0.0359,0.0103,0.006227,0.28549,0,0,c'est √©trange √† quel point les illumin√© des r√©...,actualites,"La ministre qu√©b√©coise du Tourisme, Caroline P...",2023,0,Legault veut emp√™cher les antiavortement de fa...,2023-06-02 16:53:40+00:00,2023-06-02 14:55:09+00:00,1,0,0.236189,0.259987,0.02905,0.017998,0.007314,0.284252,"Au Canada comme en Chine, l'avortement jusqu'√†..."


***
## Les sujets
***

In [76]:
test = df_merge[['mainTopic']].copy()
test = test.groupby('mainTopic').size().reset_index().rename(columns={0:'count'}).sort_values(by=['count'], ascending=False).reset_index(drop=True)
test.to_excel('mainTopic.xlsx')
test

# Note en tappant la valeur aberrante sur google, on retombe sur l'article, probablement une partie de l'adresse html


Unnamed: 0,mainTopic,count
0,actualite,198021
1,2022,66131
2,actualites,48292
3,2023,30253
4,opinions,28952
...,...,...
890,2eaac8fca5f93ecd83a1d3efba2cc01b?utm_medium=So...,1
891,100ed36505cc620aa0dcc7cfb767a623?utm_medium=So...,1
892,7cd35e270a52418ae2c270e883208b8a?utm_medium=So...,1
893,0fa2dc5c6801cec0ffe1af766c2b1340,1


In [94]:
test2 = df_merge.copy()
# test2.loc[test2['mainTopic'] == 'dfb1c6e29873979d241ee7b86511563d'].head()

print(len(test2))



test2.loc[test2['mainTopic']=='2Ncrl4z'].head()
# test2 = test2.loc[test2['mainTopic'].str.len()<12]
# print(len(test2.loc[test2['mainTopic'].str.len()<12]))
# test2 = test2.groupby('mainTopic').size().reset_index().rename(columns={0:'count'}).sort_values(by=['count'], ascending=False).reset_index(drop=True)

# test2.to_excel('mainTopic_short.xlsx')


# print(len(test2.loc[test2['mainTopic'].str.len()<12]))
# test2.loc[test2['mainTopic'].str.len()<10]
# Note : 

475376


Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts,secondTopic,shares,title,comment_time,post_time,reponse_a_commentaire,reponse_a_publication,IDENTITY_ATTACK_parent,INSULT_parent,PROFANITY_parent,SEVERE_TOXICITY_parent,THREAT_parent,TOXICITY_parent
307921,0.001147,0.156985,0.011509,0.001411,0.005369,0.128675,0,6,"Dyan Beseski le gars semble assez ing√©nieux, √ß...",2Ncrl4z,"‚òÄÔ∏èüå± Au lieu de ruminer, un ancien soudeur de l...",,263,Il transforme son vieux Tempo en serre g√©other...,2021-02-16 14:16:27+00:00,2021-02-16 13:30:23+00:00,1,0,0.00098,0.046179,0.010672,0.001984,0.008531,0.06184
307922,0.000264,0.006059,0.009024,0.000225,0.005325,0.002576,0,0,C'est assur√©ment dans mes r√™vesssss,2Ncrl4z,"‚òÄÔ∏èüå± Au lieu de ruminer, un ancien soudeur de l...",,263,Il transforme son vieux Tempo en serre g√©other...,2021-02-16 16:15:39+00:00,2021-02-16 13:30:23+00:00,1,0,0.000916,0.007456,0.011048,0.000477,0.005583,0.006314
307923,0.00117,0.027347,0.038155,0.002766,0.005693,0.040678,0,0,Gege East-Goulet Dope! Super facile √† faire en...,2Ncrl4z,"‚òÄÔ∏èüå± Au lieu de ruminer, un ancien soudeur de l...",,263,Il transforme son vieux Tempo en serre g√©other...,2021-02-16 17:55:31+00:00,2021-02-16 13:30:23+00:00,1,0,0.001517,0.009394,0.011817,0.000887,0.005851,0.009676
307924,0.00444,0.066172,0.553563,0.018062,0.005923,0.177184,0,1,Guillaume Labonte Cote ou m√™me avoir un fuckin...,2Ncrl4z,"‚òÄÔ∏èüå± Au lieu de ruminer, un ancien soudeur de l...",,263,Il transforme son vieux Tempo en serre g√©other...,2021-02-16 18:15:45+00:00,2021-02-16 13:30:23+00:00,1,0,0.004847,0.063025,0.030389,0.005264,0.014566,0.109134
307925,0.000726,0.007779,0.007933,0.000294,0.005296,0.004712,0,1,Guillaume Labonte Cote les jardins en fa√ßade s...,2Ncrl4z,"‚òÄÔ∏èüå± Au lieu de ruminer, un ancien soudeur de l...",,263,Il transforme son vieux Tempo en serre g√©other...,2021-02-16 20:04:32+00:00,2021-02-16 13:30:23+00:00,1,0,0.004847,0.063025,0.030389,0.005264,0.014566,0.109134


***
## Extraction des mots dans les publications
***

1. Mots pertinent (les titres et les commentaires, les commentaires parents)
2. Les √©moji