***
# Nettoyage des données
***

In [1]:
# ====== Importer les library ====== #
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [11]:
# ====== Resolve the data directory ====== #
import os

# The data folder is looked up in the parent of the current working directory.
cwd = os.getcwd()
parent = os.path.dirname(cwd)

# Build the path with os.path.join instead of hard-coded backslashes so it
# resolves on every OS, not only on Windows. The trailing empty component
# keeps a trailing separator, so `data_path + 'file.csv'` keeps working.
data_path = os.path.join(parent, 'data', '')

In [14]:
# ====== Load the dataframes ====== #
# Read the comments file, then the posts file, from the data folder.
comments, posts = (
    pd.read_csv(data_path + fichier)
    for fichier in ('Comments.csv', 'Posts.csv')
)

# Preview the first two comment rows.
comments.head(2)

Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,created_time,id,like_count,message,parent,postId
0,0.102216,0.651296,0.664565,0.350583,0.029933,0.588517,0,2023-06-02T16:06:59+0000,722477749883613_788168749338745,0,"Faux, ce ne sera jamais le temps pour lui car ...",,192978590727638_722477749883613
1,0.005476,0.167515,0.255499,0.006981,0.005647,0.176317,1,2023-06-02T16:21:43+0000,722477749883613_1593974021087773,1,Mon dieu!! On nous prend sérieusement pour des...,,192978590727638_722477749883613


In [None]:
# ====== Merge the dataframes ====== #
# Attach to every comment the columns of the post it belongs to
# (comments.postId -> posts.id). Column names present in both frames receive
# the '_comments' / '_posts' suffixes.
df_merge = pd.merge(
    comments,
    posts,
    left_on='postId',
    right_on='id',
    suffixes=('_comments', '_posts'),
)

# Merge analysis.
# The merge above is an inner join, so an indicator column computed on its
# result can only ever contain 'both' and would never reveal unmatched rows
# (and indexing its unique values fails when the result is empty). Count the
# comments whose postId is absent from posts directly on the inputs instead.
nb_non_merge = int((~comments['postId'].isin(posts['id'])).sum())

if nb_non_merge == 0:
    print('Tous les lignes merge')
else:
    print('Certaines lignes ne sont pas merge')
    print(f'Nombre de commentaires sans publication : {nb_non_merge}')

# Preview last, so the cell actually displays it.
df_merge.head(2)

In [None]:
# ====== Missing-value overview ====== #
# Header lines first, then one line per column: the column name, its number of
# missing values and the share of rows they represent (rounded percentage).
print('Missing value', '*'*25, f'Nombre de lignes : {len(df_merge)}', sep='\n')
for col in df_merge.columns.to_list():
    nb_na = df_merge[col].isna().sum()
    # Reuse the count computed above rather than scanning the column again.
    perc = round(nb_na/len(df_merge)*100)
    print(f'{col.ljust(21)} : {str(nb_na).rjust(11)} ({perc}%)')

In [None]:
# ====== Convert the timestamps to datetime ====== #
# Parse the comment and post creation times into the new `comment_time` and
# `post_time` columns, then drop the `created_time_*` columns they replace.
df_merge = (
    df_merge
    .assign(
        comment_time=pd.to_datetime(df_merge['created_time_comments']),
        post_time=pd.to_datetime(df_merge['created_time_posts']),
    )
    .drop(columns=['created_time_comments', 'created_time_posts'])
)
df_merge.head(2)

In [None]:
# ====== Add information derived from the `parent` attribute ====== #

# `parent` holds the id of the comment that this comment replies to.
# A missing value means the comment replies to the original post.
# This makes it possible to bring in information about the replied-to comment.

# Binary flags: reply to a comment vs. reply to the original post.
df_merge['reponse_a_commentaire'] = np.where(df_merge['parent'].notna(), 1, 0)
df_merge['reponse_a_publication'] = np.where(df_merge['parent'].isna(), 1, 0)

# Attach the scores, message and id of the parent comment; the added columns
# carry the '_parent' suffix.
cols = ['IDENTITY_ATTACK', 'INSULT', 'PROFANITY', 'SEVERE_TOXICITY', 'THREAT', 'TOXICITY', 'message_comments', 'id_comments']
df_merge = pd.merge(
    df_merge,
    # Keep one lookup row per comment id so the self-join cannot multiply rows.
    df_merge[cols].drop_duplicates(subset='id_comments'),
    # Left join: the default inner join would drop every comment without a
    # parent (the replies to the post flagged above) as well as replies whose
    # parent comment is not in the data. Their '_parent' columns are NaN.
    how='left',
    left_on='parent',
    right_on='id_comments',
    suffixes=('', '_parent'),
)

df_merge.head(2)

In [None]:
# ====== Drop the irrelevant columns ====== #
# Columns listed here are removed from the merged frame in one call.
colones_non_utiles = [
    'id_comments',
    'parent',
    'postId',
    'attachments.data',
    'id_posts',
    'id_comments_parent',
    'permalink_url',
]
df_merge = df_merge.drop(colones_non_utiles, axis='columns')
df_merge.head(2)