***
# Modele de prédiction
***

In [1]:
# ====== Importer les library ====== #
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [7]:
# ====== Déterminer les path ====== #
import os
cwd = os.getcwd()
parent = os.path.dirname(cwd)

data_path = parent + '\\data\\'
df_path = cwd + '\\dataframeTest\\'

In [36]:
test = pd.read_csv(data_path + 'Test.csv')
posts = pd.read_csv(data_path + 'Posts.csv')

In [34]:
len(test)

107470

In [38]:
# ====== Merge ====== # 
df_merge = pd.merge(test, posts, left_on='postId', right_on='id', indicator=True, suffixes=('_comments', '_posts'))
df_merge = df_merge.reset_index(drop=False).rename(columns={'index':'id'})
df_merge = df_merge.drop(columns='_merge')

In [39]:
# ====== convertir le temps en format datetime ====== #
df_merge['comment_time'] = pd.to_datetime(df_merge['created_time_comments'])
df_merge['post_time'] = pd.to_datetime(df_merge['created_time_posts'])
df_merge = df_merge.drop(columns=['created_time_comments', 'created_time_posts'])

# Éliminer les colonnes non-pertinantes
colones_non_utiles = ['id_comments', 'parent', 'postId', 'attachments.data', 'id_posts', 'permalink_url']
df_merge = df_merge.drop(columns=colones_non_utiles)
df_merge.head(2)

# Attribut différence de temps
df_merge['time_difference'] = df_merge['comment_time'] - df_merge['post_time'] 
df_merge['time_difference'] = df_merge['time_difference'].astype('timedelta64[m]')
df_merge.head(2)

# Attributs temps solo
df_merge['year'] = df_merge['comment_time'].dt.year
df_merge['month'] = df_merge['comment_time'].dt.month
df_merge['weekday'] = df_merge['comment_time'].dt.weekday       # Monday=0, Sunday=6
df_merge = df_merge.drop(columns=['comment_time', 'post_time'])

In [40]:
# ====== Extraire les Émoji ====== #
import re
import emoji

def findEmoji(text):
    emojis = re.findall("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", text)
    return emojis


##### Créer une liste de valeur plutot qu'un bloc de text #####
def emojiToText(text):
    emojiInText = emoji.demojize(text)
    emojiInText = emojiInText[1:-1]
    return emojiInText

df_merge['emojis'] = df_merge['message_comments'].apply(lambda x: findEmoji(x))
df_merge['emojis_text'] = df_merge['emojis'].apply(lambda x:[emojiToText(value) for value in x])

# Attribut nombre d'émoji
df_merge['emoji_count'] = df_merge['emojis_text'].apply(lambda x: len(x))


In [92]:
# ====== Subject ====== #

# Df avec les sujet attribué aux articles
df = pd.read_hdf(df_path + 'df_topics.h5')
df_merge = pd.merge(left=df_merge, right=df, how='left', on='title')

In [44]:
df_merge['title_classify'].unique()

Unnamed: 0,title,title_classify
0,Une électrification à « la mesure des besoins ...,Technologie
1,Les incohérences de l’ingérence,Environnement


In [None]:
from fonctions import modiliserSujetDesArticles
titre_manquant = df_merge.loc[df_merge['title_classify'].isna()]
titre_manquant = titre_manquant[['message_posts']].drop_duplicates().dropna().reset_index(drop=True)
titre_manquant_list = titre_manquant['message_posts'].to_list()

# Fonction attribut un sujet
subject = modiliserSujetDesArticles(titre_manquant_list)

titre_manquant['title'] = subject
titre_manquant

In [100]:
df_merge = pd.merge(left=df_merge, right=titre_manquant, how='left', on='title')

df_merge['title_classify'] = np.where(df_merge['title_classify_x'].isna(), df_merge['title_classify_y'], df_merge['title_classify_x'])
df_merge = df_merge.drop(columns=['title_classify_x', 'message_posts', 'title_classify_y'])
df_merge.head(2)

Unnamed: 0,id,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts_x,secondTopic,shares,title,time_difference,year,month,weekday,emojis,emojis_text,emoji_count,title_classify
0,0,0.000548,0.007893,0.0118,0.000517,0.005419,0.006754,-1,0,Mon ex🤔?,chroniques,CHRONIQUE / Entre 4 à 10% des Québécois entend...,patrick-duquette,0,Un troupeau de voix enragées,1.0,2023,6,3,[🤔],[thinking_face],1,Éducation
1,1,0.125566,0.469236,0.501357,0.169603,0.012389,0.434752,-1,0,"""Quel est ton nom ? Je te l' ordonne de le di...",chroniques,CHRONIQUE / Entre 4 à 10% des Québécois entend...,patrick-duquette,0,Un troupeau de voix enragées,20.0,2023,6,3,[],[],0,Éducation


In [101]:
df_merge.loc[df_merge['title_classify'].isna()]

Unnamed: 0,id,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts_x,secondTopic,shares,title,time_difference,year,month,weekday,emojis,emojis_text,emoji_count,title_classify
3813,3813,0.000180,0.008368,0.008256,0.000154,0.005241,0.004335,-1,6,Pas certaine que c’est un changement dans le b...,,Qui dit changement dit nouveauté et adaptabili...,,1,,58.0,2023,4,4,[],[],0,
3814,3814,0.000405,0.010895,0.008828,0.000234,0.005217,0.007760,-1,3,"Je n’aime pas du tout cette formule, en partic...",,Qui dit changement dit nouveauté et adaptabili...,,1,,162.0,2023,4,4,[],[],0,
3815,3815,0.000243,0.008862,0.008862,0.000198,0.005268,0.005372,-1,0,"Effectivement, je ne me retrouve plus dans la ...",,Qui dit changement dit nouveauté et adaptabili...,,1,,1234.0,2023,4,5,[],[],0,
3816,3816,0.002719,0.081139,0.047674,0.002356,0.005780,0.126940,-1,4,J’ai décidé de me désabonner après plus de 15 ...,,Qui dit changement dit nouveauté et adaptabili...,,1,,1259.0,2023,4,5,[],[],0,
3817,3817,0.000135,0.005793,0.007842,0.000104,0.005242,0.001579,-1,4,Catherine Chagnon Je suis tout à fait d’accord...,,Qui dit changement dit nouveauté et adaptabili...,,1,,2877.0,2023,4,6,[],[],0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85423,85423,0.007844,0.174862,0.061953,0.007324,0.006110,0.166786,-1,1,Faudrait pas qu'ils se fassent tester et faire...,,Bilan COVID du 11 mai au Québec 🦠\nTous les dé...,,1,,1257.0,2021,5,2,[],[],0,
85424,85424,0.002645,0.227703,0.012893,0.002556,0.005422,0.182691,-1,0,Nicolas Perrot Porteur du négatif.,,Bilan COVID du 11 mai au Québec 🦠\nTous les dé...,,1,,2108.0,2021,5,3,[],[],0,
93753,93753,0.001406,0.024253,0.015045,0.001736,0.005838,0.032863,-1,0,On dirait des doigts....,,Un petit pique-nique au milieu de la rivière P...,,2,,577.0,2021,7,6,[],[],0,
94808,94808,0.002137,0.007551,0.013132,0.000997,0.006816,0.009048,-1,1,Felix Girard,,Envie de découvrir la ville de Québec en goûta...,,3,,423.0,2021,7,0,[],[],0,


In [63]:
test = df_merge.loc[df_merge['title_classify'].isna()]
print(len(test))
test = test['title_classify'].drop_duplicates()
print(len(test))
test

1319
1


3813    NaN
Name: title_classify, dtype: object

In [48]:
# S'assurer qu'il n'y ait pas de valeurs manquantes
df_merge['title_classify'].unique()

array(['Éducation', 'Justice', 'Sport', 'Environnement', 'Économie',
       'Technologie', 'Santé', 'COVID', 'Politique', nan], dtype=object)

In [33]:
len(df_merge)

4899

In [31]:
df_merge.to_hdf(df_path + 'df_merge.h5', key='s')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['message_comments', 'mainTopic', 'message_posts', 'secondTopic',
       'title', 'emojis', 'emojis_text', 'title_classify'],
      dtype='object')]

  df_merge.to_hdf(df_path + 'df_merge.h5', key='s')


In [None]:
df_merge =pd.read_hdf(df_path + 'df_merge.h5')

In [32]:
df_merge.head(2)

Unnamed: 0,id,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts,secondTopic,shares,title,time_difference,year,month,weekday,emojis,emojis_text,emoji_count,title_classify
0,585,0.001591,0.048521,0.008845,0.000844,0.005471,0.0481,-1,0,Un mouvement annoncé. Maintenant que le projet...,composer-preview,Les citoyens de Saint-Sacrement veulent des st...,X4CU3G7OUNBJHDFGPHGB5LM55U,1,Saint-Sacrement veut des stations de tramway p...,89.0,2023,5,4,[],[],0,Santé
1,586,0.000735,0.024511,0.008456,0.000491,0.005392,0.025557,-1,0,Gerard Content \n\nIl est grand temps de se fa...,composer-preview,Les citoyens de Saint-Sacrement veulent des st...,X4CU3G7OUNBJHDFGPHGB5LM55U,1,Saint-Sacrement veut des stations de tramway p...,187.0,2023,5,4,[],[],0,Santé
