***
# Pipeline
***

In [95]:
# ====== Importer les library ====== #
import pandas as pd
import numpy as np
from transformers import pipeline
from fonctions import findEmoji, emojiToText
# from fonctions import normalizeCols

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [16]:
# ====== Resolve the data paths ====== #
import os

cwd = os.getcwd()
parent = os.path.dirname(cwd)

# Build the directories with os.path.join rather than hard-coded '\\'
# separators, so the notebook also runs on Linux/macOS. The trailing empty
# component keeps a trailing separator, which is required because the paths
# are used below as `data_path + 'file.csv'`.
data_path = os.path.join(parent, 'data', '')
df_path = os.path.join(cwd, 'dataframeTest', '')

In [17]:
# Load the two input tables from the data directory: Test.csv first, then Posts.csv.
test, posts = (pd.read_csv(data_path + file_name) for file_name in ('Test.csv', 'Posts.csv'))

***

In [18]:
# ====== Merge with the posts ====== #
# Default (inner) merge: rows of `test` whose postId has no match in `posts` are dropped.
# Overlapping column names get the '_comments' / '_posts' suffixes; the merge
# indicator column is removed again and the row position is exposed as `id`.
df_merge = (
    pd.merge(test, posts, left_on='postId', right_on='id', indicator=True, suffixes=('_comments', '_posts'))
    .reset_index(drop=False)
    .rename(columns={'index': 'id'})
    .drop(columns='_merge')
)

***

In [19]:
# ====== Time attributes ====== #
df_merge['comment_time'] = pd.to_datetime(df_merge['created_time_comments'])
df_merge['post_time'] = pd.to_datetime(df_merge['created_time_posts'])
df_merge = df_merge.drop(columns=['created_time_comments', 'created_time_posts'])

# Drop the columns that are not used downstream
colones_non_utiles = ['id_comments', 'parent', 'postId', 'attachments.data', 'id_posts', 'permalink_url']
df_merge = df_merge.drop(columns=colones_non_utiles)

# Delay between the post and the comment, as a float number of whole minutes.
# `.astype('timedelta64[m]')` is rejected by pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported), so the minutes are derived from total_seconds().
# Negative delays are clipped to 0 later, before the model is applied.
df_merge['time_difference'] = (df_merge['comment_time'] - df_merge['post_time']).dt.total_seconds() // 60

# Stand-alone calendar attributes of the comment
df_merge['year'] = df_merge['comment_time'].dt.year
df_merge['month'] = df_merge['comment_time'].dt.month
df_merge['weekday'] = df_merge['comment_time'].dt.weekday       # Monday=0, Sunday=6
df_merge = df_merge.drop(columns=['comment_time', 'post_time'])

***

In [27]:
# ====== Extract the emojis ====== #
# `emojis` holds whatever findEmoji returns for each comment; `emojis_text`
# holds emojiToText applied to every element of that result.
comment_text = df_merge['message_comments']
df_merge['emojis'] = comment_text.apply(findEmoji)
df_merge['emojis_text'] = df_merge['emojis'].apply(lambda found: list(map(emojiToText, found)))

# Number of emojis per comment
df_merge['emoji_count'] = df_merge['emojis_text'].apply(len)

***

In [28]:
# ====== Subject ====== #
# Frame with the subject assigned to each article, joined on the article title.
# NOTE(review): assumes each title appears once in df_topics.h5; duplicated
# titles would duplicate comment rows in this left merge — confirm.
df = pd.read_hdf(df_path + 'df_topics.h5')
df_merge = df_merge.merge(df, how='left', on='title')

# Articles without an assigned subject get the subject 'Autre'
missing_topic = df_merge['title_classify'].isna()
df_merge['title_classify'] = np.where(missing_topic, 'Autre', df_merge['title_classify'])

In [29]:
# Preview the first two rows once the time, emoji and subject attributes are in place.
df_merge.head(2)

Unnamed: 0,id,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts,secondTopic,shares,title,time_difference,year,month,weekday,emojis,emojis_text,emoji_count,title_classify
0,0,0.000548,0.007893,0.0118,0.000517,0.005419,0.006754,-1,0,Mon ex🤔?,chroniques,CHRONIQUE / Entre 4 à 10% des Québécois entend...,patrick-duquette,0,Un troupeau de voix enragées,1.0,2023,6,3,[🤔],[thinking_face],1,Éducation
1,1,0.125566,0.469236,0.501357,0.169603,0.012389,0.434752,-1,0,"""Quel est ton nom ? Je te l' ordonne de le di...",chroniques,CHRONIQUE / Entre 4 à 10% des Québécois entend...,patrick-duquette,0,Un troupeau de voix enragées,20.0,2023,6,3,[],[],0,Éducation


***

In [None]:
# ====== Sentiments ====== #
list_to_process = df_merge['message_comments'].tolist()

# Model: every class score is requested for each comment and over-long
# inputs are truncated.
distilled_student_sentiment_classifier = pipeline(model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", return_all_scores=True, truncation=True)
result_multi_student = distilled_student_sentiment_classifier(list_to_process)

# Change to df. Each score is placed by the label the model reports for it
# rather than by its position in the result list, so the columns stay correct
# whatever order the pipeline returns the classes in. A label that the model
# does not emit yields a NaN column instead of silently mislabelled scores.
col_names = ['positive', 'neutral', 'negative']
data = [{d["label"].lower(): d["score"] for d in scores} for scores in result_multi_student]
df_multi_student = pd.DataFrame(data, columns=col_names)

In [None]:
# Attach the sentiment scores to the main frame. Sentiment columns left over
# from an earlier run of this cell are dropped first, so re-running the cell
# does not produce duplicated column names.
df_merge = pd.concat(
    [df_merge.drop(columns=df_multi_student.columns, errors='ignore'), df_multi_student],
    axis='columns',
)

In [None]:
# Persist the raw sentiment scores (HDF5, key 's') so the model does not have to be re-run
sentiment_results_file = df_path + 'sentiement_model_results.h5'
df_multi_student.to_hdf(sentiment_results_file, key='s')

***

In [31]:
# ====== Comment length ====== #
# Number of whitespace-separated tokens in each comment (value is stringified first).
df_merge['comment_lenght'] = [len(str(text).split()) for text in df_merge['message_comments']]

***

In [35]:
# ====== Question ====== #
sequence_to_classify = df_merge['message_comments'].tolist()

# Text-classification pipeline; inputs are truncated at call time.
pipe = pipeline("text-classification", model="shahrukhx01/bert-mini-finetune-question-detection", max_length=512)
list_classifier = pipe(sequence_to_classify, truncation=True)

# Turn the list of results into a frame and prefix its columns with `question_`
question_df = pd.DataFrame(list_classifier).rename(
    columns={'label': 'question_label', 'score': 'question_score'}
)

In [36]:
# Add the classifier output to df. Only the classifier columns that are not
# already present are appended, so the cell can be re-run and a partially
# present set of columns never leads to duplicated column names.
missing_question_cols = [col for col in question_df.columns if col not in df_merge.columns]
if missing_question_cols:
    df_merge = pd.concat([df_merge, question_df[missing_question_cols]], axis='columns')

# Add AT: 1 when the comment contains an '@', 0 otherwise.
# Missing (NaN) messages count as 0 instead of raising a TypeError.
df_merge['AT'] = df_merge['message_comments'].str.contains('@', regex=False, na=False).astype(int)

# A comment is flagged as a question only when the classifier answers
# LABEL_1 with a score above 0.75.
df_merge['question_bool'] = np.where((df_merge['question_label'] == 'LABEL_1') & (df_merge['question_score'] > 0.75), 1, 0)

question_label
question_score


In [38]:
# Persist the enriched frame (HDF5, key 's'). The object-dtype columns are
# pickled by PyTables, which is what triggers the performance warning.
merged_store_path = df_path + 'df_merge.h5'
df_merge.to_hdf(merged_store_path, key='s')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['message_comments', 'mainTopic', 'message_posts', 'secondTopic',
       'title', 'emojis', 'emojis_text', 'title_classify', 'question_label'],
      dtype='object')]

  df_merge.to_hdf(df_path + 'df_merge.h5', key='s')


In [37]:
# Preview the first two rows with the comment-length, question and AT attributes added.
df_merge.head(2)

Unnamed: 0,id,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts,secondTopic,shares,title,time_difference,year,month,weekday,emojis,emojis_text,emoji_count,title_classify,comment_lenght,question_label,question_score,AT,question_bool
0,0,0.000548,0.007893,0.0118,0.000517,0.005419,0.006754,-1,0,Mon ex🤔?,chroniques,CHRONIQUE / Entre 4 à 10% des Québécois entend...,patrick-duquette,0,Un troupeau de voix enragées,1.0,2023,6,3,[🤔],[thinking_face],1,Éducation,2,LABEL_1,0.994859,0,1
1,1,0.125566,0.469236,0.501357,0.169603,0.012389,0.434752,-1,0,"""Quel est ton nom ? Je te l' ordonne de le di...",chroniques,CHRONIQUE / Entre 4 à 10% des Québécois entend...,patrick-duquette,0,Un troupeau de voix enragées,20.0,2023,6,3,[],[],0,Éducation,244,LABEL_0,0.885108,0,0


***

In [69]:
# Reload the enriched frame from the HDF5 file in the dataframe directory.
merged_store = df_path + 'df_merge.h5'
df_merge = pd.read_hdf(merged_store)

In [70]:
def normalizeCols(df, cols):
    """Min-max scale the given columns of ``df`` to the [0, 1] range.

    The frame is modified in place and also returned, so both
    ``normalizeCols(df, cols)`` and ``df = normalizeCols(df, cols)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale.
    cols : iterable of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column replaced by
        ``(x - min) / (max - min)``. A column whose values are all equal
        becomes 0.0 instead of NaN (which is what the 0/0 division produced).
    """
    for col in cols:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        centered = df[col] - col_min
        if col_range != 0:
            df[col] = centered / col_range
        else:
            # Constant column: `centered` is already all zeros, so skip the
            # 0/0 division; missing values stay missing.
            df[col] = centered.astype(float)
    return df

In [76]:
df_model = df_merge.copy()

# One-hot encode the categorical attributes
columns = ['year', 'weekday', 'title_classify']
df_model = pd.get_dummies(df_model, columns=columns)

# Sentiment features: use zero placeholders only when the sentiment scores
# were not computed, instead of overwriting real scores with 0.
# NOTE(review): presumably the model was fitted with these two features — confirm.
for sentiment_col in ('negative', 'positive'):
    if sentiment_col not in df_model.columns:
        df_model[sentiment_col] = 0

# Cols to keep
col_to_keep = ['IDENTITY_ATTACK', 'INSULT', 'PROFANITY', 'SEVERE_TOXICITY', 'THREAT', 'TOXICITY', 'like_count', 'shares', 'time_difference', 'positive', 'negative', 'comment_lenght', 'AT', 'question_bool', 'emoji_count']

# Include the one-hot encoded attributes. They are matched on the
# '<attribute>_' prefix that get_dummies generates rather than on a bare
# substring, so an unrelated column cannot be picked up or added twice.
encoded_prefixes = tuple(f'{column}_' for column in columns)
col_to_keep += [col for col in df_model.columns if col.startswith(encoded_prefixes)]

# .copy() so the assignments below write to an independent frame and not to
# a slice of the previous one (avoids SettingWithCopyWarning).
df_model = df_model[col_to_keep].copy()

# Safety net: a comment cannot precede its post, so negative delays become 0
if 'time_difference' in col_to_keep:
    df_model['time_difference'] = df_model['time_difference'].clip(lower=0)

# Scale the unbounded numeric features
cols = ['like_count', 'shares', 'time_difference', 'comment_lenght', 'emoji_count']
df_model = normalizeCols(df_model, cols)
df_model.head()


Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,like_count,shares,time_difference,positive,negative,comment_lenght,AT,question_bool,emoji_count,year_2020,year_2021,year_2022,year_2023,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,title_classify_Autre,title_classify_COVID,title_classify_Environnement,title_classify_Justice,title_classify_Politique,title_classify_Santé,title_classify_Sport,title_classify_Technologie,title_classify_Économie,title_classify_Éducation
0,0.000548,0.007893,0.0118,0.000517,0.005419,0.006754,0.0,0.0,2e-06,0,0,0.000955,0,1,0.001117,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0.125566,0.469236,0.501357,0.169603,0.012389,0.434752,0.0,0.0,4.3e-05,0,0,0.232092,0,0,0.0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0.000731,0.008349,0.008597,0.000434,0.00545,0.006723,0.0,0.0,0.000804,0,0,0.010506,0,0,0.001117,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,0.007104,0.080892,0.077085,0.006142,0.006492,0.113367,0.0,0.0,0.002079,0,0,0.00191,0,0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,0.009471,0.071433,0.011971,0.001945,0.005539,0.098288,0.0,0.000154,3.8e-05,0,0,0.016237,0,0,0.001117,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [77]:
# Adjustments: remove, one after the other, the two columns that are not passed to the model
for dropped_col in ('title_classify_Autre', 'emoji_count'):
    df_model = df_model.drop(columns=dropped_col)


In [73]:
# Preview the first two rows of the model input frame.
df_model.head(2)

Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,like_count,shares,time_difference,positive,negative,comment_lenght,AT,question_bool,emoji_count,year_2020,year_2021,year_2022,year_2023,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,title_classify_COVID,title_classify_Environnement,title_classify_Justice,title_classify_Politique,title_classify_Santé,title_classify_Sport,title_classify_Technologie,title_classify_Économie,title_classify_Éducation
0,0.000548,0.007893,0.0118,0.000517,0.005419,0.006754,0.0,0.0,2e-06,0,0,0.000955,0,1,0.001117,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,0.125566,0.469236,0.501357,0.169603,0.012389,0.434752,0.0,0.0,4.3e-05,0,0,0.232092,0,0,0.0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [78]:
# ====== Predict ====== #
from joblib import dump, load
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load a model file that comes from a trusted source.
model = load('randomForest.joblib')

# The prepared frame is used as the feature matrix as-is.
# NOTE(review): presumably its columns must match the ones the model was fitted on — confirm.
X = df_model
prediction = model.predict(X)

In [84]:
# Sanity check: print the number of predictions and the number of rows in df_merge for comparison.
print(len(prediction))
print(len(df_merge))

107470
107470


In [92]:
# Wrap the predictions in a frame and expose the row position as an explicit `index` column
final_results = pd.DataFrame(prediction).reset_index(drop=False)
final_results

Unnamed: 0,index,0
0,0,1
1,1,0
2,2,0
3,3,0
4,4,1
...,...,...
107465,107465,0
107466,107466,0
107467,107467,0
107468,107468,0


In [94]:
# Write every row of final_results (index, prediction) as integers to
# finalResults.txt; the relative path resolves against the current working directory.
np.savetxt('finalResults.txt', final_results, fmt='%d')