## récupération des commentaires du site dummyjson

In [90]:
import requests

# URL pour récupérer tous les commentaires
url = "https://dummyjson.com/comments?limit=0"

# Envoi de la requête GET
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    comments = data["comments"]
else:
    print("Erreur lors de la récupération des commentaires :", response.status_code)


### Vérification de la taille des commentaires

In [91]:
len(comments)

340

### Visualisation  des commentaires

In [92]:
import pandas as pd

comments_df = pd.DataFrame(comments)

comments_df.head()

Unnamed: 0,id,body,postId,user
0,1,This is some awesome thinking!,100,"{'id': 63, 'username': 'eburras1q'}"
1,2,What terrific math skills you’re showing!,27,"{'id': 71, 'username': 'omarsland1y'}"
2,3,You are an amazing writer!,61,"{'id': 29, 'username': 'jissetts'}"
3,4,Wow! You have improved so much!,8,"{'id': 19, 'username': 'bleveragei'}"
4,5,Nice idea!,62,"{'id': 70, 'username': 'cmasurel1x'}"


### Restructuration du DataFrame

In [93]:
comments_df['user_id'] = comments_df['user'].apply(lambda x: x['id'])
comments_df['username'] = comments_df['user'].apply(lambda x: x['username'])

comments_df.drop(columns=['user'], inplace=True)



In [94]:
comments_df.tail()

Unnamed: 0,id,body,postId,user_id,username
335,336,"I checked the clouds during midnight, and wond...",73,91,hyaknov2i
336,337,"This is such an Insta beauty, why waste it by ...",79,77,rkingswood24
337,338,Have we met before? If not – would you like to...,91,31,smargiottau
338,339,"With great pictures, comes great responsibility.",4,89,cdwyr2g
339,340,I could say this is one of the greatest pic of...,71,86,whuman2d


In [95]:
comments_df.to_csv('comments_df.csv', index=False)  

In [96]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
comments_df = pd.read_csv('comments_df.csv')  

In [97]:
comments_df.tail()

Unnamed: 0,id,body,postId,user_id,username
335,336,"I checked the clouds during midnight, and wond...",73,91,hyaknov2i
336,337,"This is such an Insta beauty, why waste it by ...",79,77,rkingswood24
337,338,Have we met before? If not – would you like to...,91,31,smargiottau
338,339,"With great pictures, comes great responsibility.",4,89,cdwyr2g
339,340,I could say this is one of the greatest pic of...,71,86,whuman2d


## Nettoyer les commentaires

In [98]:



# Fonction pour nettoyer les phrases
def clean_text(text):
    # Supprimer les caractères spéciaux, les liens et les emojis
    text = re.sub(r'http\S+', '', text)  # Supprimer les liens
    text = re.sub(r'[^\w\s]', '', text)  # Supprimer les caractères spéciaux
    text = text.encode('ascii', 'ignore').decode('ascii')  # Supprimer les emojis
    return text

# Appliquer le nettoyage à la colonne 'text'
comments_df['clean_text'] = comments_df['body'].apply(clean_text)
comments_df

Unnamed: 0,id,body,postId,user_id,username,clean_text
0,1,This is some awesome thinking!,100,63,eburras1q,This is some awesome thinking
1,2,What terrific math skills you’re showing!,27,71,omarsland1y,What terrific math skills youre showing
2,3,You are an amazing writer!,61,29,jissetts,You are an amazing writer
3,4,Wow! You have improved so much!,8,19,bleveragei,Wow You have improved so much
4,5,Nice idea!,62,70,cmasurel1x,Nice idea
...,...,...,...,...,...,...
335,336,"I checked the clouds during midnight, and wond...",73,91,hyaknov2i,I checked the clouds during midnight and wonde...
336,337,"This is such an Insta beauty, why waste it by ...",79,77,rkingswood24,This is such an Insta beauty why waste it by c...
337,338,Have we met before? If not – would you like to...,91,31,smargiottau,Have we met before If not would you like to g...
338,339,"With great pictures, comes great responsibility.",4,89,cdwyr2g,With great pictures comes great responsibility


### Charger le modèle Detoxify

In [99]:
from detoxify import Detoxify
model = Detoxify('original')

### Faire l’analyse des toxicités de ces commentaires avec le modèle detoxify

In [100]:
toxicity_scores = []
for text in comments_df['clean_text']:
    result = model.predict(text)
    toxicity_scores.append(result)

comments_df['toxicity_score'] = toxicity_scores

comments_df

Unnamed: 0,id,body,postId,user_id,username,clean_text,toxicity_score
0,1,This is some awesome thinking!,100,63,eburras1q,This is some awesome thinking,"{'toxicity': 0.0007598002, 'severe_toxicity': ..."
1,2,What terrific math skills you’re showing!,27,71,omarsland1y,What terrific math skills youre showing,"{'toxicity': 0.0074881366, 'severe_toxicity': ..."
2,3,You are an amazing writer!,61,29,jissetts,You are an amazing writer,"{'toxicity': 0.00063423335, 'severe_toxicity':..."
3,4,Wow! You have improved so much!,8,19,bleveragei,Wow You have improved so much,"{'toxicity': 0.00094851875, 'severe_toxicity':..."
4,5,Nice idea!,62,70,cmasurel1x,Nice idea,"{'toxicity': 0.00065386726, 'severe_toxicity':..."
...,...,...,...,...,...,...,...
335,336,"I checked the clouds during midnight, and wond...",73,91,hyaknov2i,I checked the clouds during midnight and wonde...,"{'toxicity': 0.0006925876, 'severe_toxicity': ..."
336,337,"This is such an Insta beauty, why waste it by ...",79,77,rkingswood24,This is such an Insta beauty why waste it by c...,"{'toxicity': 0.013269149, 'severe_toxicity': 0..."
337,338,Have we met before? If not – would you like to...,91,31,smargiottau,Have we met before If not would you like to g...,"{'toxicity': 0.0008845267, 'severe_toxicity': ..."
338,339,"With great pictures, comes great responsibility.",4,89,cdwyr2g,With great pictures comes great responsibility,"{'toxicity': 0.00060104684, 'severe_toxicity':..."


### Restructuration du DataFrame

In [101]:
comments_df['toxicity'] = comments_df['toxicity_score'].apply(lambda x: x['toxicity'])
comments_df['severe_toxicity'] = comments_df['toxicity_score'].apply(lambda x: x['severe_toxicity'])
comments_df['obscene'] = comments_df['toxicity_score'].apply(lambda x: x['obscene'])
comments_df['threat'] = comments_df['toxicity_score'].apply(lambda x: x['threat'])
comments_df['insult'] = comments_df['toxicity_score'].apply(lambda x: x['insult'])
comments_df['identity_attack'] = comments_df['toxicity_score'].apply(lambda x: x['identity_attack'])



comments_df.drop(columns=['toxicity_score'], inplace=True)

In [88]:
comments_df

Unnamed: 0,id,body,postId,user_id,username,clean_text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack
0,1,This is some awesome thinking!,100,63,eburras1q,This is some awesome thinking,0.000760,0.000112,0.000195,0.000110,0.000170,0.000133
1,2,What terrific math skills you’re showing!,27,71,omarsland1y,What terrific math skills youre showing,0.007488,0.000083,0.000288,0.000123,0.000430,0.000199
2,3,You are an amazing writer!,61,29,jissetts,You are an amazing writer,0.000634,0.000124,0.000188,0.000135,0.000175,0.000137
3,4,Wow! You have improved so much!,8,19,bleveragei,Wow You have improved so much,0.000949,0.000105,0.000170,0.000125,0.000191,0.000140
4,5,Nice idea!,62,70,cmasurel1x,Nice idea,0.000654,0.000117,0.000187,0.000113,0.000176,0.000137
...,...,...,...,...,...,...,...,...,...,...,...,...
335,336,"I checked the clouds during midnight, and wond...",73,91,hyaknov2i,I checked the clouds during midnight and wonde...,0.000693,0.000113,0.000174,0.000123,0.000173,0.000137
336,337,"This is such an Insta beauty, why waste it by ...",79,77,rkingswood24,This is such an Insta beauty why waste it by c...,0.013269,0.000100,0.000508,0.000147,0.000501,0.000240
337,338,Have we met before? If not – would you like to...,91,31,smargiottau,Have we met before If not would you like to g...,0.000885,0.000108,0.000166,0.000121,0.000177,0.000149
338,339,"With great pictures, comes great responsibility.",4,89,cdwyr2g,With great pictures comes great responsibility,0.000601,0.000125,0.000179,0.000132,0.000175,0.000142


## Charger les données dans MongoDB

In [102]:
import pymongo

mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
mongo_db = mongo_client["dummyjson"]
mongo_collection = mongo_db["commentaires"]
comment_docs = comments_df.to_dict(orient='records')
mongo_collection.insert_many(comment_docs)

InsertManyResult([ObjectId('6613f2701fbbc780957eb7b9'), ObjectId('6613f2701fbbc780957eb7ba'), ObjectId('6613f2701fbbc780957eb7bb'), ObjectId('6613f2701fbbc780957eb7bc'), ObjectId('6613f2701fbbc780957eb7bd'), ObjectId('6613f2701fbbc780957eb7be'), ObjectId('6613f2701fbbc780957eb7bf'), ObjectId('6613f2701fbbc780957eb7c0'), ObjectId('6613f2701fbbc780957eb7c1'), ObjectId('6613f2701fbbc780957eb7c2'), ObjectId('6613f2701fbbc780957eb7c3'), ObjectId('6613f2701fbbc780957eb7c4'), ObjectId('6613f2701fbbc780957eb7c5'), ObjectId('6613f2701fbbc780957eb7c6'), ObjectId('6613f2701fbbc780957eb7c7'), ObjectId('6613f2701fbbc780957eb7c8'), ObjectId('6613f2701fbbc780957eb7c9'), ObjectId('6613f2701fbbc780957eb7ca'), ObjectId('6613f2701fbbc780957eb7cb'), ObjectId('6613f2701fbbc780957eb7cc'), ObjectId('6613f2701fbbc780957eb7cd'), ObjectId('6613f2701fbbc780957eb7ce'), ObjectId('6613f2701fbbc780957eb7cf'), ObjectId('6613f2701fbbc780957eb7d0'), ObjectId('6613f2701fbbc780957eb7d1'), ObjectId('6613f2701fbbc780957eb7