In [14]:
import pandas as pd
import numpy as np
import spacy
import re
import os
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [15]:
# Make sure the data directory exists
os.makedirs("../data", exist_ok=True)

print("Loading data...")
# Charger les données
data = pd.read_json("../data/reviews.json", lines=True)


Loading data...


In [16]:
# Afficher des informations sur les données
print(f"Données chargées: {data.shape[0]} critiques")
print("\nAperçu des données:")
print(data.head(2))

print("\nTypes des colonnes:")
print(data.dtypes)

print("\nStatistiques sur les notes (overall):")
print(data['overall'].describe())
print("\nDistribution des notes:")
print(data['overall'].value_counts().sort_index())

print("\nVérification des valeurs nulles:")
print(data.isnull().sum())


Données chargées: 10261 critiques

Aperçu des données:
       reviewerID        asin  \
0  A2IBPI20UZIR0U  1384719342   
1  A14VAT5EAX3D9S  1384719342   

                                       reviewerName   helpful  \
0  cassandra tu "Yeah, well, that's just like, u...    [0, 0]   
1                                              Jake  [13, 14]   

                                          reviewText  overall summary  \
0  Not much to write about here, but it does exac...        5    good   
1  The product does exactly as it should and is q...        5    Jake   

   unixReviewTime   reviewTime  
0      1393545600  02 28, 2014  
1      1363392000  03 16, 2013  

Types des colonnes:
reviewerID        object
asin              object
reviewerName      object
helpful           object
reviewText        object
overall            int64
summary           object
unixReviewTime     int64
reviewTime        object
dtype: object

Statistiques sur les notes (overall):
count    10261.000000
mean     

In [17]:
# Nettoyage du texte
print("\nNettoyage du texte...")
def clean_text(text):
    # Convert to string in case we have non-string inputs
    text = str(text)
    # Remove non-alphabetic characters (except spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data['cleaned_text'] = data['reviewText'].apply(clean_text)



Nettoyage du texte...


In [18]:
# Check for empty text after cleaning
empty_text_count = (data['cleaned_text'] == '').sum()
print(f"Textes vides après nettoyage: {empty_text_count}")
if empty_text_count > 0:
    # Replace empty text with a placeholder
    data['cleaned_text'] = data['cleaned_text'].replace('', 'no text available')


Textes vides après nettoyage: 7


In [19]:
# Lemmatisation avec spaCy
print("\nChargement du modèle spaCy...")
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Téléchargement du modèle spaCy...")
    import subprocess
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

print("Lemmatisation des textes...")
def lemmatize(text):
    # Limit text length for processing efficiency
    text = text[:100000]  # Limit to first 100K chars to avoid memory issues
    doc = nlp(text)
    # Get lemmas for tokens that aren't stop words
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    if not lemmas:  # If all tokens were stop words
        return text  # Return original cleaned text
    return ' '.join(lemmas)



Chargement du modèle spaCy...
Lemmatisation des textes...


In [20]:
# Apply lemmatization with error handling
print("Application de la lemmatisation...")
lemmatized_texts = []
for i, text in enumerate(data['cleaned_text']):
    try:
        lemmatized = lemmatize(text)
        lemmatized_texts.append(lemmatized)
        if i % 1000 == 0:
            print(f"Traitement: {i}/{len(data)}")
    except Exception as e:
        print(f"Erreur lors de la lemmatisation de l'index {i}: {e}")
        lemmatized_texts.append(text)  # Use cleaned text as fallback

data['lemmatized_text'] = lemmatized_texts


Application de la lemmatisation...
Traitement: 0/10261
Traitement: 1000/10261
Traitement: 2000/10261
Traitement: 3000/10261
Traitement: 4000/10261
Traitement: 5000/10261
Traitement: 6000/10261
Traitement: 7000/10261
Traitement: 8000/10261
Traitement: 9000/10261
Traitement: 10000/10261


In [21]:
# Définir la classe cible
print("\nDéfinition des classes de sentiment...")
def label(overall):
    # Ensure overall is a number
    try:
        overall = float(overall)
    except (ValueError, TypeError):
        return None  # Return None for invalid values
        
    if overall < 3:
        return 0  # négatif
    elif overall == 3:
        return 1  # neutre
    else:
        return 2  # positif

data['label'] = data['overall'].apply(label)



Définition des classes de sentiment...


In [22]:
# Verify there are no None values in label
null_labels = data['label'].isnull().sum()
if null_labels > 0:
    print(f"ATTENTION: {null_labels} étiquettes nulles trouvées!")
    print("Suppression des lignes avec des étiquettes nulles...")
    data = data.dropna(subset=['label'])


In [23]:
# Ensure label is integer type
data['label'] = data['label'].astype(int)

print("\nDistribution des classes de sentiment:")
print(data['label'].value_counts().sort_index())


Distribution des classes de sentiment:
label
0     467
1     772
2    9022
Name: count, dtype: int64


In [24]:
# Vectorisation TF-IDF
print("\nVectorisation TF-IDF...")
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['lemmatized_text'])

print(f"Forme du corpus vectorisé: {X.shape}")



Vectorisation TF-IDF...
Forme du corpus vectorisé: (10261, 5000)


In [26]:
# Sauvegarde pour entraînement futur
print("\nDécoupage des données en train/val/test...")
y = data['label']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

print(f"Ensemble d'entraînement: {X_train.shape[0]} échantillons")
print(f"Ensemble de validation: {X_val.shape[0]} échantillons")
print(f"Ensemble de test: {X_test.shape[0]} échantillons")

# Sauvegardes
print("\nSauvegarde des données traitées...")

# Sauvegarde pour PySpark/ML
print("Sauvegarde en CSV (pour Spark)...")
# Select columns needed for Spark ML
columns_for_spark = ['reviewerID', 'overall', 'lemmatized_text', 'label']
data_for_spark = data[columns_for_spark].copy()



Découpage des données en train/val/test...
Ensemble d'entraînement: 8208 échantillons
Ensemble de validation: 1026 échantillons
Ensemble de test: 1027 échantillons

Sauvegarde des données traitées...
Sauvegarde en CSV (pour Spark)...


In [27]:
# Final verification before saving
print("\nVérification finale avant sauvegarde:")
print("Types des colonnes:")
print(data_for_spark.dtypes)
print("\nAperçu des données à sauvegarder:")
print(data_for_spark.head())



Vérification finale avant sauvegarde:
Types des colonnes:
reviewerID         object
overall             int64
lemmatized_text    object
label               int64
dtype: object

Aperçu des données à sauvegarder:
       reviewerID  overall                                    lemmatized_text  \
0  A2IBPI20UZIR0U        5  write exactly suppose filters pop sound record...   
1  A14VAT5EAX3D9S        5  product exactly affordablei realize double scr...   
2  A195EZSQDW3E21        5  primary job device block breath produce poppin...   
3  A2C00NNG1ZQQG2        5  nice windscreen protect mxl mic prevent pop th...   
4   A94QU4C90B1AX        5  pop filter great look perform like studio filt...   

   label  
0      2  
1      2  
2      2  
3      2  
4      2  


In [28]:
# Save to CSV
data_for_spark.to_csv("../data/cleaned_reviews.csv", index=False)

# Save vectorizer model
print("Sauvegarde du vectoriseur...")
joblib.dump(vectorizer, "../data/vectorizer.pkl")

# Save spaCy model
print("Sauvegarde du modèle spaCy...")
nlp.to_disk("../data/spacy_model")

# Save NumPy arrays
print("Sauvegarde des matrices NumPy...")
np.savez("../data/processed_data.npz", 
         X_train=X_train.toarray(), 
         y_train=y_train,
         X_val=X_val.toarray(), 
         y_val=y_val, 
         X_test=X_test.toarray(), 
         y_test=y_test)


print("\nVérification finale avant sauvegarde:")
print("Types des colonnes:")
print(data_for_spark.dtypes)
print("\nAperçu des données à sauvegarder:")
print(data_for_spark.head())

print("\nTraitement terminé. Données sauvegardées dans le dossier ../data/")

Sauvegarde du vectoriseur...
Sauvegarde du modèle spaCy...
Sauvegarde des matrices NumPy...

Vérification finale avant sauvegarde:
Types des colonnes:
reviewerID         object
overall             int64
lemmatized_text    object
label               int64
dtype: object

Aperçu des données à sauvegarder:
       reviewerID  overall                                    lemmatized_text  \
0  A2IBPI20UZIR0U        5  write exactly suppose filters pop sound record...   
1  A14VAT5EAX3D9S        5  product exactly affordablei realize double scr...   
2  A195EZSQDW3E21        5  primary job device block breath produce poppin...   
3  A2C00NNG1ZQQG2        5  nice windscreen protect mxl mic prevent pop th...   
4   A94QU4C90B1AX        5  pop filter great look perform like studio filt...   

   label  
0      2  
1      2  
2      2  
3      2  
4      2  

Traitement terminé. Données sauvegardées dans le dossier ../data/
