In [None]:
import pandas as pd

In [None]:
# Load the raw reviews; lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json("Data.json", lines=True)

In [None]:
# Prompt: target class -- overall < 3: negative review; overall == 3: neutral review; overall > 3: positive review.
# Define a mapping function and apply it to the `overall` column.

def map_sentiment(overall):
  """Translate a star rating into its sentiment label.

  Ratings below 3 map to "negatif", a rating equal to 3 maps to "neutre",
  and every other value maps to "positif".
  """
  if overall == 3:
    return "neutre"
  return "negatif" if overall < 3 else "positif"

# Derive the textual sentiment label of every review from its star rating.
df['sentiment'] = df['overall'].map(map_sentiment)

# Sanity check: show the rating next to the label derived from it.
print(df.loc[:, ['overall', 'sentiment']].head())

   overall sentiment
0        5   positif
1        5   positif
2        5   positif
3        5   positif
4        5   positif


In [None]:
import random

# Downsample the 'positif' class by discarding 6900 randomly chosen rows.
# A dedicated, seeded generator keeps the selection identical across runs, so
# the exported dataset and every metric computed from it are reproducible.
rng = random.Random(42)
positif_indices = df[df['sentiment'] == 'positif'].index

# random.sample raises ValueError when fewer than 6900 'positif' rows are
# available, e.g. if this cell is re-run on the already reduced `df`.
indices_to_drop = rng.sample(list(positif_indices), k=6900)

df = df.drop(indices_to_drop)

# Show the class balance after downsampling.
print(df['sentiment'].value_counts())

sentiment
positif    2122
neutre      772
negatif     467
Name: count, dtype: int64


In [None]:

# Remove the helper 'sentiment' column before exporting.
df = df.drop('sentiment', axis=1)

# Write the remaining rows as JSON Lines (one record per line).
df.to_json("Data_cleaned.json", orient='records', lines=True)

print("DataFrame without 'sentiment' column:")
print(df.head())
print("\nCleaned data saved to Data_cleaned.json")

DataFrame without 'sentiment' column:
        reviewerID        asin                   reviewerName helpful  \
2   A195EZSQDW3E21  1384719342  Rick Bennette "Rick Bennette"  [1, 1]   
4    A94QU4C90B1AX  1384719342                  SEAN MASLANKA  [0, 0]   
7    AJNFQI3YR6XJ5  B00004Y2UT              Fender Guy "Rick"  [0, 0]   
10  A2NYK9KWFMJV4Y  B00004Y2UT    Mike Tarrani "Jazz Drummer"  [6, 6]   
12  A2NIT6BKW11XJQ  B00005ML71                            Jai  [0, 0]   

                                           reviewText  overall  \
2   The primary job of this device is to block the...        5   
4   This pop filter is great. It looks and perform...        5   
7   I now use this cable to run from the output of...        3   
10  Monster makes a wide array of cables, includin...        5   
12  If you are not use to using a large sustaining...        3   

                                              summary  unixReviewTime  \
2                                It Does The Job Well

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import re

In [None]:
# Create (or reuse) the Spark session with explicit memory and timeout settings.
spark = (
    SparkSession.builder
    .appName("Amazon Reviews Sentiment Analysis")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.python.worker.memory", "512m")
    .config("spark.python.worker.timeout", "600")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "60s")
    .getOrCreate()
)

In [None]:
# Read the cleaned reviews file into a Spark DataFrame.
data_path = "Data_cleaned.json"
reviews_df = spark.read.format("json").load(data_path)

In [None]:
# Encode the star rating as a numeric sentiment code:
#   0 = negative (overall < 3), 1 = neutral (overall == 3), 2 = positive (anything else).
overall_rating = col("overall")
sentiment_code = (
    when(overall_rating < 3, 0)
    .when(overall_rating == 3, 1)
    .otherwise(2)
)
reviews_df = reviews_df.withColumn("sentiment", sentiment_code)

In [None]:
# Text clean-up: discard rows without review text, replace every character that
# is neither an ASCII letter nor whitespace with a space, then lower-case.
reviews_df = (
    reviews_df
    .where(col("reviewText").isNotNull())
    .withColumn(
        "reviewText",
        lower(regexp_replace(col("reviewText"), "[^a-zA-Z\\s]", " ")),
    )
)

In [None]:
# Two-stage split: 80% for training, then the remaining 20% is halved into
# validation and test (roughly 10% each). The same seed is used for both calls.
train_df, temp_df = reviews_df.randomSplit(weights=[0.8, 0.2], seed=42)
val_df, test_df = temp_df.randomSplit(weights=[0.5, 0.5], seed=42)

In [None]:
# Text-processing stages shared by every model pipeline:
# tokens -> stop-word removal -> term counts -> TF-IDF weighted "features".
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", minDF=2.0)
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=2)

# Label stage. StringIndexer orders values by descending frequency by default,
# so "label" 0.0 is the most frequent sentiment value in the training data --
# it is NOT the same numbering as the 0/1/2 codes stored in "sentiment".
sentiment_indexer = StringIndexer(inputCol="sentiment", outputCol="label")

# Classifiers, reading the "features" and "label" columns produced above.
# NOTE(review): no weightCol is configured, so these models are NOT
# class-weighted; the class imbalance in the data is left unhandled here.
lr = LogisticRegression(
    featuresCol="features", labelCol="label",
    maxIter=20, regParam=0.3, elasticNetParam=0)
rf = RandomForestClassifier(
    featuresCol="features", labelCol="label",
    numTrees=100, maxDepth=5, seed=42, minInstancesPerNode=1)
nb = NaiveBayes(
    featuresCol="features", labelCol="label",
    smoothing=1.0, modelType="multinomial")

In [None]:
# One pipeline per classifier; all of them share the same preprocessing and
# label-indexing stage objects and differ only in the final estimator.
lr_pipeline, rf_pipeline, nb_pipeline = (
    Pipeline(stages=[tokenizer, remover, cv, idf, sentiment_indexer, classifier])
    for classifier in (lr, rf, nb)
)

In [None]:
# Fit every pipeline on the training split, announcing each one as it starts.
trained_models = []
for display_name, pipeline in (
    ("Logistic Regression", lr_pipeline),
    ("Random Forest", rf_pipeline),
    ("Naive Bayes", nb_pipeline),
):
    print(f"Training {display_name}...")
    trained_models.append(pipeline.fit(train_df))

lr_model, rf_model, nb_model = trained_models

Training Logistic Regression...
Training Random Forest...
Training Naive Bayes...


In [None]:
# Function to evaluate model
def evaluate_model(model, data, model_name):
    """Score a fitted model on a dataset and print a short report.

    Accuracy alone is not informative on imbalanced classes (always predicting
    the most frequent class already scores that class's share), so the weighted
    F1 score and a label-vs-prediction count table are reported as well.

    Args:
        model: Fitted model; its ``transform`` output must contain the
            "label" and "prediction" columns.
        data: Dataset passed to ``model.transform``.
        model_name: Display name used in the printed report.

    Returns:
        Tuple ``(metrics, predictions)``: ``metrics`` is a dict with the keys
        "accuracy" and "f1"; ``predictions`` is the transformed dataset.
    """
    predictions = model.transform(data)

    # One evaluator per metric, all reading the same label/prediction columns.
    metrics = {}
    for metric_name in ("accuracy", "f1"):
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction", metricName=metric_name)
        metrics[metric_name] = evaluator.evaluate(predictions)
    accuracy = metrics["accuracy"]

    # Distribution of the TRUE labels in the evaluated data (independent of the model).
    class_dist = predictions.groupBy("label").count()
    print(f"\nClass distribution for {model_name}:")
    class_dist.show()

    # Row counts per (true label, predicted label) pair: shows what the model
    # actually predicts, e.g. whether it collapses onto a single class.
    confusion = predictions.groupBy("label", "prediction").count().orderBy("label", "prediction")
    print(f"Label vs. prediction counts for {model_name}:")
    confusion.show()

    print(f"{model_name} Accuracy: {accuracy}")
    print(f"{model_name} F1: {metrics['f1']}")

    return metrics, predictions

In [None]:
# Score every fitted pipeline on the validation split, in a fixed order:
# logistic regression, random forest, naive Bayes.
(lr_metrics, lr_preds), (rf_metrics, rf_preds), (nb_metrics, nb_preds) = [
    evaluate_model(fitted_model, val_df, display_name)
    for fitted_model, display_name in (
        (lr_model, "Logistic Regression"),
        (rf_model, "Random Forest"),
        (nb_model, "Naive Bayes"),
    )
]


Class distribution for Logistic Regression:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  210|
|  1.0|   75|
|  2.0|   41|
+-----+-----+

Logistic Regression Accuracy: 0.6441717791411042

Class distribution for Random Forest:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  210|
|  1.0|   75|
|  2.0|   41|
+-----+-----+

Random Forest Accuracy: 0.6441717791411042

Class distribution for Naive Bayes:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  210|
|  1.0|   75|
|  2.0|   41|
+-----+-----+

Naive Bayes Accuracy: 0.6196319018404908


In [None]:
# Candidate models keyed by display name -> (validation accuracy, fitted pipeline).
models = {
    label: (scores["accuracy"], fitted)
    for label, scores, fitted in (
        ("Logistic Regression", lr_metrics, lr_model),
        ("Random Forest", rf_metrics, rf_model),
        ("Naive Bayes", nb_metrics, nb_model),
    )
}

# Pick the candidate with the highest validation accuracy; on a tie the
# earliest entry wins because only a strictly greater score replaces it.
# An explicit loop is used instead of the builtin max(): the wildcard import
# of pyspark.sql.functions in this notebook rebinds the name `max`.
best_model_name, best_model_accuracy, best_model = None, float('-inf'), None
for name, (accuracy, model) in models.items():
    if not accuracy > best_model_accuracy:
        continue
    best_model_name, best_model_accuracy, best_model = name, accuracy, model

# Report the winner.
print(f"Best model: {best_model_name} with accuracy: {best_model_accuracy}")

Best model: Logistic Regression with accuracy: 0.6441717791411042


In [None]:
# Final check: score the selected model on the held-out test split.
test_metrics, test_preds = evaluate_model(
    model=best_model,
    data=test_df,
    model_name=f"Best Model ({best_model_name})",
)


Class distribution for Best Model (Logistic Regression):
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  184|
|  1.0|   77|
|  2.0|   33|
+-----+-----+

Best Model (Logistic Regression) Accuracy: 0.6258503401360545


In [None]:
# Persist the selected pipeline model, replacing any previous export.
best_model.write().overwrite().save("models/sentiment_model")

# Export the first four stages of the selected pipeline model as a separate
# pipeline model (in the pipelines built in this notebook these are the
# tokenizer, stop-word remover, count vectorizer and IDF stages).
tfidf_stages = best_model.stages[:4]
tfidf_pipeline = Pipeline(stages=tfidf_stages)
tfidf_model = tfidf_pipeline.fit(train_df)
tfidf_model.write().overwrite().save("models/tfidf_model")

In [None]:
# Write each split to Parquet, overwriting earlier exports.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_df.write.mode("overwrite").parquet(f"data/processed/{split_name}_data.parquet")

print("Model training and evaluation completed!")

# Shut the Spark session down now that all outputs are written.
spark.stop()

Model training and evaluation completed!


In [None]:
# NOTE(review): the previous cell already ends with spark.stop(); this second
# call looks redundant -- confirm and consider removing this cell.
spark.stop()