In [82]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("JSON Reader").getOrCreate()
df = spark.read.json("Data.json")


In [83]:
df.printSchema() 

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [84]:
df.show(5)  

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [85]:
from pyspark.sql.functions import col

df = df.select("reviewText", "overall")
df = df.na.drop(subset=["reviewText", "overall"])

In [86]:
df.show(5)  

+--------------------+-------+
|          reviewText|overall|
+--------------------+-------+
|Not much to write...|    5.0|
|The product does ...|    5.0|
|The primary job o...|    5.0|
|Nice windscreen p...|    5.0|
|This pop filter i...|    5.0|
+--------------------+-------+
only showing top 5 rows



In [87]:
from pyspark.sql.functions import when, col

df = df.withColumn(
    "label",
    when(col("overall") < 3, 0)
    .when(col("overall") == 3, 1)
    .otherwise(2)
)

In [88]:
df.show(5)

+--------------------+-------+-----+
|          reviewText|overall|label|
+--------------------+-------+-----+
|Not much to write...|    5.0|    2|
|The product does ...|    5.0|    2|
|The primary job o...|    5.0|    2|
|Nice windscreen p...|    5.0|    2|
|This pop filter i...|    5.0|    2|
+--------------------+-------+-----+
only showing top 5 rows



In [89]:
import re
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if text is None:
        return ""
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

clean_text_udf = udf(clean_text, StringType())

df = df.withColumn("clean_text", clean_text_udf(col("reviewText")))


[nltk_data] Downloading package stopwords to /nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [70]:
df.show(5)

[Stage 830:>                                                        (0 + 1) / 1]

+--------------------+-------+-----+--------------------+
|          reviewText|overall|label|          clean_text|
+--------------------+-------+-----+--------------------+
|Not much to write...|    5.0|    2|much write exactl...|
|The product does ...|    5.0|    2|product exactly q...|
|The primary job o...|    5.0|    2|primary job devic...|
|Nice windscreen p...|    5.0|    2|nice windscreen p...|
|This pop filter i...|    5.0|    2|pop filter great ...|
+--------------------+-------+-----+--------------------+
only showing top 5 rows



                                                                                

In [71]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="clean_text", outputCol="words")

remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=5000)

# 4. IDF : calcule le TF-IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [72]:
words_data = tokenizer.transform(df)
filtered_data = remover.transform(words_data)
featurized_data = hashingTF.transform(filtered_data)
idf_model = idf.fit(featurized_data)
df_tfidf = idf_model.transform(featurized_data)

                                                                                

In [73]:
train_full, test = df_tfidf.randomSplit([0.8, 0.2], seed=42)
train, val = train_full.randomSplit([0.875, 0.125], seed=42)

In [74]:
from pyspark.sql.functions import col, rand

print("Avant sur-échantillonnage (train) :")
train.groupBy("label").count().show()

counts = train.groupBy("label").count().collect()
count_dict = {row["label"]: row["count"] for row in counts}

max_count = max(count_dict.values())

resampled = None

for label_val, count in count_dict.items():
    subset = train.filter(col("label") == label_val)
    if count < max_count:
        ratio = max_count / count
        sampled_subset = subset.sample(withReplacement=True, fraction=ratio)
    else:
        sampled_subset = subset

    if resampled is None:
        resampled = sampled_subset
    else:
        resampled = resampled.union(sampled_subset)

resampled = resampled.orderBy(rand())

print("Après sur-échantillonnage (train) :")
resampled.groupBy("label").count().show()


Avant sur-échantillonnage (train) :


                                                                                

+-----+-----+
|label|count|
+-----+-----+
|    1|  552|
|    2| 6385|
|    0|  334|
+-----+-----+



                                                                                

Après sur-échantillonnage (train) :




+-----+-----+
|label|count|
+-----+-----+
|    1| 6413|
|    2| 6385|
|    0| 6458|
+-----+-----+



                                                                                

In [75]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label")

# Entraîner le modèle sur la base d'entraînement équilibrée
lr_model = lr.fit(resampled)

# Faire des prédictions sur la base de test
predictions = lr_model.transform(test)

# Afficher quelques résultats de la prédiction
predictions.select("features", "label", "prediction", "probability").show(5)


[Stage 988:>                                                        (0 + 1) / 1]

+--------------------+-----+----------+--------------------+
|            features|label|prediction|         probability|
+--------------------+-----+----------+--------------------+
|(5000,[19,29,55,1...|    1|       0.0|[1.0,2.7448151071...|
|(5000,[68,77,78,1...|    2|       2.0|[1.08523761190019...|
|(5000,[40,133,157...|    2|       2.0|[4.92412163579333...|
|(5000,[133,222,28...|    1|       2.0|[3.64215306091366...|
|(5000,[5,122,594,...|    2|       1.0|[4.20773674635239...|
+--------------------+-----+----------+--------------------+
only showing top 5 rows



                                                                                

In [76]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision (pondérée): {precision:.4f}")
print(f"Recall (pondéré): {recall:.4f}")




Accuracy: 0.8088
F1 Score: 0.8149
Precision (pondérée): 0.8220
Recall (pondéré): 0.8088


                                                                                

In [77]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Prédictions
predictions = lr_model.transform(test)

# Matrice de confusion
confusion_df = predictions.groupBy("label", "prediction").count().orderBy("label", "prediction")
confusion_df.show()




+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|   12|
|    0|       1.0|   18|
|    0|       2.0|   59|
|    1|       0.0|   10|
|    1|       1.0|   33|
|    1|       2.0|   99|
|    2|       0.0|   53|
|    2|       1.0|  141|
|    2|       2.0| 1562|
+-----+----------+-----+



                                                                                

In [90]:
tokenizer = Tokenizer(inputCol="clean_text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=5000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Création du pipeline complet
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

In [92]:
for colname in ["words", "filtered_words", "rawFeatures", "features"]:
    if colname in resampled.columns:
        resampled = resampled.drop(colname)


In [93]:
pipeline_model = pipeline.fit(resampled) 



In [95]:
pipeline_model.save("review_sentiment_pipeline")