In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("JSON Reader")
    .getOrCreate()
)

# Load the raw reviews from the JSON file into a DataFrame.
df = spark.read.format("json").load("Data.json")

In [2]:
# Print the column names and types of the loaded reviews.
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [3]:
# Preview the first five rows of the raw reviews.
df.show(n=5)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [4]:
# Roughly 80/10/10 split: hold out ~20% first, then halve the held-out part.
# Both splits use a fixed seed.
train, rest = df.randomSplit(weights=[0.8, 0.2], seed=42)
val, test = rest.randomSplit(weights=[0.5, 0.5], seed=42)

# Keep only the four columns that are shown and exported for the validation set.
val = val.select(["asin", "reviewText", "reviewTime", "reviewerName"])

In [6]:
# Preview the first five rows of the validation subset.
val.show(n=5)

+----------+--------------------+-----------+--------------------+
|      asin|          reviewText| reviewTime|        reviewerName|
+----------+--------------------+-----------+--------------------+
|B00005ML71|I got it to have ...|04 22, 2014|       Christopher C|
|B000068NVI|I've used a lot o...|09 17, 2013|R. Wristen "The P...|
|B000068NW5|I am not hard on ...| 06 8, 2013|           Dr. Freud|
|B000068NW5|Bought this for m...| 03 5, 2014|            C. Zemer|
|B000068NW5|This is good cabl...| 10 8, 2013|     grandpa "Randy"|
+----------+--------------------+-----------+--------------------+
only showing top 5 rows



In [5]:
import shutil
import os

# Export the validation set as ONE plain JSON-lines file named
# "data_validation.json" (Spark itself always writes a directory of part files).

# 1. Collapse to a single partition so Spark writes a single part file
val_single = val.coalesce(1)

# 2. Write into a temporary directory
temp_path = "temp_validation_output"
val_single.write.mode("overwrite").json(temp_path)

# 3. Locate the JSON part file generated by Spark. Fail with a clear message
#    when there is none, instead of hitting a NameError at the move below (or
#    silently reusing a stale `source_file` left in the kernel by an earlier run).
source_file = next(
    (
        os.path.join(temp_path, file_name)
        for file_name in os.listdir(temp_path)
        if file_name.endswith(".json")
    ),
    None,
)
if source_file is None:
    raise FileNotFoundError(f"No .json part file found in {temp_path!r}")

# 4. Move that file to its final name, "data_validation.json"
shutil.move(source_file, "data_validation.json")

# 5. Remove the temporary directory
shutil.rmtree(temp_path)

print("Fichier sauvegardé : data_validation.json")

Fichier sauvegardé : data_validation.json


In [4]:
from pyspark.sql.functions import col

# Keep only the review text and its rating, and discard rows where either
# of the two is missing.
df = (
    df.select("reviewText", "overall")
    .dropna(subset=["reviewText", "overall"])
)

In [5]:
# Preview the first five rows after column selection and null removal.
df.show(n=5)

+--------------------+-------+
|          reviewText|overall|
+--------------------+-------+
|Not much to write...|    5.0|
|The product does ...|    5.0|
|The primary job o...|    5.0|
|Nice windscreen p...|    5.0|
|This pop filter i...|    5.0|
+--------------------+-------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import when, col

# Derive a 3-class label from the rating:
#   overall <  3 -> 0
#   overall == 3 -> 1
#   otherwise    -> 2
overall_rating = col("overall")
label_expr = when(overall_rating < 3, 0).when(overall_rating == 3, 1).otherwise(2)

df = df.withColumn("label", label_expr)

In [9]:
# Preview the first five rows, now including the derived label column.
df.show(n=5)

+--------------------+-------+-----+
|          reviewText|overall|label|
+--------------------+-------+-----+
|Not much to write...|    5.0|    2|
|The product does ...|    5.0|    2|
|The primary job o...|    5.0|    2|
|Nice windscreen p...|    5.0|    2|
|This pop filter i...|    5.0|    2|
+--------------------+-------+-----+
only showing top 5 rows



In [12]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.0-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m796.9/796.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click-8.2.0-py3-none-any.whl (102 kB)
Downloading joblib-1.5.0-py3-none-any.whl (307 kB)
Downloading tqd

In [13]:
import re
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK corpora used by the text cleaner below.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Module-level helpers read by `clean_text`: the lemmatizer and the set of
# English stop words.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, stop words are
    removed and each remaining token is lemmatized.

    Args:
        text: Raw review text, or None.

    Returns:
        The cleaned tokens joined by single spaces; "" when `text` is None.
    """
    if text is None:
        return ""
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )
    return ' '.join(lemmas)

# Wrap the Python cleaner as a Spark UDF that returns a string column.
clean_text_udf = udf(clean_text, returnType=StringType())

# Add the cleaned review text as a new "clean_text" column.
df = df.withColumn(
    colName="clean_text",
    col=clean_text_udf(col("reviewText")),
)


[nltk_data] Downloading package stopwords to /nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /nltk_data...


In [14]:
# Preview the first five rows, now including the clean_text column.
df.show(n=5)

[Stage 5:>                                                          (0 + 1) / 1]

+--------------------+-------+-----+--------------------+
|          reviewText|overall|label|          clean_text|
+--------------------+-------+-----+--------------------+
|Not much to write...|    5.0|    2|much write exactl...|
|The product does ...|    5.0|    2|product exactly q...|
|The primary job o...|    5.0|    2|primary job devic...|
|Nice windscreen p...|    5.0|    2|nice windscreen p...|
|This pop filter i...|    5.0|    2|pop filter great ...|
+--------------------+-------+-----+--------------------+
only showing top 5 rows



                                                                                

In [16]:
pip install numpy 

Collecting numpy
  Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-2.2.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [17]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# Feature-extraction stages, each reading the previous stage's output column.

# 1. Split the cleaned text into words
tokenizer = Tokenizer(
    inputCol="clean_text",
    outputCol="words",
)

# 2. Remove stop words from the word list
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)

# 3. Hash the remaining words into a 5000-dimensional term-frequency vector
hashingTF = HashingTF(
    inputCol="filtered_words",
    outputCol="rawFeatures",
    numFeatures=5000,
)

# 4. IDF: rescale the raw term frequencies into TF-IDF features
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [18]:
# Apply the feature stages one after another to the whole DataFrame.
words_data = tokenizer.transform(dataset=df)
filtered_data = remover.transform(dataset=words_data)
featurized_data = hashingTF.transform(dataset=filtered_data)

# NOTE(review): the IDF model is fitted on the full dataset here, i.e. before
# the train/val/test split performed in the following cell, so validation and
# test rows contribute to the IDF weights. Confirm this is intended.
idf_model = idf.fit(dataset=featurized_data)
df_tfidf = idf_model.transform(dataset=featurized_data)

                                                                                

In [19]:
# Roughly 80/10/10 split of the TF-IDF data: hold out ~20% first, then halve
# the held-out part into validation and test. Both splits use a fixed seed.
train, rest = df_tfidf.randomSplit(weights=[0.8, 0.2], seed=42)
val, test = rest.randomSplit(weights=[0.5, 0.5], seed=42)

In [20]:
import shutil
import os

# Export the validation set as ONE plain JSON-lines file named
# "data_validation.json" (Spark itself always writes a directory of part files).

# 1. Collapse to a single partition so Spark writes a single part file
val_single = val.coalesce(1)

# 2. Write into a temporary directory
temp_path = "temp_validation_output"
val_single.write.mode("overwrite").json(temp_path)

# 3. Locate the JSON part file generated by Spark. Fail with a clear message
#    when there is none, instead of hitting a NameError at the move below (or
#    silently reusing a stale `source_file` left in the kernel by an earlier run).
source_file = next(
    (
        os.path.join(temp_path, file_name)
        for file_name in os.listdir(temp_path)
        if file_name.endswith(".json")
    ),
    None,
)
if source_file is None:
    raise FileNotFoundError(f"No .json part file found in {temp_path!r}")

# 4. Move that file to its final name, "data_validation.json"
shutil.move(source_file, "data_validation.json")

# 5. Remove the temporary directory
shutil.rmtree(temp_path)

print("✅ Fichier sauvegardé : data_validation.json")


                                                                                

✅ Fichier sauvegardé : data_validation.json


In [21]:
from pyspark.sql.functions import col, rand

# Oversample the minority classes of the training set (sampling with
# replacement) so that every label has roughly as many rows as the largest one.

# Fixed seed so the oversampled training set is the same on every run.
RESAMPLE_SEED = 42

# Group once and reuse the same DataFrame for the display and the ratios below.
label_counts = train.groupBy("label").count()

print("Avant sur-échantillonnage (train) :")
label_counts.show()

counts = label_counts.collect()
count_dict = {row["label"]: row["count"] for row in counts}

# Target size: the row count of the most frequent label.
max_count = max(count_dict.values())

resampled = None

for label_val, count in count_dict.items():
    subset = train.filter(col("label") == label_val)
    if count < max_count:
        # With replacement, `fraction` is the expected number of copies of each
        # row, so the resulting class size is approximate, not exact.
        ratio = max_count / count
        sampled_subset = subset.sample(
            withReplacement=True, fraction=ratio, seed=RESAMPLE_SEED
        )
    else:
        sampled_subset = subset

    if resampled is None:
        resampled = sampled_subset
    else:
        resampled = resampled.union(sampled_subset)

# Shuffle the rows, again with a fixed seed.
resampled = resampled.orderBy(rand(RESAMPLE_SEED))

print("Après sur-échantillonnage (train) :")
resampled.groupBy("label").count().show()


Avant sur-échantillonnage (train) :


                                                                                

+-----+-----+
|label|count|
+-----+-----+
|    1|  630|
|    2| 7266|
|    0|  378|
+-----+-----+



                                                                                

Après sur-échantillonnage (train) :




+-----+-----+
|label|count|
+-----+-----+
|    1| 7151|
|    2| 7266|
|    0| 7347|
+-----+-----+



                                                                                

In [131]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression reading the "features" column and predicting "label".
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Train the model on the balanced (oversampled) training set
lr_model = lr.fit(dataset=resampled)

# Predict on the test set
predictions = lr_model.transform(dataset=test)

# Show a few prediction rows
predictions.select(["features", "label", "prediction", "probability"]).show(n=5)


[Stage 1554:>                                                       (0 + 1) / 1]

+--------------------+-----+----------+--------------------+
|            features|label|prediction|         probability|
+--------------------+-----+----------+--------------------+
|(5000,[19,29,55,1...|    1|       0.0|[0.84093359094043...|
|(5000,[68,77,78,1...|    2|       1.0|[1.13465150493717...|
|(5000,[40,133,157...|    2|       2.0|[6.93480851034739...|
|(5000,[5,122,594,...|    2|       1.0|[3.35821641319260...|
|(5000,[12,34,117,...|    0|       2.0|[5.45597832752086...|
+--------------------+-----+----------+--------------------+
only showing top 5 rows



                                                                                

In [132]:
# Imported here because this cell is the first one to use the evaluator; without
# it the cell raises NameError on a fresh kernel (Restart & Run All).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator instance; the metric is selected per call through a param map.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision (pondérée): {precision:.4f}")
print(f"Recall (pondéré): {recall:.4f}")




Accuracy: 0.7933
F1 Score: 0.8037
Precision (pondérée): 0.8151
Recall (pondéré): 0.7933


                                                                                

In [133]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Predictions on the test set
predictions = lr_model.transform(dataset=test)

# Confusion matrix in long form: one row per (label, prediction) pair with its
# row count, sorted by label and then by prediction.
confusion_df = (
    predictions
    .groupBy("label", "prediction")
    .count()
    .sort("label", "prediction")
)
confusion_df.show()




+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|    9|
|    0|       1.0|    5|
|    0|       2.0|   28|
|    1|       0.0|    6|
|    1|       1.0|   13|
|    1|       2.0|   54|
|    2|       0.0|   29|
|    2|       1.0|   81|
|    2|       2.0|  757|
+-----+----------+-----+



                                                                                

In [134]:
# Fresh, unfitted stages with the same column wiring as the manual steps:
# clean_text -> words -> filtered_words -> rawFeatures -> features -> model.
tokenizer = Tokenizer(
    inputCol="clean_text",
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)
hashingTF = HashingTF(
    inputCol="filtered_words",
    outputCol="rawFeatures",
    numFeatures=5000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

# Build the complete pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

In [135]:
# Remove the intermediate/feature columns that the pipeline stages produce
# themselves. DataFrame.drop ignores names that are not present, so no
# membership check is needed.
resampled = resampled.drop("words", "filtered_words", "rawFeatures", "features")


In [136]:
# Fit every pipeline stage, ending with the classifier, on the balanced training set.
pipeline_model = pipeline.fit(dataset=resampled)



In [138]:
# Persist the fitted pipeline. Go through the writer with overwrite() so that
# re-running this cell replaces an existing "review_sentiment" directory instead
# of failing because the path already exists.
pipeline_model.write().overwrite().save("review_sentiment")

                                                                                