In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import re

In [21]:
# Initialize Spark Session with optimized settings
spark = SparkSession.builder \
    .appName("Amazon Reviews Sentiment Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.python.worker.memory", "512m") \
    .config("spark.python.worker.timeout", "600") \
    .config("spark.network.timeout", "600s") \
    .config("spark.executor.heartbeatInterval", "60s") \
    .getOrCreate()

In [22]:
# Load the data
data_path = "../data/raw/Data.json"
reviews_df = spark.read.json(data_path)

In [23]:
# Create sentiment label based on overall rating
reviews_df = reviews_df.withColumn(
    "sentiment",
    when(col("overall") < 3, 0)  # 0 for negative
    .when(col("overall") == 3, 1)  # 1 for neutral
    .otherwise(2)  # 2 for positive
)

In [24]:
# Basic data cleaning
reviews_df = reviews_df.filter(col("reviewText").isNotNull())
reviews_df = reviews_df.withColumn("reviewText", regexp_replace(col("reviewText"), "[^a-zA-Z\\s]", " "))
reviews_df = reviews_df.withColumn("reviewText", lower(col("reviewText")))

In [25]:
# Split data into training, validation, and test sets (80%, 10%, 10%)
train_df, temp_df = reviews_df.randomSplit([0.8, 0.2], seed=42)
val_df, test_df = temp_df.randomSplit([0.5, 0.5], seed=42)

In [26]:
# Create ML Pipeline for text processing
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", minDF=2.0)
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=2)

# Model training pipeline
sentiment_indexer = StringIndexer(inputCol="sentiment", outputCol="label")

# Create models with balanced class weights
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0, family="multinomial")
rf = RandomForestClassifier(numTrees=100, maxDepth=5, seed=42, minInstancesPerNode=1)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

In [27]:
# Create pipelines for each model
lr_pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, sentiment_indexer, lr])
rf_pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, sentiment_indexer, rf])
nb_pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, sentiment_indexer, nb])

In [28]:
# Train models
print("Training Logistic Regression...")
lr_model = lr_pipeline.fit(train_df)

print("Training Random Forest...")
rf_model = rf_pipeline.fit(train_df)

print("Training Naive Bayes...")
nb_model = nb_pipeline.fit(train_df)

Training Logistic Regression...
Training Random Forest...
Training Naive Bayes...


In [29]:
# Function to evaluate model
def evaluate_model(model, data, model_name):
    predictions = model.transform(data)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Calculate class distribution
    class_dist = predictions.groupBy("label").count()
    print(f"\nClass distribution for {model_name}:")
    class_dist.show()

    metrics = {"accuracy": accuracy}
    print(f"{model_name} Accuracy: {accuracy}")

    return metrics, predictions

In [30]:
# Evaluate models on validation set
lr_metrics, lr_preds = evaluate_model(lr_model, val_df, "Logistic Regression")
rf_metrics, rf_preds = evaluate_model(rf_model, val_df, "Random Forest")
nb_metrics, nb_preds = evaluate_model(nb_model, val_df, "Naive Bayes")


Class distribution for Logistic Regression:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  880|
|  1.0|   78|
|  2.0|   47|
+-----+-----+

Logistic Regression Accuracy: 0.8706467661691543

Class distribution for Random Forest:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  880|
|  1.0|   78|
|  2.0|   47|
+-----+-----+

Random Forest Accuracy: 0.8756218905472637

Class distribution for Naive Bayes:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  880|
|  1.0|   78|
|  2.0|   47|
+-----+-----+

Naive Bayes Accuracy: 0.8119402985074626


In [31]:
# Select best model based on accuracy
models = {
    "Logistic Regression": (lr_metrics["accuracy"], lr_model),
    "Random Forest": (rf_metrics["accuracy"], rf_model),
    "Naive Bayes": (nb_metrics["accuracy"], nb_model)
}

# Initialize variables to track the best model
best_model_name = None
best_model_accuracy = float('-inf')  # Start with negative infinity to ensure any accuracy is higher
best_model = None

# Iterate through models to find the one with highest accuracy
for name, (accuracy, model) in models.items():
    if accuracy > best_model_accuracy:
        best_model_name = name
        best_model_accuracy = accuracy
        best_model = model

# Print the result
print(f"Best model: {best_model_name} with accuracy: {best_model_accuracy}")

Best model: Random Forest with accuracy: 0.8756218905472637


In [32]:
# Evaluate best model on test set
test_metrics, test_preds = evaluate_model(best_model, test_df, f"Best Model ({best_model_name})")


Class distribution for Best Model (Random Forest):
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  860|
|  1.0|   78|
|  2.0|   44|
+-----+-----+

Best Model (Random Forest) Accuracy: 0.8757637474541752


In [33]:
# Save the best model
best_model.write().overwrite().save("models/sentiment_model")

# Extract and save the TF-IDF model separately for use in streaming
tfidf_stages = best_model.stages[0:4]  # Tokenizer, StopWordsRemover, CountVectorizer, IDF
tfidf_pipeline = Pipeline(stages=tfidf_stages)
tfidf_model = tfidf_pipeline.fit(train_df)
tfidf_model.write().overwrite().save("models/tfidf_model")

In [34]:
# Save processed data
train_df.write.mode("overwrite").parquet("data/processed/train_data.parquet")
val_df.write.mode("overwrite").parquet("data/processed/val_data.parquet")
test_df.write.mode("overwrite").parquet("data/processed/test_data.parquet")

print("Model training and evaluation completed!")
spark.stop()

Model training and evaluation completed!


In [None]:
spark.stop()