In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
import os

ModuleNotFoundError: No module named 'numpy'

# 1. Start Spark session

In [None]:
# Build the SparkSession used by the rest of the notebook; getOrCreate()
# hands back an already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Amazon Sentiment Classification")
    .getOrCreate()
)


# 2. Load JSON Lines data (each line = 1 JSON object)

In [None]:
# Location of the training reviews file.
data_path = "../Data/train_data.json"

# Read the file through Spark's JSON data source into a DataFrame.
df = spark.read.format("json").load(data_path)

# 3. Select required fields

In [None]:
# Narrow the frame to the two columns the later cells use:
# the review text and the "overall" rating.
df = df.select(col("reviewText"), col("overall"))

# 4. Map 'overall' to sentiment label (0: Negative, 1: Neutral, 2: Positive)

In [None]:
# Derive the 3-class sentiment label from the "overall" rating:
#   overall <  3 -> 0 (Negative)
#   overall == 3 -> 1 (Neutral)
#   overall >  3 -> 2 (Positive)
#
# The positive class is matched explicitly instead of via .otherwise(2).
# A comparison against a null rating is never true, so .otherwise(2) would
# label reviews with a missing rating as Positive. With no .otherwise(),
# unmatched rows get a null label, which the subsequent dropna on "label"
# removes.
df = df.withColumn(
    "label",
    when(col("overall") < 3, 0)
    .when(col("overall") == 3, 1)
    .when(col("overall") > 3, 2),
)

# 5. Drop rows with a missing review text or label

In [None]:
# Discard every row in which the review text or the label is null.
required_columns = ["reviewText", "label"]
df = df.na.drop(subset=required_columns)

# 6. Text preprocessing stages (tokenize → remove stop words → TF-IDF)

In [None]:
# Feature-extraction stages. Each stage reads the column produced by the
# previous one: reviewText -> words -> filtered -> rawFeatures -> features.

# Split the raw review text into a list of tokens.
tokenizer = Tokenizer(
    inputCol="reviewText",
    outputCol="words",
)

# Remove stop words from the token list.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# Hash the remaining tokens into a 10000-dimensional term-frequency vector.
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=10000,
)

# Apply inverse-document-frequency weighting to the term frequencies.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

# 7. Logistic Regression

In [None]:
# Classifier: logistic regression over the "features" vector, predicting
# the "label" column, limited to 20 optimizer iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
)

# 8. Pipeline

In [None]:
# Chain preprocessing and the classifier into one estimator; stages run in
# the order listed.
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        hashingTF,
        idf,
        lr,
    ]
)

# 9. Train model

In [None]:
# Fit all pipeline stages on df and keep the resulting fitted model.
# NOTE(review): df is used in full here; no train/validation split is visible
# in this notebook, so the model is not evaluated before being saved.
model = pipeline.fit(dataset=df)

# 10. Save model

In [None]:
# Persist the fitted pipeline model.
# overwrite() tells the writer to replace any existing output at model_path,
# so no manual os.path.exists / shutil.rmtree pre-delete is needed. That
# pre-delete only ever acted on the local filesystem, regardless of which
# filesystem Spark resolves the path against.
model_path = "../Models/SentimentModel_v0"
model.write().overwrite().save(model_path)

print("✅ Model trained and saved successfully!")

# 11. Stop Spark

In [None]:
# Stop the Spark session created at the top of the notebook now that the
# model has been trained and saved.
spark.stop()