In [70]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# Step 1: Initialize SparkSession
spark = SparkSession.builder.appName("YTSentAnal2").getOrCreate()

# Step 2: Define Dataset Paths
file_paths = {
    "LoganPaul": "LoganPaul.csv",
    "OKGO": "OKGO.csv",
    "RoyalWedding": "RoyalWedding.csv",
    "TaylorSwift": "TaylorSwift.csv",
    "Trump": "trump.csv",
}


In [71]:
# Load and clean data
def load_and_clean(file_path, delimiter=","):
    schema = StructType([
        StructField("label", IntegerType(), True),
        StructField("text", StringType(), True)
    ])
    df = spark.read.option("header", "false").option("sep", delimiter).schema(schema).csv(file_path)
    return df.filter((col("text").isNotNull()) & (col("label").isNotNull()))

datasets = [load_and_clean(file_path, delimiter=";" if name == "OKGO" else ",") for name, file_path in file_paths.items()]

# Combine all datasets into one DataFrame
combined_df = spark.createDataFrame([], schema=datasets[0].schema)
for df in datasets:
    combined_df = combined_df.union(df)

# Verify label column distribution
combined_df.groupBy("label").count().show()

# Check schema to confirm proper column names
combined_df.printSchema()

+-----+-----+
|label|count|
+-----+-----+
|   -1|  780|
|    1|  818|
|    0| 1238|
+-----+-----+

root
 |-- label: integer (nullable = true)
 |-- text: string (nullable = true)


                                                                                

In [72]:
# Text Normalization with NLTK
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK data
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")

# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Define preprocessing UDF
def preprocess_text(tokens):
    processed_tokens = []
    for word in tokens:
        word = word.lower()
        word = re.sub(r'[^\w\s]', '', word)  # Remove special characters
        word = re.sub(r'\d+', '', word)  # Remove digits
        if word not in stop_words:  # Remove stopwords
            lemmatized_word = lemmatizer.lemmatize(word)  # Lemmatize word
            processed_tokens.append(lemmatized_word)
    return processed_tokens

preprocess_udf = udf(preprocess_text, ArrayType(StringType()))

# Tokenization
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
tokenized_df = tokenizer.transform(combined_df)

# Apply UDF for normalization and lemmatization
normalized_df = tokenized_df.withColumn(
    "filtered_tokens", preprocess_udf(col("tokens"))
)

# Filter out empty tokenized rows
normalized_df = normalized_df.filter(col("filtered_tokens").isNotNull())

# Select only required columns
refined_preprocessed_df = normalized_df.select("filtered_tokens", "label")

# Verify the updated normalization
refined_preprocessed_df.printSchema()
refined_preprocessed_df.show(5, truncate=False)

[nltk_data] Downloading package wordnet to /Users/ammaar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ammaar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ammaar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


root
 |-- filtered_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: integer (nullable = true)




+--------------------------------------------------------------------------------------------------------------------------------------+-----+
|filtered_tokens                                                                                                                       |label|
+--------------------------------------------------------------------------------------------------------------------------------------+-----+
|[wow, heard, guy, easily, insecure, douche, ever, seen, youtube, clearly, mental, issue, need, evaluated, give, guy, help, need, asap]|1    |
|[japanese, trying, respectful, lo, gan, logan, care, wtf]                                                                             |-1   |
|[prick]                                                                                                                               |-1   |
|[think, weed, cry]                                                                                                                    |-1   |

                                                                                

In [73]:
# Class Weighting
from pyspark.sql.functions import when

# Add weights to the dataset
class_weights = {
    -1: 1.0 / 780,  
    0: 1.0 / 818,
    1: 1.0 / 1238
}

# Add weights to the dataset
balanced_training_data = refined_preprocessed_df.withColumn(
    "weight",
    when(col("label") == -1, class_weights[-1])
    .when(col("label") == 0, class_weights[0])
    .when(col("label") == 1, class_weights[1])
)

# Verify the added weights
balanced_training_data.groupBy("label").agg({"weight": "avg"}).show()



+-----+--------------------+
|label|         avg(weight)|
+-----+--------------------+
|   -1|0.001282051282051...|
|    1|8.077544426494289E-4|
|    0|0.001222493887530...|
+-----+--------------------+


                                                                                

In [74]:
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml import Pipeline

count_vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="raw_features", vocabSize=10000)

# Apply IDF for scaling feature vectors
idf = IDF(inputCol="raw_features", outputCol="features")

# Create a feature extraction pipeline
feature_pipeline = Pipeline(stages=[count_vectorizer, idf])

# Fit and transform the pipeline on the weighted dataset
feature_model = feature_pipeline.fit(balanced_training_data)
featured_df = feature_model.transform(balanced_training_data)

# Select necessary columns for training
final_training_data = featured_df.select("features", "label", "weight")

# Verify the resulting dataset
final_training_data.printSchema()
final_training_data.show(5, truncate=False)

                                                                                

root
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)
 |-- weight: double (nullable = true)




+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+---------------------+
|features                                                                                                                                                                                                                                                                                                                                                                                       |label|weight               |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [75]:
# Label Remapping and Data Splitting

from pyspark.sql.functions import when

# Remap labels: -1 -> 0, 0 -> 1, 1 -> 2 for ML since -1 cannot b used 4 training
final_training_data = final_training_data.withColumn(
    "label",
    when(col("label") == -1, 0)
    .when(col("label") == 0, 1)
    .when(col("label") == 1, 2)
)


In [77]:
# Log regression model training

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define Logistic Regression model with weights
lr = LogisticRegression(featuresCol="features", labelCol="label", weightCol="weight", maxIter=20)

# Define parameter grid for tuning
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# Initialize k-fold CrossValidator
cross_validator = CrossValidator(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy"),
    numFolds=5  # 5-fold cross-validation
)

# Train Logistic Regression with cross-validation
cv_model = cross_validator.fit(final_training_data)

# Extract the best model
best_model = cv_model.bestModel
print(f"Best Model Parameters: RegParam={best_model._java_obj.getRegParam()}, ElasticNetParam={best_model._java_obj.getElasticNetParam()}")

# Save the best model for reuse
best_model.write().overwrite().save("logistic_regression_best_model")

                                                                                

Best Model Parameters: RegParam=0.01, ElasticNetParam=0.5


In [78]:
# Evaluate the best model on the entire dataset
test_predictions = best_model.transform(final_training_data)

# Evaluate test accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
test_accuracy = evaluator.evaluate(test_predictions)
print(f"Cross-Validated Test Accuracy: {test_accuracy}")

# Evaluate precision, recall, and F1-score for each class
labels = [0, 1, 2]  
for label in labels:
    precision = evaluator.evaluate(test_predictions, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: label})
    recall = evaluator.evaluate(test_predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: label})
    f1 = evaluator.evaluate(test_predictions, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: label})
    print(f"Class {label}: Precision = {precision}, Recall = {recall}, F1-Score = {f1}")

                                                                                

Cross-Validated Test Accuracy: 0.900916784203103


                                                                                

Class 0: Precision = 0.9605633802816902, Recall = 0.8743589743589744, F1-Score = 0.9154362416107382


                                                                                

Class 1: Precision = 0.8342541436464088, Recall = 0.975767366720517, F1-Score = 0.8994787788533134




Class 2: Precision = 0.9808259587020649, Recall = 0.812958435207824, F1-Score = 0.8890374331550803


                                                                                

In [79]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Create a confusion matrix DataFrame
confusion_matrix_df = (
    test_predictions
    .groupBy("label", "prediction")
    .agg(F.count("*").alias("count"))
    .orderBy("label", "prediction")
)

# Collect the confusion matrix for display
confusion_matrix = confusion_matrix_df.collect()

# Print confusion matrix
print("Confusion Matrix:")
for row in confusion_matrix:
    print(f"Label {int(row['label'])} Predicted as {int(row['prediction'])}: {row['count']}")

                                                                                

Confusion Matrix:
Label 0 Predicted as 0: 682
Label 0 Predicted as 1: 95
Label 0 Predicted as 2: 3
Label 1 Predicted as 0: 20
Label 1 Predicted as 1: 1208
Label 1 Predicted as 2: 10
Label 2 Predicted as 0: 8
Label 2 Predicted as 1: 145
Label 2 Predicted as 2: 665
