# ML ANALYSIS


## Setup
We need an available Java installation to run PySpark. The easiest way to do this is to install the JDK and set the proper paths using conda.

In [None]:
# Setup - Run only once per Kernel App
# Install a JDK into the conda environment (the notebook intro notes PySpark needs Java).
%conda install openjdk -y

# install PySpark
%pip install pyspark==3.4.0

# install spark-nlp
%pip install spark-nlp==5.1.3

# restart kernel
# The cell output is a <script> that calls Jupyter.notebook.kernel.restart() in the browser.
# NOTE(review): presumably needed so the fresh installs are picked up; the `Jupyter.notebook`
# JavaScript object may not exist in every front end - confirm for the target environment.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
# !pip install sparknlp

In [None]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark import SparkContext 
from pyspark.sql.functions import col, lower, count, length, unix_timestamp, current_timestamp, to_date, desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
import re
from datetime import datetime
import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from pyspark.sql import functions as F
from scipy.stats import tstd
import nltk
import matplotlib.pyplot as plt
# !pip install plotly
# !pip install wordcloud
import plotly.express as px

# download the nltk stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk_stopwords = set(stopwords.words('english'))

import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import lit
from pyspark.sql import Window
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, HashingTF, StopWordsRemover
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, MapType, StringType
import string
from sparknlp.pretrained import PretrainedPipeline
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel

In [None]:
from pyspark.sql import SparkSession

# Session settings: hadoop-aws for s3a:// access, container-provided AWS credentials,
# and increased driver/executor memory.
_spark_settings = [
    ("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2"),
    ("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.ContainerCredentialsProvider"),
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
    ("spark.driver.maxResultSize", "2g"),
]

_session_builder = SparkSession.builder.appName("PySparkApp")
for _setting_key, _setting_value in _spark_settings:
    _session_builder = _session_builder.config(_setting_key, _setting_value)

# Reuses an already running session if one exists, otherwise starts a new one.
spark = _session_builder.getOrCreate()
sc = spark.sparkContext

# Report which Spark version the session is running.
print("Using Apache Spark Version", spark.version)


In [None]:
import sagemaker
# Resolve the default S3 bucket of the current SageMaker session.
session = sagemaker.Session()
bucket = session.default_bucket()
# Stream the spark-nlp 5.1.3 assembly jar straight from the public URL into the bucket
# (wget writes to stdout, `aws s3 cp -` reads from stdin), then list it to confirm the upload.
# NOTE(review): nothing visible in this notebook adds this jar to the Spark session - confirm it is still needed.
!wget -qO- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-assembly-5.1.3.jar | aws s3 cp - s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar
!aws s3 ls s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar

## Read the filtered data

Now that we have filtered the data to keep only submissions and comments from the subreddits of interest, let us read the data from the S3 path where we saved it.

In [None]:
# List the contents of the project/ prefix in the bucket that the following cells read from.
!aws s3 ls s3://sagemaker-us-east-1-572044129183/project/

In [None]:
# Location of the filtered comments parquet data.
public_bucket = 'sagemaker-us-east-1-572044129183'
output_prefix = 'project/comments/'
s3_input_path_comments = f"s3a://{public_bucket}/{output_prefix}"

# Read the parquet files through the generic reader interface.
comments = spark.read.format("parquet").load(s3_input_path_comments)

# Report rows x columns; count() runs a Spark job over the data.
print(f"shape of the comments dataframe is {comments.count():,}x{len(comments.columns)}")

In [None]:
# Mark the comments DataFrame for caching so the inspection queries below can reuse it.
comments.cache()

In [None]:
# check counts (ensuring all needed subreddits exist)
comments.groupBy('subreddit').count().show()

In [None]:
# Redundant: `comments` was already marked for caching above and has not been reassigned.
comments.cache()

In [None]:
# Print column names and types.
comments.printSchema()

In [None]:
# Redundant repeat of the cache() call above.
comments.cache()

In [None]:
# display a subset of columns
comments.select("subreddit", "author", "body", "parent_id", "link_id", "id", "created_utc", "score").show()

In [None]:
# Redundant repeat of the cache() call above.
comments.cache()

In [None]:
# Location of the filtered submissions parquet data.
public_bucket = 'sagemaker-us-east-1-572044129183'
output_prefix = 'project/submissions/'
s3_input_path_submissions = f"s3a://{public_bucket}/{output_prefix}"

# Read the parquet files through the generic reader interface.
submissions = spark.read.format("parquet").load(s3_input_path_submissions)

# Report rows x columns; count() runs a Spark job over the data.
print(f"shape of the submissions dataframe is {submissions.count():,}x{len(submissions.columns)}")

In [None]:
# Mark the submissions DataFrame for caching so the inspection queries below can reuse it.
submissions.cache()

In [None]:
# check counts (ensuring all needed subreddits exist)
submissions.groupBy('subreddit').count().show()

In [None]:
# Redundant: `submissions` was already marked for caching above and has not been reassigned.
submissions.cache()

In [None]:
# Print column names and types.
submissions.printSchema()

In [None]:
# Redundant repeat of the cache() call above.
submissions.cache()

In [None]:
# display a subset of columns
submissions.select("subreddit", "author", "title", "selftext", "created_utc", "num_comments", "score").show()

In [None]:
# Redundant repeat of the cache() call above.
submissions.cache()

# Filter "AskWomen", "AskFeminists", "Feminism" by STEM Keywords

In [None]:
from pyspark.sql.functions import col, lower

# Subreddits to filter by keywords
keyword_subreddits = ["AskWomen", "AskFeminists", "Feminism"]
# Subreddits to include all comments from
include_all_subreddits = ["xxstem", "LadiesofScience", "womenEngineers"]

# Define keywords for case-insensitive search
keywords = ["STEM", "Science", "Technology", "Engineering", "Mathematics", "Process", "Design", "Model", "Plan", "Project"]
keywords_lower = [kw.lower() for kw in keywords]
# One alternation pattern that matches any keyword as a substring.
keyword_pattern = '|'.join(keywords_lower)

# Filter the DataFrame.
# The keywords are lower-cased, so the body must be lower-cased too before matching;
# matching the raw body would miss "STEM", "Science", etc. and make the search case-sensitive.
comments = comments.filter(
    (col("subreddit").isin(keyword_subreddits) & lower(col("body")).rlike(keyword_pattern)) |
    (col("subreddit").isin(include_all_subreddits))
)

# Show the filtered data
comments.select("subreddit", "author", "body", "parent_id", "link_id", "id", "created_utc", "score").show()

In [None]:
# `comments` now refers to the keyword-filtered DataFrame; mark that one for caching.
comments.cache()

In [None]:
from pyspark.sql.functions import col

# Subreddits whose submissions are kept only when they mention a keyword
keyword_subreddits = ["AskWomen", "AskFeminists", "Feminism"]
# Subreddits whose submissions are always kept
include_all_subreddits = ["xxstem", "LadiesofScience", "womenEngineers"]

# Keywords for the case-insensitive search
keywords = ["STEM", "science", "technology", "engineering", "mathematics", "process", "design", "model", "plan", "project"]
# Alternation of the keywords, each prefixed with the case-insensitive flag
pattern = '|'.join(f"(?i){kw}" for kw in keywords)

# Name the individual conditions, then combine them:
# (keyword subreddit AND keyword in title or selftext) OR always-kept subreddit.
in_keyword_subreddit = col("subreddit").isin(keyword_subreddits)
mentions_keyword = col("title").rlike(pattern) | col("selftext").rlike(pattern)
in_always_kept_subreddit = col("subreddit").isin(include_all_subreddits)

submissions = submissions.filter((in_keyword_subreddit & mentions_keyword) | in_always_kept_subreddit)

# Preview the filtered rows
submissions.select("subreddit", "author", "title", "selftext", "created_utc", "num_comments", "score").show()


In [None]:
# `submissions` now refers to the keyword-filtered DataFrame; mark that one for caching.
submissions.cache()

# Data Cleaning

In [None]:
# Keep only rows where both 'author' and 'body' are present
has_author_and_body = col("author").isNotNull() & col("body").isNotNull()
comments = comments.where(has_author_and_body)

# Keep only rows whose 'created_utc' falls within the last 3 years (365-day years),
# measured from the moment the query runs
seconds_in_three_years = 3 * 365 * 24 * 60 * 60
three_years_ago = unix_timestamp(current_timestamp()) - seconds_in_three_years
comments = comments.where(unix_timestamp(col("created_utc")) > three_years_ago)

# Preview the cleaned rows
comments.select("subreddit", "author", "body", "parent_id", "link_id", "id", "created_utc", "score").show()

In [None]:
# `comments` was reassigned by the cleaning step; mark the cleaned DataFrame for caching.
comments.cache()

In [None]:
# Keep only rows where both 'author' and 'title' are present
has_author_and_title = col("author").isNotNull() & col("title").isNotNull()
submissions = submissions.where(has_author_and_title)

# Keep only rows whose 'created_utc' falls within the last 3 years (365-day years),
# measured from the moment the query runs
seconds_in_three_years = 3 * 365 * 24 * 60 * 60
three_years_ago = unix_timestamp(current_timestamp()) - seconds_in_three_years
submissions = submissions.where(unix_timestamp(col("created_utc")) > three_years_ago)

# Preview the cleaned rows
submissions.select("subreddit", "author", "title", "selftext", "created_utc", "num_comments",  "score").show()

In [None]:
# `submissions` was reassigned by the cleaning step; mark the cleaned DataFrame for caching.
submissions.cache()

## Data Pre-processing

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql.functions import udf, col, explode, array_union
from pyspark.sql.types import StringType
import re

# Define a UDF for cleaning text
def clean_text(text):
    """Normalise a raw text field for tokenisation.

    Lower-cases the text, removes URLs (anything starting with ``http`` up to the
    next whitespace), drops every character that is not an ASCII letter or
    whitespace, and trims leading/trailing whitespace.

    Args:
        text: The raw string, or ``None`` for a missing value.

    Returns:
        The cleaned string; an empty string when ``text`` is ``None`` so that a
        missing field does not raise inside the UDF.
    """
    # Missing values reach the UDF as None; the notebook only null-filters
    # 'author' plus 'body'/'title', so e.g. 'selftext' may still be null here.
    if text is None:
        return ""
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()

# Wrap the cleaner as a Spark UDF that returns a string column
clean_text_udf = udf(clean_text, StringType())

# Clean the comment bodies, then split them into word tokens
comments = comments.withColumn("cleaned_body", clean_text_udf("body"))
tokenizer = Tokenizer(inputCol="cleaned_body", outputCol="words")
comments = tokenizer.transform(comments)

# Extra stop words appended to the remover's default list
additional_stop_words = ["like", "dont", "im", "one", "removed", "get", "also", "even", "really", "sa", "despite", "certainly"]

# Drop stop words (defaults + extras) from the token lists
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
extended_stop_words = remover.getStopWords() + additional_stop_words
remover.setStopWords(extended_stop_words)
comments = remover.transform(comments)

In [None]:
# Clean both text fields of each submission with the shared cleaning UDF.
submissions = submissions.withColumn("cleaned_title", clean_text_udf("title"))
submissions = submissions.withColumn("cleaned_selftext", clean_text_udf("selftext"))

# Split the cleaned title and selftext into word tokens.
tokenizer = Tokenizer(inputCol="cleaned_title", outputCol="title_words")
submissions = tokenizer.transform(submissions)
tokenizer = Tokenizer(inputCol="cleaned_selftext", outputCol="selftext_words")
submissions = tokenizer.transform(submissions)

# Remove stop words (remover defaults + the notebook's additional list) from both token columns.
remover = StopWordsRemover(inputCol="title_words", outputCol="filtered_title_words")
remover.setStopWords(remover.getStopWords() + additional_stop_words)
submissions = remover.transform(submissions)

remover = StopWordsRemover(inputCol="selftext_words", outputCol="filtered_selftext_words")
remover.setStopWords(remover.getStopWords() + additional_stop_words)
submissions = remover.transform(submissions)

# Combine title and selftext words for submissions.
# concat keeps repeated words; array_union would de-duplicate the tokens, so every
# term count fed to the CountVectorizer/IDF stages would be at most 1.
submissions = submissions.withColumn("combined_words", F.concat("filtered_title_words", "filtered_selftext_words"))

# Can we predict the score of a submission based on the text of the submission?

## Train and Test Split

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.linalg import VectorUDT
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 70/30 train/test partitions, seeded so the split is repeatable
split_weights = [0.7, 0.3]
split_seed = 123

# Comments partitions
train_comments, test_comments = comments.randomSplit(split_weights, seed=split_seed)

# Submissions partitions
train_submissions, test_submissions = submissions.randomSplit(split_weights, seed=split_seed)

In [None]:
# Linear Regression

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# TF-IDF feature stages shared by the regression pipelines: term counts, then IDF weighting
cv_submissions = CountVectorizer().setInputCol("combined_words").setOutputCol("features")
idf_submissions = IDF().setInputCol("features").setOutputCol("final_features")

# Regress the submission score on the TF-IDF features
lr = LinearRegression().setFeaturesCol("final_features").setLabelCol("score")
pipeline_lr = Pipeline().setStages([cv_submissions, idf_submissions, lr])
model_lr = pipeline_lr.fit(train_submissions)
predictions_lr = model_lr.transform(test_submissions)

# RMSE evaluator for the held-out predictions
evaluator = (
    RegressionEvaluator()
    .setLabelCol("score")
    .setPredictionCol("prediction")
    .setMetricName("rmse")
)
rmse_lr = evaluator.evaluate(predictions_lr)


In [None]:
# Random Forest
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Same TF-IDF stages as the linear model, with a random forest as the final stage
rf = RandomForestRegressor().setFeaturesCol("final_features").setLabelCol("score")
pipeline_rf = Pipeline().setStages([cv_submissions, idf_submissions, rf])
model_rf = pipeline_rf.fit(train_submissions)
predictions_rf = model_rf.transform(test_submissions)

# Scored with the `evaluator` already defined in the notebook
rmse_rf = evaluator.evaluate(predictions_rf)


In [None]:
# Gradient Boosted Trees (GBTRegressor)
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Same TF-IDF stages, with a 10-iteration gradient-boosted tree ensemble as the final stage
gbt = GBTRegressor().setFeaturesCol("final_features").setLabelCol("score").setMaxIter(10)
pipeline_gbt = Pipeline().setStages([cv_submissions, idf_submissions, gbt])
model_gbt = pipeline_gbt.fit(train_submissions)
predictions_gbt = model_gbt.transform(test_submissions)

# Re-create the RMSE evaluator and score the held-out predictions
evaluator = (
    RegressionEvaluator()
    .setLabelCol("score")
    .setPredictionCol("prediction")
    .setMetricName("rmse")
)
rmse_gbt = evaluator.evaluate(predictions_gbt)


In [None]:
# Decision Tree Regressor

from pyspark.ml.regression import DecisionTreeRegressor

# Same TF-IDF stages, with a single decision tree as the final stage
dt = DecisionTreeRegressor().setFeaturesCol("final_features").setLabelCol("score")
pipeline_dt = Pipeline().setStages([cv_submissions, idf_submissions, dt])
model_dt = pipeline_dt.fit(train_submissions)
predictions_dt = model_dt.transform(test_submissions)

# Scored with the `evaluator` already defined in the notebook
rmse_dt = evaluator.evaluate(predictions_dt)


In [None]:
# RMSE Scores

# (message template, value) pairs, printed in order
rmse_report = [
    ("Root Mean Squared Error (RMSE) on test data for Linear Regression = %g", rmse_lr),
    ("Root Mean Squared Error (RMSE) on test data for Random Forest Regressor = %g", rmse_rf),
    ("Root Mean Squared Error (RMSE) on test data for GBTRegressor = %g", rmse_gbt),
    ("Root Mean Squared Error (RMSE) on test data for DecisionTreeRegressor = %g", rmse_dt),
]
for message_template, rmse_value in rmse_report:
    print(message_template % rmse_value)


In [None]:
# R-square Scores

# Param override that makes the existing evaluator compute R2 for a single call
r2_override = {evaluator.metricName: "r2"}

r2_lr = evaluator.evaluate(predictions_lr, r2_override)
r2_rf = evaluator.evaluate(predictions_rf, r2_override)
r2_gbt = evaluator.evaluate(predictions_gbt, r2_override)
r2_dt = evaluator.evaluate(predictions_dt, r2_override)

print("R2 on test data for Linear Regression = %g" % r2_lr)
print("R2 on test data for Random Forest Regressor = %g" % r2_rf)
print("R2 on test data for GBTRegressor = %g" % r2_gbt)
print("R2 on test data for Decision Tree Regressor = %g" % r2_dt)


# Model Comparison

In [None]:
from tabulate import tabulate

# One row per regression model: name, RMSE, R-squared
headers = ["Model Name", "RMSE", "R-squared"]
model_names = ["Linear Regression", "Random Forest Regressor", "GBTRegressor", "DecisionTreeRegressor"]
rmse_column = [rmse_lr, rmse_rf, rmse_gbt, rmse_dt]
r2_column = [r2_lr, r2_rf, r2_gbt, r2_dt]
data = [list(row) for row in zip(model_names, rmse_column, r2_column)]

# Render as a grid-style text table
table = tabulate(data, headers, tablefmt="grid")
print(table)

In [None]:
import plotly.graph_objects as go

models = ["Linear Regression", "Random Forest Regressor", "GBTRegressor", "DecisionTreeRegressor"]
rmse_values = [rmse_lr, rmse_rf, rmse_gbt, rmse_dt]
r2_values = [r2_lr, r2_rf, r2_gbt, r2_dt]

# One bar trace per metric; RMSE is shown initially, R-squared starts hidden
trace_rmse = go.Bar(x=models, y=rmse_values, name="RMSE", visible=True)
trace_r2 = go.Bar(x=models, y=r2_values, name="R-squared", visible=False)

fig = go.Figure(data=[trace_rmse, trace_r2])

# Dropdown entries: each one restyles trace visibility so exactly one metric is shown
metric_buttons = [
    dict(args=["visible", visibility_mask], label=metric_label, method="restyle")
    for metric_label, visibility_mask in (
        ("RMSE", [True, False]),
        ("R-squared", [False, True]),
    )
]

fig.update_layout(
    updatemenus=[dict(buttons=metric_buttons, direction="down", showactive=True)],
    title="Model Performance Comparison",
)

fig.show()


# Can we predict which subreddit a submission came from based on the text of the submission?

In [None]:
from pyspark.sql.functions import col

# Subreddits used as classes; the position in this list is the class label used later.
subreddits_list = ["AskWomen", "Feminism", "LadiesofScience", "womenEngineers", "AskFeminists"]
filtered_submissions = submissions.filter(col("subreddit").isin(subreddits_list))

# Aim for roughly this many submissions per subreddit.
target_per_subreddit = 350

# Count every subreddit in a single aggregation instead of one count() job per subreddit.
subreddit_counts = {
    row["subreddit"]: row["count"]
    for row in filtered_submissions.groupBy("subreddit").count().collect()
}

# Sampling fractions must lie in [0, 1]: cap at 1.0 for subreddits with fewer than the
# target number of rows. Subreddits with no rows do not appear in the counts, which
# avoids a division by zero.
sampling_fractions = {
    subreddit: min(1.0, target_per_subreddit / n_rows)
    for subreddit, n_rows in subreddit_counts.items()
}

sampled_submissions = filtered_submissions.sampleBy("subreddit", fractions=sampling_fractions, seed=123)


In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType


def label_subreddit(subreddit):
    """Return the integer class label of a subreddit: its position in `subreddits_list`."""
    position = subreddits_list.index(subreddit)
    return position


# Spark UDF wrapper producing an integer column
label_subreddit_udf = udf(label_subreddit, IntegerType())

# Attach the class label derived from the 'subreddit' column
subreddit_label = label_subreddit_udf(col("subreddit"))
sampled_submissions = sampled_submissions.withColumn("label", subreddit_label)


In [None]:
from pyspark.sql.functions import col, concat_ws

# Title and selftext as strings, joined with a single space into one 'text' column
title_as_text = col("title").cast("string")
selftext_as_text = col("selftext").cast("string")
combined_text = sampled_submissions.withColumn("text", concat_ws(" ", title_as_text, selftext_as_text))

In [None]:
from pyspark.ml.feature import RegexTokenizer

# Split 'text' on non-word characters into 'text_tokens'
tokenizer = (
    RegexTokenizer()
    .setInputCol("text")
    .setOutputCol("text_tokens")
    .setPattern("\\W")
)
tokenized_data = tokenizer.transform(combined_text)


In [None]:
# 80/20 train/test split of the tokenized sample, seeded for repeatability.
train_data, test_data = tokenized_data.randomSplit([0.8, 0.2], seed=123)


In [None]:
from pyspark.ml.feature import CountVectorizer

# Learn the vocabulary on the training split only, then apply it to both splits
cv = CountVectorizer().setInputCol("text_tokens").setOutputCol("features")
cv_model = cv.fit(train_data)
train_data, test_data = cv_model.transform(train_data), cv_model.transform(test_data)


In [None]:
# Logistic Regression

from pyspark.ml.classification import LogisticRegression

# Multinomial logistic regression on the term-count features
lr = (
    LogisticRegression()
    .setFeaturesCol("features")
    .setLabelCol("label")
    .setFamily("multinomial")
)
lr_model = lr.fit(train_data)
predictions_lr = lr_model.transform(test_data)


In [None]:
# Random Forest

from pyspark.ml.classification import RandomForestClassifier

# Random forest classifier on the term-count features
rf = RandomForestClassifier().setFeaturesCol("features").setLabelCol("label")
rf_model = rf.fit(train_data)
predictions_rf = rf_model.transform(test_data)



In [None]:
# Naive Bayes

from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier on the term-count features
nb = NaiveBayes().setFeaturesCol("features").setLabelCol("label")
nb_model = nb.fit(train_data)
predictions_nb = nb_model.transform(test_data)


In [None]:
# Decision Tree Classifier

from pyspark.ml.classification import DecisionTreeClassifier

# Single decision tree classifier on the term-count features
dt = DecisionTreeClassifier().setFeaturesCol("features").setLabelCol("label")
dt_model = dt.fit(train_data)
predictions_dt = dt_model.transform(test_data)


# Model Evaluation

In [None]:
# Confusion Matrix

# Bring the test-set predictions of every classifier to the driver as pandas frames.
# 'probability' is included alongside 'prediction' and 'label' because the ROC cells
# read predictions_pd_lr['probability']; selecting it here keeps labels and
# probabilities row-aligned within one frame.
prediction_columns = ["prediction", "label", "probability"]

predictions_pd_rf = predictions_rf.select(*prediction_columns).toPandas()
predictions_pd_dt = predictions_dt.select(*prediction_columns).toPandas()
predictions_pd_lr = predictions_lr.select(*prediction_columns).toPandas()
predictions_pd_nb = predictions_nb.select(*prediction_columns).toPandas()


In [None]:
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so every matrix is len(subreddits_list) x len(subreddits_list),
# even when a class is absent from both the true and the predicted labels of the test split;
# otherwise the matrix shrinks and no longer lines up with the per-subreddit axis labels.
class_labels = list(range(len(subreddits_list)))

conf_matrix_rf = confusion_matrix(predictions_pd_rf['label'], predictions_pd_rf['prediction'], labels=class_labels)
conf_matrix_dt = confusion_matrix(predictions_pd_dt['label'], predictions_pd_dt['prediction'], labels=class_labels)
conf_matrix_lr = confusion_matrix(predictions_pd_lr['label'], predictions_pd_lr['prediction'], labels=class_labels)
conf_matrix_nb = confusion_matrix(predictions_pd_nb['label'], predictions_pd_nb['prediction'], labels=class_labels)


In [None]:
import plotly.graph_objects as go

fig = go.Figure()

# Axis tick labels: subreddit name followed by its class id.
confusion_axis_labels = ['AskWomen 0', 'Feminism 1', 'LadiesofScience 2', 'womenEngineers 3', 'AskFeminists 4']

# Cell-count annotations per model. Annotations belong to the layout, not to a trace,
# so they have to be swapped together with the visible trace when the dropdown changes.
annotations_by_model = {}


def add_confusion_matrix_trace(cm, model_name, visible=False):
    """Add one model's confusion matrix to the shared figure `fig`.

    Adds a heatmap trace for `cm` and records the matching cell-count annotations
    in `annotations_by_model[model_name]`, so the dropdown buttons can show them
    together with the trace.

    Args:
        cm: 2-D confusion matrix (rows = true label, columns = predicted label).
        model_name: Display name of the model; used as the trace name and
            as the key for its annotations.
        visible: Whether the heatmap trace is visible initially.
    """
    fig.add_trace(go.Heatmap(
        z=cm,
        x=confusion_axis_labels,
        y=confusion_axis_labels,
        colorscale='Viridis',
        showscale=False,
        visible=visible,
        name=model_name,
    ))

    # One text annotation per matrix cell, positioned by column/row index.
    annotations_by_model[model_name] = [
        dict(
            font=dict(color="white"),
            x=j,
            y=i,
            showarrow=False,
            text=str(value),
            xref="x",
            yref="y",
        )
        for i, row in enumerate(cm)
        for j, value in enumerate(row)
    ]


# Trace order defines the visibility masks used by the dropdown buttons below.
confusion_matrices = {
    "Random Forest": conf_matrix_rf,
    "Decision Tree": conf_matrix_dt,
    "Logistic Regression": conf_matrix_lr,
    "Naive Bayes": conf_matrix_nb,
}
confusion_model_names = list(confusion_matrices)

for position, model_name in enumerate(confusion_model_names):
    add_confusion_matrix_trace(confusion_matrices[model_name], model_name, visible=(position == 0))

# Each button shows exactly one heatmap and replaces the title and the layout
# annotations with those of the selected model. Without the annotation swap the
# first model's cell counts would stay on screen for every model.
buttons = [
    dict(label=selected_model,
         method="update",
         args=[{"visible": [name == selected_model for name in confusion_model_names]},
               {"title": f"Confusion Matrix for {selected_model}",
                "annotations": annotations_by_model[selected_model]}])
    for selected_model in confusion_model_names
]

fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=buttons,
            direction="down",
            pad={"r": 5, "t": 5},
            showactive=True,
            x=1.2,
            xanchor="left",
            y=1.15,
            yanchor="top"
        ),
    ],
    title_text="Confusion Matrix for Random Forest",
    # Start with the annotations of the initially visible model.
    annotations=annotations_by_model[confusion_model_names[0]],
    xaxis=dict(title='Predicted label'),
    yaxis=dict(title='True label')
)

fig.show()

# Save the interactive figure as a standalone HTML file.
file_path = 'CF.html'  
fig.write_html(file_path)


In [None]:
from sklearn.preprocessing import label_binarize

# One-vs-rest indicator matrix of the true labels (one column per class id)
class_ids = [0, 1, 2, 3, 4]
n_classes = len(class_ids)
y_test = label_binarize(predictions_pd_lr['label'], classes=class_ids)

In [None]:
from sklearn.metrics import roc_curve, auc
from itertools import cycle

# Per-class ROC curve and AUC for logistic regression, keyed by class id.
# Reads the 'probability' column of predictions_pd_lr (one probability vector per row).
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    class_scores = predictions_pd_lr['probability'].apply(lambda x: x[i])
    class_truth = y_test[:, i]
    fpr[i], tpr[i], _ = roc_curve(class_truth, class_scores)
    roc_auc[i] = auc(fpr[i], tpr[i])


In [None]:
# Class id -> subreddit name
class_to_subreddit = dict(enumerate(
    ["AskWomen", "Feminism", "LadiesofScience", "womenEngineers", "AskFeminists"]
))

import plotly.graph_objects as go
from itertools import cycle

fig = go.Figure()
colors = cycle(['blue', 'red', 'green', 'yellow', 'purple'])

# One ROC line per class, labelled with the subreddit name and its AUC
for i, color in zip(range(n_classes), colors):
    subreddit_name = class_to_subreddit[i]
    roc_line = go.Scatter(
        x=fpr[i],
        y=tpr[i],
        mode='lines',
        name=f'{subreddit_name} AUC = {roc_auc[i]:.2f}',
        line=dict(color=color),
    )
    fig.add_trace(roc_line)

# Square axes (1:1 scale) so the curve is not distorted
fig.update_layout(
    title='ROC Curve by Subreddit',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=800,
    height=600,
)

fig.show()

# Save the interactive figure as a standalone HTML file
file_path = 'ROC_Curve_By_Subreddit.html'  
fig.write_html(file_path)


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Model-level metrics over all classes. "precisionByLabel"/"recallByLabel" report a single
# class only (the evaluator's metricLabel, 0.0 by default), so the weighted variants are used
# to describe the whole model; "f1" is already the weighted F-measure.
precision_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Model-level metrics over all classes. "precisionByLabel"/"recallByLabel" report a single
# class only (the evaluator's metricLabel, 0.0 by default), which would present class-0
# precision/recall as the model's; the weighted variants average over every class.
# "f1" is already the weighted F-measure.
precision_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

def calculate_metrics(predictions):
    """Evaluate a predictions DataFrame with the four module-level evaluators.

    Args:
        predictions: DataFrame with 'label' and 'prediction' columns.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``; precision, recall and f1 are
        the class-weighted metrics.
    """
    precision = precision_evaluator.evaluate(predictions)
    recall = recall_evaluator.evaluate(predictions)
    f1 = f1_evaluator.evaluate(predictions)
    accuracy = accuracy_evaluator.evaluate(predictions)
    return precision, recall, f1, accuracy

precision_lr, recall_lr, f1_lr, accuracy_lr = calculate_metrics(predictions_lr)
precision_rf, recall_rf, f1_rf, accuracy_rf = calculate_metrics(predictions_rf)
precision_nb, recall_nb, f1_nb, accuracy_nb = calculate_metrics(predictions_nb)
precision_dt, recall_dt, f1_dt, accuracy_dt = calculate_metrics(predictions_dt)

print("Logistic Regression - Precision:", precision_lr, "Recall:", recall_lr, "F1 Score:", f1_lr, "Accuracy:", accuracy_lr)
print("Random Forest - Precision:", precision_rf, "Recall:", recall_rf, "F1 Score:", f1_rf, "Accuracy:", accuracy_rf)
print("Naive Bayes - Precision:", precision_nb, "Recall:", recall_nb, "F1 Score:", f1_nb, "Accuracy:", accuracy_nb)
print("Decision Tree - Precision:", precision_dt, "Recall:", recall_dt, "F1 Score:", f1_dt, "Accuracy:", accuracy_dt)


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import functions as F
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import numpy as np

def calculate_auc_per_class(predictions, num_classes, probability_col, label_col):
    """
    Compute a one-vs-rest AUROC for every class of a multiclass classifier.

    For each class index, each row is reduced to (probability of that class,
    1.0 if the row's label equals the class else 0.0) and the area under the ROC
    curve of that binary problem is computed with BinaryClassificationMetrics.

    Returns a list of AUROC values ordered by class index.
    """
    def _one_vs_rest_auroc(class_index):
        # Turn each row into a (score, binary label) pair for this class
        def to_score_and_label(row):
            score = float(row[probability_col][class_index])
            binary_label = 1.0 if row[label_col] == class_index else 0.0
            return (score, binary_label)

        score_label_rdd = predictions.select(probability_col, label_col).rdd.map(to_score_and_label)
        return BinaryClassificationMetrics(score_label_rdd).areaUnderROC

    return [_one_vs_rest_auroc(class_index) for class_index in range(num_classes)]

# Per-class AUROC for the logistic regression predictions
num_classes = len(subreddits_list)
auroc_per_class_lr = calculate_auc_per_class(
    predictions=predictions_lr,
    num_classes=num_classes,
    probability_col="probability",
    label_col="label",
)

# Unweighted mean over the classes, displayed together with the per-class values
average_auroc_lr = np.mean(auroc_per_class_lr)
(average_auroc_lr, auroc_per_class_lr)



In [None]:
# Per-class AUROC and its unweighted mean for the random forest predictions.
auroc_per_class_rf = calculate_auc_per_class(predictions_rf, num_classes, "probability", "label")
average_auroc_rf = np.mean(auroc_per_class_rf)


In [None]:
# Same for the naive Bayes predictions.
auroc_per_class_nb = calculate_auc_per_class(predictions_nb, num_classes, "probability", "label")
average_auroc_nb = np.mean(auroc_per_class_nb)


In [None]:
# Same for the decision tree predictions.
auroc_per_class_dt = calculate_auc_per_class(predictions_dt, num_classes, "probability", "label")
average_auroc_dt = np.mean(auroc_per_class_dt)


In [None]:
# Display (mean AUROC, per-class AUROC) for the random forest.
average_auroc_rf, auroc_per_class_rf


In [None]:
# Display (mean AUROC, per-class AUROC) for naive Bayes.
average_auroc_nb, auroc_per_class_nb


In [None]:
# Display (mean AUROC, per-class AUROC) for the decision tree.
average_auroc_dt, auroc_per_class_dt


In [None]:
def calculate_class_roc(predictions, num_classes, probability_col, label_col, num_thresholds=100):
    """Compute a one-vs-rest ROC curve and AUC for every class on a shared threshold grid.

    Args:
        predictions: DataFrame-like object exposing
            ``select(...).rdd.map(...).collect()``; each row holds a probability
            vector in `probability_col` and the true class in `label_col`.
        num_classes: Number of classes; class indices are ``0 .. num_classes - 1``.
        probability_col: Name of the column with the per-class probabilities.
        label_col: Name of the column with the true class index.
        num_thresholds: Number of evenly spaced thresholds in [0, 1].

    Returns:
        Dict mapping class index to ``(fpr, tpr, roc_auc)``. `fpr` and `tpr` are
        tuples of floats ordered by increasing threshold (so FPR runs from high to
        low); a rate is 0 when its denominator is 0. `roc_auc` is the trapezoidal
        area under the curve.
    """
    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever is available.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    # Collect the (probability vector, label) pairs once and reuse them for every class.
    rows = (
        predictions.select(probability_col, label_col)
        .rdd.map(lambda row: (row[probability_col], row[label_col]))
        .collect()
    )

    # Use a common set of thresholds
    thresholds = np.linspace(0, 1, num_thresholds)
    roc_auc_dict = {}

    for class_index in range(num_classes):
        scores = np.array([float(probability[class_index]) for probability, _ in rows], dtype=float)
        is_positive = np.array([label == class_index for _, label in rows], dtype=bool)
        n_positive = int(is_positive.sum())
        n_negative = int(is_positive.size) - n_positive

        # Row t of this matrix marks the samples predicted positive at thresholds[t].
        predicted_positive = scores[np.newaxis, :] >= thresholds[:, np.newaxis]
        true_positives = (predicted_positive & is_positive).sum(axis=1)
        false_positives = (predicted_positive & ~is_positive).sum(axis=1)

        tpr_values = true_positives / n_positive if n_positive else np.zeros(num_thresholds)
        fpr_values = false_positives / n_negative if n_negative else np.zeros(num_thresholds)

        # FPR decreases as the threshold increases. Integrate over the reversed arrays so
        # the x-axis is increasing; integrating in threshold order yields a negative area.
        roc_auc = float(integrate(tpr_values[::-1], fpr_values[::-1]))

        roc_auc_dict[class_index] = (
            tuple(float(value) for value in fpr_values),
            tuple(float(value) for value in tpr_values),
            roc_auc,
        )

    return roc_auc_dict


In [None]:
# Logistic Regression
# Per-class ROC data: dict of class index -> (fpr, tpr, auc), using the default 100 thresholds.
roc_auc_lr = calculate_class_roc(predictions_lr, num_classes, "probability", "label")

# Repeat for other models
roc_auc_rf = calculate_class_roc(predictions_rf, num_classes, "probability", "label")
roc_auc_nb = calculate_class_roc(predictions_nb, num_classes, "probability", "label")
roc_auc_dt = calculate_class_roc(predictions_dt, num_classes, "probability", "label")


In [None]:
def aggregate_roc_data(roc_auc_dict, num_thresholds=100):
    """Average per-class ROC curves into a single curve on a common FPR grid.

    Args:
        roc_auc_dict: Mapping of class index to ``(fpr, tpr, auc)`` where `fpr`
            and `tpr` are equally long sequences; the per-class auc is ignored.
        num_thresholds: Number of evenly spaced FPR grid points in [0, 1].

    Returns:
        Tuple ``(aggregated_fpr, aggregated_tpr, aggregated_auc)``: the FPR grid
        (NumPy array), the list of class-averaged TPR values on that grid, and the
        trapezoidal area under the averaged curve.
    """
    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever is available.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    aggregated_fpr = np.linspace(0, 1, num_thresholds)

    # Convert each class's FPR sequence to an array once instead of once per grid point.
    class_curves = [
        (np.asarray(fpr, dtype=float), tpr)
        for fpr, tpr, _ in roc_auc_dict.values()
    ]

    aggregated_tpr = []
    for target_fpr in aggregated_fpr:
        # For every class take the TPR at the point whose FPR is closest to the grid value
        # (first such point on ties), then average across classes.
        tpr_at_target = [
            tpr[int(np.argmin(np.abs(fpr - target_fpr)))]
            for fpr, tpr in class_curves
        ]
        aggregated_tpr.append(np.mean(tpr_at_target))

    # Area under the averaged curve
    aggregated_auc = integrate(aggregated_tpr, aggregated_fpr)

    return aggregated_fpr, aggregated_tpr, aggregated_auc


In [None]:
# Aggregate ROC data for each model
# Each result is a tuple (fpr grid, class-averaged tpr, auc of the averaged curve).
aggregated_roc_lr = aggregate_roc_data(roc_auc_lr)
aggregated_roc_rf = aggregate_roc_data(roc_auc_rf)
aggregated_roc_nb = aggregate_roc_data(roc_auc_nb)
aggregated_roc_dt = aggregate_roc_data(roc_auc_dt)


In [None]:
import plotly.graph_objs as go

# One line per model: x = FPR grid, y = class-averaged TPR, legend shows the aggregated AUC
trace_lr, trace_rf, trace_nb, trace_dt = [
    go.Scatter(
        x=aggregated[0],
        y=aggregated[1],
        mode='lines',
        name=f'{model_label} (AUC = {aggregated[2]:.2f})',
    )
    for model_label, aggregated in (
        ("Logistic Regression", aggregated_roc_lr),
        ("Random Forest", aggregated_roc_rf),
        ("Naive Bayes", aggregated_roc_nb),
        ("Decision Tree", aggregated_roc_dt),
    )
]

# Figure layout: titles, legend position and margins
layout = go.Layout(
    title='Aggregated Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    legend=dict(x=1.1, y=0.9),
    margin=dict(l=40, r=40, b=40, t=40),
)

# Assemble and display the figure
fig = go.Figure(data=[trace_lr, trace_rf, trace_nb, trace_dt], layout=layout)
fig.show()

# Save the interactive figure as a standalone HTML file
file_path = 'ROC.html'  
fig.write_html(file_path)

# Q3: FIND COMMON THEMES OR SIMILARITIES IN COMMENTS ACROSS DIFFERENT SUBREDDITS

In [None]:
pip install scikit-learn nltk


In [None]:
# Restrict the comments to the "xxstem" subreddit; everything below in this section clusters only these rows.
# NOTE(review): the section heading talks about comments across different subreddits,
# while this keeps a single one - confirm this is intended.
comments = comments.filter(col("subreddit") == "xxstem")
# Mark the reduced DataFrame for caching.
comments.cache()

## K-Means Clustering

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import ClusteringEvaluator

# TF-IDF features from the stop-word-filtered tokens
cv = CountVectorizer().setInputCol("filtered_words").setOutputCol("raw_features")
idf = IDF().setInputCol("raw_features").setOutputCol("features")

# K-Means with a fixed number of clusters and a fixed seed
num_clusters = 3
kmeans = KMeans().setFeaturesCol("features").setK(num_clusters).setSeed(123)

# Fit the whole pipeline on the comments, then assign each comment to a cluster
pipeline = Pipeline().setStages([cv, idf, kmeans])
model = pipeline.fit(comments)
predictions = model.transform(comments)

# Silhouette score of the resulting clustering
evaluator = (
    ClusteringEvaluator()
    .setFeaturesCol("features")
    .setPredictionCol("prediction")
    .setMetricName("silhouette")
)
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# Preview tokens next to their assigned cluster
predictions.select("filtered_words", "prediction").show()

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Function to convert Spark DataFrame vector column to a NumPy array
def vector_to_array(v):
    """Convert a Spark ML vector into a plain Python list of floats (dense)."""
    return np.array(v.toArray()).tolist()

# Collect the feature vector and the cluster id of every row in ONE action. Two separate
# collects are independent Spark jobs with no guarantee of returning rows in the same
# order, which could pair points with the wrong cluster.
collected_rows = (
    predictions.select('features', 'prediction')
    .rdd.map(lambda row: (vector_to_array(row.features), row.prediction))
    .collect()
)
features_array = np.array([features for features, _ in collected_rows])
cluster_ids = np.array([cluster for _, cluster in collected_rows])

# Apply PCA: project the TF-IDF vectors onto their first two principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(features_array)

# Creating a DataFrame for Plotly
plotly_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2'])
plotly_df['Cluster'] = cluster_ids

# Color mapping for clusters
colors = px.colors.qualitative.Plotly

# Create a scatter plot
fig = go.Figure()

# One trace per cluster. Colours wrap around the palette so that no cluster is
# dropped when num_clusters exceeds the palette length.
for i in range(num_clusters):
    cluster_df = plotly_df[plotly_df['Cluster'] == i]
    fig.add_trace(go.Scatter(x=cluster_df['PCA1'], y=cluster_df['PCA2'], 
                             mode='markers', 
                             name=f'Cluster {i}',
                             marker_color=colors[i % len(colors)]))

fig.update_layout(title='PCA of K-Means Clustering',
                  xaxis_title='PCA Component 1',
                  yaxis_title='PCA Component 2',
                  legend_title='Cluster')

# Show the plot
fig.show()
fig.write_html("pca.html")

## Hierarchical Clustering

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.linalg import Vectors


def vector_to_array(v):
    """Return the values of a Spark ML vector as a plain list of floats."""
    return v.toArray().tolist()


# Spark UDF that turns the vector column into an array<double> column
vector_to_array_udf = udf(vector_to_array, ArrayType(DoubleType()))

# Add the array version of the features, then bring it to the driver as a list of lists
features_df = predictions.withColumn("features_array", vector_to_array_udf("features"))
features_list = [row[0] for row in features_df.select("features_array").collect()]


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
import plotly.figure_factory as ff

# Convert to numpy array
features_np = np.array(features_list)

# Hierarchical Clustering: Ward linkage on the raw feature vectors
linked = linkage(features_np, 'ward')

# Creating the dendrogram.
# create_dendrogram expects the observation matrix and computes the linkage itself via
# `linkagefun`; passing the linkage matrix as data would draw a dendrogram of the linkage
# rows instead of the comments. The precomputed Ward linkage is handed back through
# `linkagefun` so it is used as-is and not recomputed.
fig = ff.create_dendrogram(features_np, orientation='left', linkagefun=lambda _distances: linked)
fig.update_layout(width=800, height=800)
fig.show()

# Ward linkage always uses Euclidean distance, so no metric argument is needed; the
# former `affinity=` keyword was deprecated and later removed from scikit-learn.
agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
labels = agg_clustering.fit_predict(features_np)

# Calculating Silhouette Score
silhouette_avg = silhouette_score(features_np, labels)
print(f'Silhouette Score: {silhouette_avg}')


Disclaimer: The work has been done by us. However, we have taken assistance from ChatGPT for code commenting and cleaning.