# Advanced Machine Learning Models in PySpark

This notebook implements multiple classification models using PySpark for big data processing:
1. Logistic Regression
2. Random Forest
3. Gradient Boosted Decision Trees (GBDT)
4. Multilayer Perceptron (MLP)
5. XGBoost (dependency required)
6. LightGBM (dependency required)

Each model is evaluated with:
- Confusion Matrix
- Classification Report
- ROC Curve and AUC
- Precision-Recall Curve

## 1. Importing Libraries

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from math import ceil

# PySpark imports
from pyspark.sql import SparkSession  
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, when, lit, count, lag, expr

# ML imports for classification
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, OneVsRest
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics

# Scikit-learn imports for metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve

# Initialize Spark session
spark = SparkSession.builder.appName("ML Models Spark").getOrCreate()

## 2. Loading and Preprocessing Data

In [None]:
# DBFS paths of the train / validation / test CSV splits
file_location_train = "dbfs:/FileStore/tables/train_df-2.csv"
file_location_val = "dbfs:/FileStore/tables/val_df.csv"
file_location_test = "dbfs:/FileStore/tables/test_df-2.csv"

# All three splits are read identically: the first row is the header and
# column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}

train_data = spark.read.csv(file_location_train, **csv_options)
val_data = spark.read.csv(file_location_val, **csv_options)
test_data = spark.read.csv(file_location_test, **csv_options)

In [None]:
# Everything except the target ('label') and the 'time' / 'file' columns is a feature
non_feature_cols = {'label', 'time', 'file'}
feature_cols = [name for name in train_data.columns if name not in non_feature_cols]

# One assembler (built from the training columns) packs the feature columns
# of every split into a single "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

train_data = assembler.transform(train_data).select("features", "label")
val_data = assembler.transform(val_data).select("features", "label")
test_data = assembler.transform(test_data).select("features", "label")

# Quick visual check of the assembled train and validation splits
train_data.show(5)
val_data.show(5)

## 3. Helper Functions for Evaluation Metrics and Visualization

In [None]:
# Helper: collect predicted and true labels from a predictions DataFrame
def get_prediction_labels(predictions_df):
    """Return ``(y_pred, y_true)`` arrays for a predictions DataFrame.

    The "prediction" and "label" columns are selected and collected with
    ``toPandas()``; the underlying value arrays of both columns are returned,
    predictions first.
    """
    collected = predictions_df.select("prediction", "label").toPandas()
    return collected["prediction"].values, collected["label"].values

# Helper: collect the per-class probability vectors as a 2-D array
def get_prediction_probabilities(predictions_df):
    """Return the "probability" column as an array with one row per prediction.

    Each collected value is converted with its ``toArray()`` method and the
    results are stacked with ``np.array``.
    """
    probability_column = predictions_df.select("probability").toPandas()["probability"]
    rows = []
    for vector in probability_column:
        rows.append(vector.toArray())
    return np.array(rows)

# Helper: draw a confusion matrix as an annotated heatmap
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Plot the confusion matrix of ``y_true`` against ``y_pred``.

    The matrix comes from ``confusion_matrix(y_true, y_pred)`` and is drawn
    with integer cell annotations on a 10x8 figure titled ``title``.
    """
    matrix = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,   # write the count into every cell
        fmt="d",      # counts are integers
        cmap="Blues",
        cbar=False,
    )
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

# Helper: print the per-class precision / recall / F1 report
def print_classification_report(y_true, y_pred):
    """Print a "Classification Report:" heading followed by the report text.

    The report text is whatever ``classification_report(y_true, y_pred)``
    returns; it is computed before anything is printed.
    """
    report_text = classification_report(y_true, y_pred)
    print("Classification Report:", report_text, sep="\n")

# Helper function to plot ROC curve for multi-class
def plot_roc_curve(y_true, y_pred_proba, n_classes):
    """Plot one-vs-rest ROC curves for every class and return the mean AUC.

    Args:
        y_true: 1-D sequence of true labels. Each label is truncated to an int
            and used as a column index, so labels must lie in
            ``0 .. n_classes - 1``.
        y_pred_proba: 2-D array indexed as ``[sample, class]`` holding the
            predicted score of each class.
        n_classes: number of classes (columns of ``y_pred_proba`` to evaluate).

    Returns:
        The unweighted mean of the per-class AUC values.
    """
    # One-hot encode the labels so each class can be scored one-vs-rest.
    label_idx = np.asarray(y_true).astype(int)
    y_true_onehot = np.zeros((len(label_idx), n_classes))
    y_true_onehot[np.arange(len(label_idx)), label_idx] = 1

    # Compute ROC curve and ROC area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_onehot[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Plot all ROC curves
    plt.figure(figsize=(10, 8))
    colors = ['blue', 'red', 'green', 'orange', 'purple']

    for i in range(n_classes):
        # Cycle through the palette so that classes beyond the fifth are still
        # drawn (zipping against the palette would silently drop them).
        plt.plot(fpr[i], tpr[i], color=colors[i % len(colors)], lw=2,
                 label=f'ROC curve of class {i} (area = {roc_auc[i]:0.2f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-class ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Return the average AUC
    return np.mean(list(roc_auc.values()))

# Helper function to plot precision-recall curve
def plot_precision_recall_curve(y_true, y_pred_proba, n_classes):
    """Plot one-vs-rest precision-recall curves and return the mean average precision.

    Args:
        y_true: 1-D sequence of true labels. Each label is truncated to an int
            and used as a column index, so labels must lie in
            ``0 .. n_classes - 1``.
        y_pred_proba: 2-D array indexed as ``[sample, class]`` holding the
            predicted score of each class.
        n_classes: number of classes (columns of ``y_pred_proba`` to evaluate).

    Returns:
        The unweighted mean of the per-class average precision values.
    """
    # One-hot encode the labels so each class can be scored one-vs-rest.
    label_idx = np.asarray(y_true).astype(int)
    y_true_onehot = np.zeros((len(label_idx), n_classes))
    y_true_onehot[np.arange(len(label_idx)), label_idx] = 1

    # Compute precision-recall pairs for different probability thresholds
    precision, recall, avg_precision = {}, {}, {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_true_onehot[:, i], y_pred_proba[:, i])
        # Average precision weights each precision value by the recall gained
        # at that threshold: AP = sum_n (R_n - R_{n-1}) * P_n. A plain mean of
        # the precision array is not AP. Recall is returned in decreasing
        # order, hence the leading minus sign.
        avg_precision[i] = -np.sum(np.diff(recall[i]) * np.asarray(precision[i])[:-1])

    # Plot precision-recall curve for each class
    plt.figure(figsize=(10, 8))
    colors = ['blue', 'red', 'green', 'orange', 'purple']

    for i in range(n_classes):
        # Cycle through the palette so that classes beyond the fifth are still
        # drawn (zipping against the palette would silently drop them).
        plt.plot(recall[i], precision[i], color=colors[i % len(colors)], lw=2,
                 label=f'Class {i} (avg precision = {avg_precision[i]:0.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Multi-class Precision-Recall Curve')
    plt.legend(loc="best")
    plt.show()

    return np.mean(list(avg_precision.values()))

## 4. Logistic Regression Model with Hyperparameter Tuning

### 4.1. Grid Search with Cross-Validation for Hyperparameter Optimization

In [None]:
# Base estimator whose regularisation settings are tuned below
log_reg = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    predictionCol='prediction',
    maxIter=5,
)

# Candidate settings: 3 regParam values x 2 elasticNetParam values
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(log_reg.regParam, [0.1, 1, 10])
grid_builder = grid_builder.addGrid(log_reg.elasticNetParam, [0.0, 0.5])
param_grid = grid_builder.build()

# Candidates are ranked by F1
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='f1',
)

# 5-fold cross-validation over the whole grid, fitting two candidates at a time
crossval = CrossValidator(
    estimator=log_reg,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,
)

# Run the search on the training data and keep the winning model
cv_model = crossval.fit(train_data)
best_model = cv_model.bestModel

# Collect the tuned hyperparameters that are explicitly set on the best model
tuned_params = ['regParam', 'elasticNetParam']
best_params = {
    name: best_model.getOrDefault(getattr(log_reg, name))
    for name in tuned_params
    if best_model.hasParam(name) and best_model.isSet(getattr(log_reg, name))
}

print("Best Tuned Parameters:")
for name, setting in best_params.items():
    print(f"  {name}: {setting}")

# Fitted weights of the best model
print("Best Coefficient Matrix:")
print(best_model.coefficientMatrix)
print("Best Intercept Vector:")
print(best_model.interceptVector)

# F1 of the best model, measured on the validation split
best_val_predictions = best_model.transform(val_data)
print("Best F1 Score:", evaluator.evaluate(best_val_predictions))

### 4.2. Logistic Regression Model Implementation

In [None]:
# F1 evaluator used for this model
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='f1',
)

# Multinomial logistic regression with fixed regularisation settings
log_reg = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    family='multinomial',
    maxIter=5,
    regParam=5,
    elasticNetParam=0,
)

# Train on the training split
lr_model = log_reg.fit(train_data)

# Score the training and validation splits
train_predictions = lr_model.transform(train_data)
val_predictions = lr_model.transform(val_data)

score_train = evaluator.evaluate(train_predictions)
score_val = evaluator.evaluate(val_predictions)

print("Training F1 Score:", score_train)
print("Validation F1 Score:", score_val)

# Keep validation labels and class probabilities for the plots further down
lr_y_pred, lr_y_true = get_prediction_labels(val_predictions)
lr_y_pred_proba = get_prediction_probabilities(val_predictions)

### 4.3. One vs Rest Logistic Regression

In [None]:
# Logistic regression used as the base classifier of the One-vs-Rest wrapper
log_reg = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    maxIter=5,
    regParam=5,
    elasticNetParam=0.5,
)
ovr = OneVsRest(classifier=log_reg)

# Train on the training split
lr_ovr_model = ovr.fit(train_data)

# Score the training and validation splits with the existing F1 evaluator
train_predictions_ovr = lr_ovr_model.transform(train_data)
val_predictions_ovr = lr_ovr_model.transform(val_data)

score_train_ovr = evaluator.evaluate(train_predictions_ovr)
score_val_ovr = evaluator.evaluate(val_predictions_ovr)

print("OVR Training F1 Score:", score_train_ovr)
print("OVR Validation F1 Score:", score_val_ovr)

# Keep validation labels for later use
lr_ovr_y_pred, lr_ovr_y_true = get_prediction_labels(val_predictions_ovr)

## 5. Random Forest Model

In [None]:
# Random forest: 100 trees of depth <= 10, fixed seed for reproducibility
rf_settings = {
    "labelCol": "label",
    "featuresCol": "features",
    "numTrees": 100,
    "maxDepth": 10,
    "seed": 42,
}
rf = RandomForestClassifier(**rf_settings)

# Train on the training split
rf_model = rf.fit(train_data)

# Score the training and validation splits with the existing F1 evaluator
rf_train_predictions = rf_model.transform(train_data)
rf_val_predictions = rf_model.transform(val_data)

rf_train_f1 = evaluator.evaluate(rf_train_predictions)
rf_val_f1 = evaluator.evaluate(rf_val_predictions)

print("Random Forest - Training F1 Score:", rf_train_f1)
print("Random Forest - Validation F1 Score:", rf_val_f1)

# Keep validation labels and class probabilities for the plots further down
rf_y_pred, rf_y_true = get_prediction_labels(rf_val_predictions)
rf_y_pred_proba = get_prediction_probabilities(rf_val_predictions)

## 6. Gradient Boosted Decision Trees (GBDT) Model

In [None]:
# Initialize GBT Classifier
gbt = GBTClassifier(
    labelCol="label",
    featuresCol="features",
    maxIter=10,             # Number of boosting iterations
    maxDepth=5,             # Maximum depth of each tree
    stepSize=0.1,           # Learning rate
    seed=42                 # For reproducibility
)

# Spark's GBTClassifier only accepts binary labels and fails on a multi-class
# label column. When more than two label values are present, train one binary
# GBT per class through OneVsRest; binary data keeps the plain GBT.
gbt_label_count = train_data.select("label").distinct().count()
if gbt_label_count > 2:
    gbt_estimator = OneVsRest(classifier=gbt, labelCol="label", featuresCol="features")
else:
    gbt_estimator = gbt

# Train the model
gbt_model = gbt_estimator.fit(train_data)

# Make predictions
gbt_train_predictions = gbt_model.transform(train_data)
gbt_val_predictions = gbt_model.transform(val_data)

# Calculate F1 scores
gbt_train_f1 = evaluator.evaluate(gbt_train_predictions)
gbt_val_f1 = evaluator.evaluate(gbt_val_predictions)

print("GBDT - Training F1 Score:", gbt_train_f1)
print("GBDT - Validation F1 Score:", gbt_val_f1)

# Extract predictions for later evaluation
gbt_y_pred, gbt_y_true = get_prediction_labels(gbt_val_predictions)

## 7. Multilayer Perceptron (MLP) Model

In [None]:
# Input width = length of one assembled feature vector;
# output width = number of distinct labels seen in the training split.
# NOTE(review): this presumes labels are the integers 0..k-1 - confirm.
first_feature_vector = train_data.select("features").first()[0]
num_features = len(first_feature_vector)
num_classes = train_data.select("label").distinct().count()

# Network shape: input -> 2x features -> 1x features -> one unit per class
wide_hidden_size = num_features * 2
layers = [num_features, wide_hidden_size, num_features, num_classes]

# Configure the perceptron
mlp = MultilayerPerceptronClassifier(
    labelCol="label",
    featuresCol="features",
    layers=layers,
    maxIter=100,    # upper bound on training iterations
    blockSize=128,  # block size passed to the trainer
    seed=42,        # fixed seed for reproducibility
)

# Train on the training split
mlp_model = mlp.fit(train_data)

# Score the training and validation splits with the existing F1 evaluator
mlp_train_predictions = mlp_model.transform(train_data)
mlp_val_predictions = mlp_model.transform(val_data)

mlp_train_f1 = evaluator.evaluate(mlp_train_predictions)
mlp_val_f1 = evaluator.evaluate(mlp_val_predictions)

print("MLP - Training F1 Score:", mlp_train_f1)
print("MLP - Validation F1 Score:", mlp_val_f1)

# Keep validation labels for later evaluation
mlp_y_pred, mlp_y_true = get_prediction_labels(mlp_val_predictions)

## 8. XGBoost and LightGBM (Commented Out - Requires Additional Dependencies)

To use XGBoost and LightGBM with PySpark, you'll need to add the following dependencies to your cluster:
- For XGBoost: `com.github.ozancicek:spark-xgboost_2.12:0.2.1`
- For LightGBM: `com.microsoft.ml.spark:mmlspark_2.12:1.0.0-rc1`

Uncomment the code below after adding these dependencies to your cluster.

In [None]:
# XGBoost implementation (uncomment after adding required dependencies)
#
# The code below is wrapped in a triple-quoted string, so none of it runs as
# written. Once enabled it follows the same steps as the other model cells:
# build the classifier, fit on train_data, score train/validation F1 with the
# existing `evaluator`, and keep validation labels and probabilities.
#
# NOTE(review): the import below uses a JVM-style package path
# (com.github.ozancicek...). Confirm the Python module name that the chosen
# XGBoost dependency actually exposes before enabling this cell.

'''
from com.github.ozancicek.spark.ml.xgboost import XGBoostClassifier

# Initialize XGBoost Classifier
xgb = XGBoostClassifier(
    labelCol="label",
    featuresCol="features",
    numRound=100,
    maxDepth=5,
    eta=0.1,
    seed=42
)

# Train the model
xgb_model = xgb.fit(train_data)

# Make predictions
xgb_train_predictions = xgb_model.transform(train_data)
xgb_val_predictions = xgb_model.transform(val_data)

# Calculate F1 scores
xgb_train_f1 = evaluator.evaluate(xgb_train_predictions)
xgb_val_f1 = evaluator.evaluate(xgb_val_predictions)

print("XGBoost - Training F1 Score:", xgb_train_f1)
print("XGBoost - Validation F1 Score:", xgb_val_f1)

# Extract predictions for later evaluation
xgb_y_pred, xgb_y_true = get_prediction_labels(xgb_val_predictions)
xgb_y_pred_proba = get_prediction_probabilities(xgb_val_predictions)
'''

In [None]:
# LightGBM implementation (uncomment after adding required dependencies)
#
# The code below is wrapped in a triple-quoted string, so none of it runs as
# written. Once enabled it follows the same steps as the other model cells:
# build the classifier, fit on train_data, score train/validation F1 with the
# existing `evaluator`, and keep validation labels and probabilities.

'''
# The Python bindings of the mmlspark package live under the `mmlspark` module;
# `com.microsoft.ml.spark` is the JVM package path and is not importable from Python.
from mmlspark.lightgbm import LightGBMClassifier

# Initialize LightGBM Classifier
# NOTE(review): for more than two classes LightGBM presumably needs
# objective="multiclass" - confirm against the mmlspark documentation.
lgbm = LightGBMClassifier(
    labelCol="label",
    featuresCol="features",
    numLeaves=31,
    numIterations=100,
    learningRate=0.1
)

# Train the model
lgbm_model = lgbm.fit(train_data)

# Make predictions
lgbm_train_predictions = lgbm_model.transform(train_data)
lgbm_val_predictions = lgbm_model.transform(val_data)

# Calculate F1 scores
lgbm_train_f1 = evaluator.evaluate(lgbm_train_predictions)
lgbm_val_f1 = evaluator.evaluate(lgbm_val_predictions)

print("LightGBM - Training F1 Score:", lgbm_train_f1)
print("LightGBM - Validation F1 Score:", lgbm_val_f1)

# Extract predictions for later evaluation
lgbm_y_pred, lgbm_y_true = get_prediction_labels(lgbm_val_predictions)
lgbm_y_pred_proba = get_prediction_probabilities(lgbm_val_predictions)
'''

## 9. Evaluation Metrics and Visualizations

### 9.1. Confusion Matrices

In [None]:
# One validation-set confusion matrix per model, in the order they were trained
print("Confusion Matrices:")
confusion_inputs = [
    ("Logistic Regression", lr_y_true, lr_y_pred),
    ("Random Forest", rf_y_true, rf_y_pred),
    ("GBDT", gbt_y_true, gbt_y_pred),
    ("MLP", mlp_y_true, mlp_y_pred),
]
for model_label, true_vals, pred_vals in confusion_inputs:
    plot_confusion_matrix(true_vals, pred_vals, f"{model_label} Confusion Matrix")

### 9.2. Classification Reports

In [None]:
# One validation-set classification report per model
report_inputs = [
    ("Logistic Regression", lr_y_true, lr_y_pred),
    ("Random Forest", rf_y_true, rf_y_pred),
    ("GBDT", gbt_y_true, gbt_y_pred),
    ("MLP", mlp_y_true, mlp_y_pred),
]
for position, (model_label, true_vals, pred_vals) in enumerate(report_inputs):
    heading = f"{model_label} Classification Report:"
    # Every heading after the first is preceded by a blank line
    print(heading if position == 0 else "\n" + heading)
    print_classification_report(true_vals, pred_vals)

### 9.3. ROC Curves and AUC

In [None]:
# Number of distinct labels in the training split = number of ROC curves per plot
distinct_labels = train_data.select("label").distinct()
num_classes = int(distinct_labels.count())

# ROC curves for the models whose validation probabilities were kept
print("ROC Curves and AUC:")

lr_auc = plot_roc_curve(lr_y_true, lr_y_pred_proba, num_classes)
print("Logistic Regression Average AUC: {:.4f}".format(lr_auc))

rf_auc = plot_roc_curve(rf_y_true, rf_y_pred_proba, num_classes)
print("Random Forest Average AUC: {:.4f}".format(rf_auc))

### 9.4. Precision-Recall Curves

In [None]:
# Precision-recall curves for the models whose validation probabilities were kept
print("Precision-Recall Curves:")

lr_avg_prec = plot_precision_recall_curve(lr_y_true, lr_y_pred_proba, num_classes)
print("Logistic Regression Average Precision: {:.4f}".format(lr_avg_prec))

rf_avg_prec = plot_precision_recall_curve(rf_y_true, rf_y_pred_proba, num_classes)
print("Random Forest Average Precision: {:.4f}".format(rf_avg_prec))

## 10. Model Comparison

In [None]:
# (model name, training F1, validation F1) for every trained model
score_table = [
    ("Logistic Regression", score_train, score_val),
    ("Logistic Regression (OVR)", score_train_ovr, score_val_ovr),
    ("Random Forest", rf_train_f1, rf_val_f1),
    ("GBDT", gbt_train_f1, gbt_val_f1),
    ("MLP", mlp_train_f1, mlp_val_f1),
]
models, train_scores, val_scores = (list(column) for column in zip(*score_table))

# Tabular comparison
model_comparison = pd.DataFrame({
    'Model': models,
    'Training F1': train_scores,
    'Validation F1': val_scores,
})

print("Model Performance Comparison:")
print(model_comparison)

# Grouped bar chart: one training bar and one validation bar per model
plt.figure(figsize=(12, 6))
ind = np.arange(len(models))
width = 0.35

plt.bar(ind - width/2, train_scores, width, label='Training F1')
plt.bar(ind + width/2, val_scores, width, label='Validation F1')

plt.title('Model Comparison')
plt.ylabel('F1 Score')
plt.xticks(ind, models, rotation=15)
plt.legend(loc='best')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Write each score just above its bar (training bars first, then validation bars)
for offset, scores in ((-width/2, train_scores), (width/2, val_scores)):
    for i, v in enumerate(scores):
        plt.text(i + offset, v + 0.01, f'{v:.4f}', ha='center')

plt.tight_layout()
plt.show()

## 11. Test Set Evaluation with Best Model

Based on the validation results, let's evaluate the best performing model on the test set.

In [None]:
# Find the best model based on validation F1 scores
best_val_f1 = max(val_scores)
best_model_index = val_scores.index(best_val_f1)
best_model_name = models[best_model_index]
print(f"Best Model: {best_model_name} with Validation F1: {best_val_f1:.4f}")

# Map each name used in `models` to its fitted model object
fitted_models = {
    "Logistic Regression": lr_model,
    "Logistic Regression (OVR)": lr_ovr_model,
    "Random Forest": rf_model,
    "GBDT": gbt_model,
    "MLP": mlp_model,
}
best_model = fitted_models[best_model_name]
test_predictions = best_model.transform(test_data)

# Evaluate on test set
test_f1 = evaluator.evaluate(test_predictions)
print(f"Test F1 Score with {best_model_name}: {test_f1:.4f}")

# Get predictions and true labels
test_y_pred, test_y_true = get_prediction_labels(test_predictions)

# Plot confusion matrix for test set
plot_confusion_matrix(test_y_true, test_y_pred, f"{best_model_name} Confusion Matrix (Test Set)")

# Print classification report for test set
print(f"\n{best_model_name} Classification Report (Test Set):")
print_classification_report(test_y_true, test_y_pred)

# Plot ROC and PR curves whenever the winning model emitted class
# probabilities. Checking for the column instead of a fixed list of model
# names means any model that produces "probability" is covered.
if "probability" in test_predictions.columns:
    test_y_pred_proba = get_prediction_probabilities(test_predictions)

    # Plot ROC curve
    test_auc = plot_roc_curve(test_y_true, test_y_pred_proba, num_classes)
    print(f"{best_model_name} Average AUC (Test Set): {test_auc:.4f}")

    # Plot Precision-Recall curve
    test_avg_prec = plot_precision_recall_curve(test_y_true, test_y_pred_proba, num_classes)
    print(f"{best_model_name} Average Precision (Test Set): {test_avg_prec:.4f}")

## 12. Conclusion

In this notebook, we've implemented and evaluated multiple classification models using PySpark:

1. Logistic Regression (standard and One-vs-Rest)
2. Random Forest
3. Gradient Boosted Decision Trees (GBDT)
4. Multilayer Perceptron (MLP)

We've also provided code for XGBoost and LightGBM which require additional dependencies.

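The models were evaluated on the validation set with:
- Training and validation F1 scores (all models)
- Confusion matrices (all models except One-vs-Rest Logistic Regression)
- Detailed classification reports (all models except One-vs-Rest Logistic Regression)
- ROC curves and AUC values (Logistic Regression and Random Forest)
- Precision-Recall curves (Logistic Regression and Random Forest)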

The best performing model was evaluated on the test set to provide a final assessment of its performance.

This comprehensive evaluation allows for informed model selection based on the specific requirements of the classification task.