# Scrapped

## Models

In [None]:
# The three candidate classifiers read the same feature column and the
# "label" column; each is capped at 10 iterations (or 10 trees for the forest).
_shared_columns = {"featuresCol": feature_col, "labelCol": "label"}

lr = LogisticRegression(maxIter=10, **_shared_columns)
rf = RandomForestClassifier(numTrees=10, **_shared_columns)
gbt = GBTClassifier(maxIter=10, **_shared_columns)

## Parameter search grids for hyperparameter tuning

In [None]:
# Candidate hyperparameter values for each classifier.
# Logistic regression: regularisation strength x elastic-net mixing.
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Random forest: number of trees x maximum tree depth.
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 30])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .build()
)

# Gradient-boosted trees: maximum tree depth x step size.
gbt_param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [5, 10, 15])
    .addGrid(gbt.stepSize, [0.1, 0.2])
    .build()
)

# Shared evaluator: compares "prediction" against "label", reporting accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Every pipeline runs the label indexer first and the classifier second.
lr_pipeline, rf_pipeline, gbt_pipeline = (
    Pipeline(stages=[label_indexer, classifier]) for classifier in (lr, rf, gbt)
)

## Cross-validation (CV) setup for each model

In [None]:
# One 3-fold cross-validator per pipeline. They share the evaluator, the fold
# count and the seed, and differ only in the pipeline / parameter grid pair.
_cv_settings = {"evaluator": evaluator, "numFolds": 3, "seed": 42}

lr_cv = CrossValidator(
    estimator=lr_pipeline, estimatorParamMaps=lr_param_grid, **_cv_settings
)
rf_cv = CrossValidator(
    estimator=rf_pipeline, estimatorParamMaps=rf_param_grid, **_cv_settings
)
gbt_cv = CrossValidator(
    estimator=gbt_pipeline, estimatorParamMaps=gbt_param_grid, **_cv_settings
)


In [None]:
def _announce_and_fit(banner, pipeline):
    """Print ``banner``, then fit ``pipeline`` on ``train_data`` and return the result."""
    print(banner)
    return pipeline.fit(train_data)


# NOTE: the plain pipelines are fitted here, not the lr_cv / rf_cv / gbt_cv
# cross-validators, so no hyperparameter search takes place in this cell.
lr_model = _announce_and_fit("Training Logistic Regression model...", lr_pipeline)
rf_model = _announce_and_fit("Training Random Forest model...", rf_pipeline)
gbt_model = _announce_and_fit("Training Gradient Boosted Trees model...", gbt_pipeline)

Training Logistic Regression model...


ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
def evaluate_model(model, name):
    """Score a fitted model on ``test_data`` and print a short metrics report.

    Args:
        model: Fitted model exposing ``transform``; it is applied to the
            notebook-level ``test_data``.
        name: Display name printed in the report header.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, predictions)``, where
        ``predictions`` is the result of ``model.transform(test_data)``.
    """
    predictions = model.transform(test_data)

    def score(metric):
        # Override the metric for this single call instead of calling
        # setMetricName on the shared ``evaluator``. Mutating it left the
        # evaluator on "f1", so every later call reported F1 as "Accuracy".
        return evaluator.evaluate(predictions, {evaluator.metricName: metric})

    accuracy = score("accuracy")

    # For multiclass classification, we can also look at the weighted
    # precision and recall, and at f1.
    precision = score("weightedPrecision")
    recall = score("weightedRecall")
    f1 = score("f1")

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-------------------------------------")

    return accuracy, precision, recall, f1, predictions

# Each result is the tuple (accuracy, precision, recall, f1, predictions)
# returned by evaluate_model for the corresponding fitted model.
lr_metrics = evaluate_model(model=lr_model, name="Logistic Regression")
rf_metrics = evaluate_model(model=rf_model, name="Random Forest")
gbt_metrics = evaluate_model(model=gbt_model, name="Gradient Boosted Trees")

In [None]:
# Candidates as (display name, fitted model, metrics tuple). The metrics tuple
# is (accuracy, precision, recall, f1, predictions) as built by evaluate_model.
models = [
    ("Logistic Regression", lr_model, lr_metrics),
    ("Random Forest", rf_model, rf_metrics),
    ("Gradient Boosted Trees", gbt_model, gbt_metrics),
]


def _f1_of(candidate):
    """Return the F1 entry (index 3) of a candidate's metrics tuple."""
    return candidate[2][3]


# Highest F1 wins; on a tie max() keeps the earliest candidate in the list.
best_model_name, best_model, best_metrics = max(models, key=_f1_of)
best_predictions = best_metrics[4]

print(f"Best model: {best_model_name} with F1 score: {best_metrics[3]:.4f}")

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Confusion matrix: one row per predicted value, one column per label value;
# (prediction, label) combinations that never occur are filled with 0.
pred_and_labels = best_predictions.select("prediction", "label")
confusion_matrix = pred_and_labels.groupBy("prediction").pivot("label").count().fillna(0)
print("Confusion Matrix:")
confusion_matrix.show()

if best_model_name in ["Random Forest", "Gradient Boosted Trees"]:
    # The candidate models are fitted Pipelines, which have no ``bestModel``
    # attribute (only a fitted CrossValidator does). Unwrap it when present so
    # this cell works for either kind of fitted model.
    fitted_pipeline = getattr(best_model, "bestModel", best_model)
    # The classifier is the last pipeline stage, after the label indexer.
    feature_importances = fitted_pipeline.stages[-1].featureImportances

    # Map feature indices to feature names
    # NOTE(review): assumes all_features has one name per slot of the feature
    # vector, in the same order -- confirm against how the vector is assembled.
    feature_names = all_features
    importances = [(feature_names[i], importance)
                  for i, importance in enumerate(feature_importances)]

    # Sort by importance, largest first. list.sort() takes ``reverse``; the
    # previous ``descending=True`` keyword raised a TypeError.
    importances.sort(key=lambda x: x[1], reverse=True)

    print("Feature Importances:")
    for name, importance in importances[:20]:  # Top 20 features
        print(f"{name}: {importance:.4f}")
        