# Set Up PySpark

In [None]:
!pip install pyspark
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark
# Let findspark locate the Spark installation so the pyspark imports in the next cell resolve.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [None]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Classification")
    .getOrCreate()
)

# Load data from Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data-loading cell reads from under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
from pathlib import Path
# Location of the dataset inside the mounted Drive.
current_dir = '/content/drive/MyDrive'
data_relative_path = 'Data'

# Read the CSV, treating the first row as the header and letting Spark infer column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(os.path.join(current_dir, data_relative_path, "featureSelected_encoded_merged_application.csv"))
)

# Split the data (with manual stratification)

In [None]:
# `F` is used by this cell and by the shuffle / class-balance cells that follow,
# but no cell in the notebook imports it, so a fresh "Restart & Run All" stops
# here with a NameError. Import it where it is first needed.
from pyspark.sql import functions as F

seed = 42

# Manual stratification: split each class on its own with the same 70/10/20
# weights, then recombine, so every split keeps roughly the original class ratio.
class_0 = df.filter(F.col("TARGET") == 0)
class_1 = df.filter(F.col("TARGET") == 1)

class_0_train, class_0_val, class_0_test = class_0.randomSplit([0.7, 0.1, 0.2], seed=seed)
class_1_train, class_1_val, class_1_test = class_1.randomSplit([0.7, 0.1, 0.2], seed=seed)

# Recombine the per-class pieces; rows are still grouped by class at this point.
train_df = class_0_train.union(class_1_train)
val_df = class_0_val.union(class_1_val)
test_df = class_0_test.union(class_1_test)

In [None]:
# Shuffle each split with a seeded random sort key so rows are no longer grouped
# by class (the splits were built by union-ing the class-0 and class-1 pieces).
# Cache the shuffled result: every split is reused by many later actions
# (counts, model fits, evaluations), and without caching each of those actions
# re-runs the CSV read, the split and the global sort from scratch.
train_df = train_df.orderBy(F.rand(seed)).cache()
val_df = val_df.orderBy(F.rand(seed)).cache()
test_df = test_df.orderBy(F.rand(seed)).cache()

In [None]:
# Report (row count, column count) for the train, validation and test splits, in that order.
for split_df in (train_df, val_df, test_df):
    print((split_df.count(), len(split_df.columns)))

(162700, 65)
(23077, 65)
(46419, 65)


## Check that the data is split with correct stratification (the classes are imbalanced)

In [None]:
# Class balance of the training split: rows per TARGET value and their share in percent.
total_count = train_df.count()
value_counts = train_df.groupBy("TARGET").count()

percentage_counts = value_counts.select(
    "*",
    ((value_counts["count"] / total_count) * 100).alias("percentage"),
)
percentage_counts.show()

+------+------+-----------------+
|TARGET| count|       percentage|
+------+------+-----------------+
|     0|149769|92.05224339274739|
|     1| 12931|7.947756607252613|
+------+------+-----------------+



In [None]:
# Class balance of the validation split: rows per TARGET value and their share in percent.
total_count = val_df.count()
value_counts = val_df.groupBy("TARGET").count()

percentage_counts = value_counts.select(
    "*",
    ((value_counts["count"] / total_count) * 100).alias("percentage"),
)
percentage_counts.show()

+------+-----+-----------------+
|TARGET|count|       percentage|
+------+-----+-----------------+
|     0|21276| 92.1956926810244|
|     1| 1801|7.804307318975604|
+------+-----+-----------------+



In [None]:
# Class balance of the test split: rows per TARGET value and their share in percent.
total_count = test_df.count()
value_counts = test_df.groupBy("TARGET").count()

percentage_counts = value_counts.select(
    "*",
    ((value_counts["count"] / total_count) * 100).alias("percentage"),
)
percentage_counts.show()

+------+-----+------------------+
|TARGET|count|        percentage|
+------+-----+------------------+
|     0|42855| 92.32210948103148|
|     1| 3564|7.6778905189685265|
+------+-----+------------------+



In [None]:
target_column = 'TARGET'

# For each split, separate the predictors (every column except the label)
# from the label column itself.
# NOTE(review): X_train / X_val / X_test are re-bound by the VectorAssembler
# cell further down, and the y_* frames are not referenced again in the
# visible notebook -- confirm whether this cell is still needed.
X_train, y_train = train_df.drop(target_column), train_df.select(target_column)
X_val, y_val = val_df.drop(target_column), val_df.select(target_column)
X_test, y_test = test_df.drop(target_column), test_df.select(target_column)

# Modeling

In [None]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from xgboost import XGBClassifier
import numpy as np
from pyspark.ml.feature import VectorAssembler

In [None]:
# Number of columns in the training split (the TARGET label column is included in this count).
len(train_df.columns)

65

In [None]:
# Start from a fresh copy of every column name in the training split.
feature_columns = list(train_df.columns)

In [None]:
# Drop the label from the feature list. Rebuilding the list (rather than calling
# list.remove) keeps this cell re-runnable: a second .remove("TARGET") on the
# already-filtered list would raise ValueError.
feature_columns = [name for name in feature_columns if name != "TARGET"]

In [None]:
# Pack all feature columns into one vector column named "features",
# which is the column the classifiers below are configured to read.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Transform the full splits (label included), so TARGET stays available as labelCol.
X_train, X_val, X_test = (
    assembler.transform(split_df) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Shared evaluators for the tree models below.
# binary_evaluator is created without a metricName, so it reports the evaluator's
# default metric (printed as "AUC" in the result cells).
binary_evaluator = BinaryClassificationEvaluator(labelCol="TARGET")

# Accuracy and F1 are both computed from the hard "prediction" column.
# NOTE(review): with ~92% of rows in class 0 (see the class-balance tables above),
# a model that always predicts 0 already scores ~0.92 accuracy -- read these
# two metrics with that baseline in mind.
accuracy_evaluator, f1_evaluator = (
    MulticlassClassificationEvaluator(labelCol="TARGET", metricName=metric)
    for metric in ("accuracy", "f1")
)

## 1. RandomForest Classifier

In [None]:
# Fit a random forest on the training split; only the columns and the seed are set,
# every other hyperparameter is left at its default.
rf = RandomForestClassifier(featuresCol="features", labelCol="TARGET", seed=42)
rf_model = rf.fit(X_train)

# Score all three splits with the fitted model.
rf_train_predictions, rf_val_predictions, rf_test_predictions = (
    rf_model.transform(split_df) for split_df in (X_train, X_val, X_test)
)

In [None]:
# AUC of the random forest on train / validation / test, in that order.
rf_train_auc, rf_val_auc, rf_test_auc = (
    binary_evaluator.evaluate(predictions)
    for predictions in (rf_train_predictions, rf_val_predictions, rf_test_predictions)
)

In [None]:
# Accuracy of the random forest on train / validation / test, in that order.
rf_train_accuracy, rf_val_accuracy, rf_test_accuracy = (
    accuracy_evaluator.evaluate(predictions)
    for predictions in (rf_train_predictions, rf_val_predictions, rf_test_predictions)
)

In [None]:
# F1 score of the random forest on train / validation / test, in that order.
rf_train_f1, rf_val_f1, rf_test_f1 = (
    f1_evaluator.evaluate(predictions)
    for predictions in (rf_train_predictions, rf_val_predictions, rf_test_predictions)
)

In [None]:
# One line per metric, each listing the train / validation / test value.
rf_report = [
    f"RandomForest AUC - Train: {rf_train_auc}, Validation: {rf_val_auc}, Test: {rf_test_auc}",
    f"RandomForest Accuracy - Train: {rf_train_accuracy}, Validation: {rf_val_accuracy}, Test: {rf_test_accuracy}",
    f"RandomForest F1 Score - Train: {rf_train_f1}, Validation: {rf_val_f1}, Test: {rf_test_f1}",
]
print("\n".join(rf_report))

RandomForest AUC - Train: 0.6565381114570914, Validation: 0.6476129960178587, Test: 0.6468449418542753
RandomForest Accuracy - Train: 0.9205224339274739, Validation: 0.921956926810244, Test: 0.9232210948103148
RandomForest F1 Score - Train: 0.882428173078826, Validation: 0.8845199005620704, Test: 0.8863642273919851


## 2. Gradient Boosting Classifier (GBT)

In [None]:
# Fit a gradient-boosted tree classifier on the training split; only the columns
# and the seed are set, everything else stays at its default.
gbt = GBTClassifier(featuresCol="features", labelCol="TARGET", seed=42)
gbt_model = gbt.fit(X_train)

# Score all three splits with the fitted model.
gbt_train_predictions, gbt_val_predictions, gbt_test_predictions = (
    gbt_model.transform(split_df) for split_df in (X_train, X_val, X_test)
)
gbt_prediction_sets = (gbt_train_predictions, gbt_val_predictions, gbt_test_predictions)

# Each metric is evaluated on train, validation and test, in that order.
gbt_train_auc, gbt_val_auc, gbt_test_auc = (
    binary_evaluator.evaluate(predictions) for predictions in gbt_prediction_sets
)
gbt_train_accuracy, gbt_val_accuracy, gbt_test_accuracy = (
    accuracy_evaluator.evaluate(predictions) for predictions in gbt_prediction_sets
)
gbt_train_f1, gbt_val_f1, gbt_test_f1 = (
    f1_evaluator.evaluate(predictions) for predictions in gbt_prediction_sets
)

gbt_report = [
    f"Gradient Boosting AUC - Train: {gbt_train_auc}, Validation: {gbt_val_auc}, Test: {gbt_test_auc}",
    f"Gradient Boosting Accuracy - Train: {gbt_train_accuracy}, Validation: {gbt_val_accuracy}, Test: {gbt_test_accuracy}",
    f"Gradient Boosting F1 Score - Train: {gbt_train_f1}, Validation: {gbt_val_f1}, Test: {gbt_test_f1}",
]
print("\n".join(gbt_report))

Gradient Boosting AUC - Train: 0.7073215529220184, Validation: 0.6771473990499939, Test: 0.6761514600234316
Gradient Boosting Accuracy - Train: 0.9205777504609711, Validation: 0.9219135936213546, Test: 0.923199551907624
Gradient Boosting F1 Score - Train: 0.8825761677401183, Validation: 0.8844982692049035, Test: 0.8863534729873809


## 3. Decision Tree Classifier

In [None]:
# Fit a single decision tree on the training split; only the columns and the
# seed are set, everything else stays at its default.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="TARGET", seed=42)
dt_model = dt.fit(X_train)

# Score all three splits with the fitted model.
dt_train_predictions = dt_model.transform(X_train)
dt_val_predictions = dt_model.transform(X_val)
dt_test_predictions = dt_model.transform(X_test)

# For a single Spark decision tree, "rawPrediction" holds the training-label
# counts of the leaf that made the prediction, not a normalised score. Ranking
# rows by the raw class-1 count therefore favours large leaves over pure ones
# and gives a meaningless ROC curve (consistent with the below-0.5 AUC this
# cell recorded when it used the shared evaluator). Rank on the class
# probabilities instead, using an evaluator local to this model so the shared
# `binary_evaluator` is left untouched.
dt_auc_evaluator = BinaryClassificationEvaluator(labelCol="TARGET", rawPredictionCol="probability")

dt_train_auc = dt_auc_evaluator.evaluate(dt_train_predictions)
dt_val_auc = dt_auc_evaluator.evaluate(dt_val_predictions)
dt_test_auc = dt_auc_evaluator.evaluate(dt_test_predictions)

# Accuracy and F1 use the hard "prediction" column and are unaffected by the above.
dt_train_accuracy = accuracy_evaluator.evaluate(dt_train_predictions)
dt_val_accuracy = accuracy_evaluator.evaluate(dt_val_predictions)
dt_test_accuracy = accuracy_evaluator.evaluate(dt_test_predictions)

dt_train_f1 = f1_evaluator.evaluate(dt_train_predictions)
dt_val_f1 = f1_evaluator.evaluate(dt_val_predictions)
dt_test_f1 = f1_evaluator.evaluate(dt_test_predictions)

print(f"Decision Tree AUC - Train: {dt_train_auc}, Validation: {dt_val_auc}, Test: {dt_test_auc}")
print(f"Decision Tree Accuracy - Train: {dt_train_accuracy}, Validation: {dt_val_accuracy}, Test: {dt_test_accuracy}")
print(f"Decision Tree F1 Score - Train: {dt_train_f1}, Validation: {dt_val_f1}, Test: {dt_test_f1}")

Decision Tree AUC - Train: 0.4504909005748263, Validation: 0.44214254129043434, Test: 0.45441470539669887
Decision Tree Accuracy - Train: 0.9205593116164721, Validation: 0.921956926810244, Test: 0.923199551907624
Decision Tree F1 Score - Train: 0.8825549170153326, Validation: 0.8845199005620704, Test: 0.8863534729873809


## 4. XGBoost Classifier

In [None]:
from xgboost.spark import SparkXGBClassifier

In [None]:
# Settings for the Spark-distributed XGBoost classifier: column wiring first,
# then the booster hyperparameters.
# NOTE(review): missing=0.0 is forwarded to the DMatrix (see the training log
# below), i.e. XGBoost is told to treat 0.0 feature values as missing. The
# cross-validation cell later in the notebook does not set it -- confirm
# which behaviour is intended for these encoded features.
xgb_params = dict(
    features_col="features",
    label_col="TARGET",
    prediction_col="prediction",
    probability_col="probability",
    raw_prediction_col="rawPrediction",
    missing=0.0,
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1,
    seed=42,
)
xgb = SparkXGBClassifier(**xgb_params)

In [None]:
# Train the XGBoost classifier configured above on the assembled training split.
xgb_model = xgb.fit(X_train)

INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'seed': 42, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': 0.0}
INFO:XGBoost-PySpark:Finished xgboost training!


In [None]:
# Score all three splits with the fitted XGBoost model.
train_preds, val_preds, test_preds = (
    xgb_model.transform(split_df) for split_df in (X_train, X_val, X_test)
)

In [None]:
# Evaluate the XGBoost predictions; each metric is computed on train,
# validation and test, in that order.
# NOTE(review): the recorded run shows AUC of about 0.50 here while accuracy
# and F1 match the other models -- worth checking the score column this
# evaluator ranks on before trusting the AUC figure.
xgb_prediction_sets = (train_preds, val_preds, test_preds)

train_auc, val_auc, test_auc = (
    binary_evaluator.evaluate(predictions) for predictions in xgb_prediction_sets
)
train_accuracy, val_accuracy, test_accuracy = (
    accuracy_evaluator.evaluate(predictions) for predictions in xgb_prediction_sets
)
train_f1, val_f1, test_f1 = (
    f1_evaluator.evaluate(predictions) for predictions in xgb_prediction_sets
)

# Results
xgb_report = [
    f"XGBoost AUC - Train: {train_auc:.4f}, Validation: {val_auc:.4f}, Test: {test_auc:.4f}",
    f"XGBoost Accuracy - Train: {train_accuracy:.4f}, Validation: {val_accuracy:.4f}, Test: {test_accuracy:.4f}",
    f"XGBoost F1 Score - Train: {train_f1:.4f}, Validation: {val_f1:.4f}, Test: {test_f1:.4f}",
]
print("\n".join(xgb_report))

XGBoost AUC - Train: 0.5033, Validation: 0.5010, Test: 0.5010
XGBoost Accuracy - Train: 0.9210, Validation: 0.9220, Test: 0.9232
XGBoost F1 Score - Train: 0.8837, Validation: 0.8849, Test: 0.8867


## Non-tree-based model (Logistic Regression classifier)

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic regression baseline with explicit output column names,
# at most 100 iterations and regParam=0.01.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="TARGET",
    predictionCol="prediction",
    probabilityCol="probability",
    rawPredictionCol="rawPrediction",
    maxIter=100,
    regParam=0.01,
)
lr_model = lr.fit(X_train)

# Score all three splits with the fitted model.
lr_train_preds, lr_val_preds, lr_test_preds = (
    lr_model.transform(split_df) for split_df in (X_train, X_val, X_test)
)
lr_prediction_sets = (lr_train_preds, lr_val_preds, lr_test_preds)

# NOTE: this re-binds the three shared evaluator names. From here on,
# binary_evaluator ranks on the "probability" column rather than the default
# raw-prediction column, so re-running an earlier model cell after this one
# will use the new configuration.
binary_evaluator = BinaryClassificationEvaluator(labelCol="TARGET", rawPredictionCol="probability")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="TARGET", predictionCol="prediction", metricName="accuracy")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="TARGET", predictionCol="prediction", metricName="f1")

# The metric variables below reuse (and overwrite) the names from the XGBoost cell.
train_auc, val_auc, test_auc = (
    binary_evaluator.evaluate(predictions) for predictions in lr_prediction_sets
)
train_accuracy, val_accuracy, test_accuracy = (
    accuracy_evaluator.evaluate(predictions) for predictions in lr_prediction_sets
)
train_f1, val_f1, test_f1 = (
    f1_evaluator.evaluate(predictions) for predictions in lr_prediction_sets
)

# Results
lr_report = [
    f"Logistic Regression AUC - Train: {train_auc:.4f}, Validation: {val_auc:.4f}, Test: {test_auc:.4f}",
    f"Logistic Regression Accuracy - Train: {train_accuracy:.4f}, Validation: {val_accuracy:.4f}, Test: {test_accuracy:.4f}",
    f"Logistic Regression F1 Score - Train: {train_f1:.4f}, Validation: {val_f1:.4f}, Test: {test_f1:.4f}",
]
print("\n".join(lr_report))

Logistic Regression AUC - Train: 0.6942, Validation: 0.6905, Test: 0.6859
Logistic Regression Accuracy - Train: 0.9204, Validation: 0.9219, Test: 0.9231
Logistic Regression F1 Score - Train: 0.8826, Validation: 0.8848, Test: 0.8865


## K-Fold Cross-Validation

## Join the train & validation sets

In [None]:
# Stack the assembled training and validation rows into one DataFrame;
# cross-validation below draws its own folds from this combined set.
train_val_data = X_train.union(X_val)

## K-Fold CV on the XGBoost classifier to tune some hyperparameters

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# Base estimator for the grid search. Every other model in this notebook is
# seeded with 42; the seed is set here as well so the boosting runs are
# repeatable from one execution of the notebook to the next.
xgb = SparkXGBClassifier(
    features_col="features",
    label_col="TARGET",
    device="cpu",
    num_workers=2,
    seed=42,
)


# Model selection metric: F1 computed from the hard "prediction" column.
evaluator = MulticlassClassificationEvaluator(labelCol="TARGET", metricName="f1")

# 2 x 2 x 2 = 8 hyperparameter combinations.
paramGrid = (ParamGridBuilder()
    .addGrid(xgb.max_depth, [6, 10])
    .addGrid(xgb.learning_rate, [0.1, 0.3])
    .addGrid(xgb.n_estimators, [100,200])
    .build())


# 3-fold cross-validation over the grid, fitting up to two models concurrently.
# The seed fixes the fold assignment; without it the folds (and therefore the
# selected "best" parameters) can change between runs.
crossval = CrossValidator(
    estimator=xgb,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=2,
    seed=42,
)

In [None]:
# Run the cross-validated grid search on the combined train+validation data.
# This is the expensive cell: every grid combination is trained once per fold.
cv_model = crossval.fit(train_val_data)

INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 2 workers with
	booster params: {'device': 'cpu', 'learning_rate': 0.3, 'max_depth': 10, 'objective': 'binary:logistic', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 2 workers with
	booster params: {'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 2 workers with
	booster params: {'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost trainin

### Try the best model

In [None]:
# Pull out the winning model from the grid search and its full parameter map.
best_model = cv_model.bestModel
best_params = best_model.extractParamMap()

# List every parameter as "name = value", one per line.
param_lines = [f"{param.name} = {value}" for param, value in best_params.items()]
for line in param_lines:
    print(line)

enable_sparse_data_optim = False
featuresCol = features
features_cols = []
labelCol = TARGET
predictionCol = prediction
probabilityCol = probability
rawPredictionCol = rawPrediction
arbitrary_params_dict = {}
base_score = None
booster = None
callbacks = None
colsample_bylevel = None
colsample_bynode = None
colsample_bytree = None
device = cpu
early_stopping_rounds = None
eval_metric = None
feature_names = None
feature_types = None
feature_weights = None
force_repartition = False
gamma = None
grow_policy = None
importance_type = None
interaction_constraints = None
iteration_range = None
learning_rate = 0.3
max_bin = None
max_cat_threshold = None
max_cat_to_onehot = None
max_delta_step = None
max_depth = 10
max_leaves = None
min_child_weight = None
missing = nan
monotone_constraints = None
multi_strategy = None
n_estimators = 200
num_parallel_tree = None
num_workers = 2
objective = None
random_state = None
reg_alpha = None
reg_lambda = None
repartition_random_shuffle = False
sampling_met

In [None]:
# Score the selected model on the data used for cross-validation and on the
# held-out test split.
# NOTE(review): train_val_data is the set the search was fitted on, so the
# Train+Val figure is an in-sample score; the test figure is the one to
# compare against the earlier models.
train_val_predictions, test_predictions = (
    best_model.transform(frame) for frame in (train_val_data, X_test)
)

train_val_f1, test_f1 = (
    evaluator.evaluate(predictions)
    for predictions in (train_val_predictions, test_predictions)
)

print(f"Cross-Validated XGBoost F1 - Train+Val: {train_val_f1:.4f}, Test: {test_f1:.4f}")

Cross-Validated XGBoost F1 - Train+Val: 0.9985, Test: 0.8875
