In [1]:
!pip install pyspark



In [80]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import Imputer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [3]:
spark = SparkSession.builder.appName("mlbuilder").getOrCreate()

In [4]:
training_data = spark.read.csv('cs-training.csv',header=True,inferSchema=True)
test_data = spark.read.csv('cs-test.csv',header=True,inferSchema=True)

In [5]:
training_data = training_data.drop('Unnamed: 0')
test_data = test_data.drop('Unnamed: 0')

In [6]:
test_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- SeriousDlqin2yrs: string (nullable = true)
 |-- RevolvingUtilizationOfUnsecuredLines: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- NumberOfTime30-59DaysPastDueNotWorse: integer (nullable = true)
 |-- DebtRatio: double (nullable = true)
 |-- MonthlyIncome: string (nullable = true)
 |-- NumberOfOpenCreditLinesAndLoans: integer (nullable = true)
 |-- NumberOfTimes90DaysLate: integer (nullable = true)
 |-- NumberRealEstateLoansOrLines: integer (nullable = true)
 |-- NumberOfTime60-89DaysPastDueNotWorse: integer (nullable = true)
 |-- NumberOfDependents: string (nullable = true)



In [7]:
string_to_number = ['MonthlyIncome','NumberOfDependents']

In [8]:
for col_name in string_to_number:
    training_data = training_data.withColumn(col_name,col(col_name).cast('int'))
    test_data = test_data.withColumn(col_name,col(col_name).cast('int'))

In [9]:
imputer = Imputer(inputCols=['MonthlyIncome', 'NumberOfDependents'], outputCols=['MonthlyIncome', 'NumberOfDependents'])
training_data = imputer.fit(training_data).transform(training_data)
test_data = imputer.fit(test_data).transform(test_data)

In [10]:
training_data.show()

+---+----------------+------------------------------------+---+------------------------------------+-----------+-------------+-------------------------------+-----------------------+----------------------------+------------------------------------+------------------+
|_c0|SeriousDlqin2yrs|RevolvingUtilizationOfUnsecuredLines|age|NumberOfTime30-59DaysPastDueNotWorse|  DebtRatio|MonthlyIncome|NumberOfOpenCreditLinesAndLoans|NumberOfTimes90DaysLate|NumberRealEstateLoansOrLines|NumberOfTime60-89DaysPastDueNotWorse|NumberOfDependents|
+---+----------------+------------------------------------+---+------------------------------------+-----------+-------------+-------------------------------+-----------------------+----------------------------+------------------------------------+------------------+
|  1|               1|                         0.766126609| 45|                                   2|0.802982129|         9120|                             13|                      0|              

In [11]:
minority_class = training_data.filter(training_data.SeriousDlqin2yrs == 1)
majority_class = training_data.filter(training_data.SeriousDlqin2yrs == 0)

In [33]:
print('minority:',minority_class.count(),'majority:',majority_class.count())

minority: 10026 majority: 139974


In [13]:
majority_class_downsampled = majority_class.sample(withReplacement=False, fraction=float(minority_class.count()) / float(majority_class.count()))


In [14]:
balanced_training_data = minority_class.union(majority_class_downsampled)


In [15]:
balanced_training_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- SeriousDlqin2yrs: integer (nullable = true)
 |-- RevolvingUtilizationOfUnsecuredLines: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- NumberOfTime30-59DaysPastDueNotWorse: integer (nullable = true)
 |-- DebtRatio: double (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- NumberOfOpenCreditLinesAndLoans: integer (nullable = true)
 |-- NumberOfTimes90DaysLate: integer (nullable = true)
 |-- NumberRealEstateLoansOrLines: integer (nullable = true)
 |-- NumberOfTime60-89DaysPastDueNotWorse: integer (nullable = true)
 |-- NumberOfDependents: integer (nullable = true)



In [16]:
features_columns = ['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans',
                    'NumberOfTimes90DaysLate','NumberRealEstateLoansOrLines','NumberOfTime60-89DaysPastDueNotWorse','NumberOfDependents']

In [17]:
asserter = VectorAssembler(inputCols=features_columns,outputCol='features',handleInvalid='skip')

In [18]:
balanced_training_data= asserter.transform(balanced_training_data)

In [19]:
balanced_training_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- SeriousDlqin2yrs: integer (nullable = true)
 |-- RevolvingUtilizationOfUnsecuredLines: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- NumberOfTime30-59DaysPastDueNotWorse: integer (nullable = true)
 |-- DebtRatio: double (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- NumberOfOpenCreditLinesAndLoans: integer (nullable = true)
 |-- NumberOfTimes90DaysLate: integer (nullable = true)
 |-- NumberRealEstateLoansOrLines: integer (nullable = true)
 |-- NumberOfTime60-89DaysPastDueNotWorse: integer (nullable = true)
 |-- NumberOfDependents: integer (nullable = true)
 |-- features: vector (nullable = true)



In [20]:
test = asserter.transform(test_data)

In [21]:
selected_train = balanced_training_data.select('features','SeriousDlqin2yrs')
selected_test = selected_train.select('features','SeriousDlqin2yrs')

In [22]:
rf = RandomForestClassifier(labelCol='SeriousDlqin2yrs',featuresCol='features',numTrees=100,maxDepth=7)

In [23]:
model = rf.fit(selected_train)

In [24]:
predictions = model.transform(selected_test)

In [25]:
predictions.show()

+--------------------+----------------+--------------------+--------------------+----------+
|            features|SeriousDlqin2yrs|       rawPrediction|         probability|prediction|
+--------------------+----------------+--------------------+--------------------+----------+
|[0.766126609,45.0...|               1|[19.3875208507234...|[0.19387520850723...|       1.0|
|[0.964672555,40.0...|               1|[5.71758149077174...|[0.05717581490771...|       1.0|
|[0.025655677,38.0...|               1|[82.3508737872690...|[0.82350873787269...|       0.0|
|[0.392248482,50.0...|               1|[64.9198257343337...|[0.64919825734333...|       0.0|
|[0.728150491,31.0...|               1|[43.7152144463944...|[0.43715214446394...|       1.0|
|[0.13306279,49.0,...|               1|[55.9085110909077...|[0.55908511090907...|       0.0|
|[0.734477501,45.0...|               1|[42.5822006038673...|[0.42582200603867...|       1.0|
|[1.046279103,47.0...|               1|[21.6405757326391...|[0.2164057

In [48]:
train_prediction = model.transform(selected_train)

In [77]:
def evaluate_model(predicts):
    evaluators = BinaryClassificationEvaluator(labelCol="SeriousDlqin2yrs", rawPredictionCol="prediction",metricName="areaUnderROC")
    accuracy = evaluators.evaluate(predicts)

    confusion_matrix = predicts.groupBy('SeriousDlqin2yrs', 'prediction').count()
    confusion_matrix.show()

    tp = confusion_matrix.filter((col('SeriousDlqin2yrs') == 1) & (col('prediction') == 1)).select('count').collect()[0][0]
    fp = confusion_matrix.filter((col('SeriousDlqin2yrs') == 0) & (col('prediction') == 1)).select('count').collect()[0][0]
    tn = confusion_matrix.filter((col('SeriousDlqin2yrs') == 0) & (col('prediction') == 0)).select('count').collect()[0][0]
    fn = confusion_matrix.filter((col('SeriousDlqin2yrs') == 1) & (col('prediction') == 0)).select('count').collect()[0][0]

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1_score}")

In [78]:
evaluate_model(predictions)

+----------------+----------+-----+
|SeriousDlqin2yrs|prediction|count|
+----------------+----------+-----+
|               1|       0.0| 1954|
|               1|       1.0| 8072|
|               0|       0.0| 7811|
|               0|       1.0| 2282|
+----------------+----------+-----+

Accuracy: 0.7895047136831932
Precision: 0.77960208615028
Recall: 0.8051067225214442
F1-Score: 0.7921491658488714


In [79]:
evaluate_model(train_prediction)

+----------------+----------+-----+
|SeriousDlqin2yrs|prediction|count|
+----------------+----------+-----+
|               1|       0.0| 1954|
|               1|       1.0| 8072|
|               0|       0.0| 7811|
|               0|       1.0| 2282|
+----------------+----------+-----+

Accuracy: 0.7895047136831932
Precision: 0.77960208615028
Recall: 0.8051067225214442
F1-Score: 0.7921491658488714


In [53]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50, 100]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

In [54]:
evaluator = BinaryClassificationEvaluator(labelCol="SeriousDlqin2yrs", rawPredictionCol="prediction", metricName="areaUnderROC")

In [66]:
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=10)

In [67]:
cvModel = crossval.fit(selected_train)

In [68]:
trainCV_predictions = cvModel.transform(balanced_training_data)
testCV_predictions = cvModel.transform(selected_test)

In [75]:
print('Train data performance:')
evaluate_model(trainCV_predictions)

Train data performance:
+----------------+----------+-----+
|SeriousDlqin2yrs|prediction|count|
+----------------+----------+-----+
|               1|       0.0| 1893|
|               1|       1.0| 8133|
|               0|       0.0| 8236|
|               0|       1.0| 1857|
+----------------+----------+-----+

Accuracy: 0.8136090262935534
Precision: 0.8141141141141142
Recall: 0.8111909036505087
F1-Score: 0.8126498800959233


In [76]:
print('Test data performance:')
evaluate_model(testCV_predictions)


Test data performance:
+----------------+----------+-----+
|SeriousDlqin2yrs|prediction|count|
+----------------+----------+-----+
|               1|       0.0| 1893|
|               1|       1.0| 8133|
|               0|       0.0| 8236|
|               0|       1.0| 1857|
+----------------+----------+-----+

Accuracy: 0.8136090262935534
Precision: 0.8141141141141142
Recall: 0.8111909036505087
F1-Score: 0.8126498800959233


In [71]:
bestModel = cvModel.bestModel
print("Best Param (numTrees): ", bestModel.getOrDefault("numTrees"))
print("Best Param (maxDepth): ", bestModel.getOrDefault("maxDepth"))

Best Param (numTrees):  50
Best Param (maxDepth):  10
