## Imports

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import when, col
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pprint import pprint

## Variables

In [2]:
# Glob pattern matching the CSV part files of the processed loans data.
path = '../data/processed/loans_spark/part-*.csv'

# Name of the column used as the prediction target.
target = 'class'

## Spark Session

In [3]:
# Reuse the active Spark session if one exists; otherwise start one under
# the application name 'LogisticRegression'.
session_builder = SparkSession.builder.appName('LogisticRegression')
spark = session_builder.getOrCreate()

## Spark Functions

In [4]:
def get_features(data, spark_session, target):
    """Assemble every non-target column into a single feature vector.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        Frame holding the target column plus the columns to assemble.
    spark_session : pyspark.sql.SparkSession
        Session used to run the final projection query.
    target : str
        Name of the target column. It is left out of the feature vector
        and returned under the name ``label``.

    Returns
    -------
    pyspark.sql.DataFrame
        Two columns: ``features`` (the assembled vector) and ``label``.

    Notes
    -----
    Registers (or replaces) a temporary view named ``features``.
    """
    output_name = 'features'

    # Compare names for equality. The earlier `var not in target` was a
    # substring test against the target string, so with target 'class' a
    # column called 'c' or 'las' would have been dropped without warning.
    feature_columns = [var for var in data.columns if var != target]

    assembler = VectorAssembler(
        inputCols=feature_columns,
        outputCol=output_name
    )

    output = assembler.transform(data)

    # createOrReplaceTempView keeps the function re-runnable; plain
    # createTempView raises once the view name is already registered.
    output.createOrReplaceTempView(output_name)

    finalised_data = spark_session.sql(f'''
    select
    features,
    {target} as label
    from {output_name}
    ''')

    return finalised_data

def weight_balance(data, labels):
    """Build a class-weight column expression from the label counts.

    The weight ratio is ``1 - n1 / n0`` where ``n1`` and ``n0`` are the
    numbers of rows whose ``label`` is 1 and 0 respectively. Rows matching
    ``labels == 1`` receive ``ratio``; all other rows receive ``1 - ratio``.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        Frame with a ``label`` column holding the values 0 and 1.
    labels : pyspark.sql.Column
        Column expression compared against 1 to pick the weight.

    Returns
    -------
    pyspark.sql.Column
        ``when(...).otherwise(...)`` expression yielding the row weight.

    Raises
    ------
    ValueError
        If either label 0 or label 1 has no rows in ``data``.
    """
    # Run the aggregation once and key the counts by label value. The row
    # order of a grouped result is not guaranteed, so indexing the collected
    # rows by position could swap numerator and denominator between runs.
    counts = {row[0]: row[1] for row in data.groupby('label').count().collect()}

    positives = counts.get(1)
    negatives = counts.get(0)
    if not positives or not negatives:
        raise ValueError(
            "weight_balance needs rows for both label 0 and label 1; "
            f"found counts {counts}"
        )

    ratio = 1 - positives / negatives
    return when(labels == 1, ratio).otherwise(1 - ratio)


def evaluate_model(model, test_set, spark_session):
    """Score a fitted model on a held-out set and print a metrics table.

    Parameters
    ----------
    model
        Fitted model exposing ``transform``; its output must contain the
        ``label``, ``prediction`` and ``rawPrediction`` columns.
    test_set : pyspark.sql.DataFrame
        Rows to score.
    spark_session : pyspark.sql.SparkSession
        Session used to run the confusion-matrix query.

    Returns
    -------
    pyspark.sql.DataFrame
        One row with ``auc``, ``recall_pct``, ``precision_pct``,
        ``accuracy_pct`` and the four confusion-matrix counts. Recall and
        precision are NULL when their denominator is zero.

    Notes
    -----
    Registers (or replaces) a temporary view named ``test`` and prints the
    metrics row via ``show()``.
    """
    test_set_name = 'test'
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    test_preds = model.transform(test_set)

    # No metricName is passed, so this is the evaluator's default metric
    # (area under the ROC curve in PySpark).
    auc = evaluator.evaluate(test_preds)

    # createOrReplaceTempView lets the function be called more than once per
    # session; createTempView raises when 'test' is already registered.
    test_preds.createOrReplaceTempView(test_set_name)

    # nullif(..., 0) turns an empty denominator into NULL so the query does
    # not fail with a divide-by-zero error when ANSI mode is enabled.
    metrics = spark_session.sql(f'''
        select
        round({auc}*100, 3) as auc,
        round((true_positive / nullif(true_positive + false_negative, 0))*100, 3) as recall_pct,
        round((true_positive / nullif(true_positive + false_positive, 0))*100, 3) as precision_pct,
        *


        from 

        ( select
        round(avg(case when label=prediction then 1 else 0 end)*100, 3) as accuracy_pct,
        sum(case when label=1 and prediction=1 then 1 else 0 end) as true_positive,
        sum(case when label=0 and prediction=0 then 1 else 0 end) as true_negative,
        sum(case when label=0 and prediction=1 then 1 else 0 end) as false_positive,
        sum(case when label=1 and prediction=0 then 1 else 0 end) as false_negative

        from {test_set_name} )
            ''')

    metrics.show()

    return metrics
    

## Read data

In [5]:
# Load every file matched by `path`, treating the first line of each file
# as the header and inferring column types from the values.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv(path)

In [6]:
# Print the column names and inferred types of the loaded frame.
data.printSchema()

root
 |-- class: integer (nullable = true)
 |-- binary_term: double (nullable = true)
 |-- purpose_other: integer (nullable = true)
 |-- purpose_debt_consolidation: integer (nullable = true)
 |-- purpose_credit_card: integer (nullable = true)
 |-- home_ownership_own: integer (nullable = true)
 |-- home_ownership_other: integer (nullable = true)
 |-- home_ownership_mortgage: integer (nullable = true)
 |-- home_ownership_rent: integer (nullable = true)
 |-- employment_length_7to9: integer (nullable = true)
 |-- employment_length_10plus: integer (nullable = true)
 |-- employment_length_1to3: integer (nullable = true)
 |-- employment_length_less1: integer (nullable = true)
 |-- employment_length_4to6: integer (nullable = true)
 |-- inquiries_6m_1_inquiry: integer (nullable = true)
 |-- inquiries_6m_2plus_inquiry: integer (nullable = true)
 |-- inquiries_6m_no_inquiry: integer (nullable = true)
 |-- scaled_installment: double (nullable = true)
 |-- scaled_loan_amount: double (nullable = true)

## Get Features

In [7]:
# Replace the raw frame with the (features, label) frame from get_features.
# NOTE(review): `data` is rebound here, so re-running this cell without
# re-running the read cell passes the already-transformed frame back in.
data = get_features(data, spark, target)

In [8]:
# Add a 'weights' column whose value is chosen per row from the 'label'
# column by the expression weight_balance returns.
data = data.withColumn('weights', weight_balance(data, col('label')))

In [9]:
# Preview the first 10 rows of the assembled frame.
data.show(10)

+--------------------+-----+------------------+
|            features|label|           weights|
+--------------------+-----+------------------+
|(30,[1,6,9,15,16,...|    0|0.0496019035416857|
|(30,[1,7,10,13,16...|    0|0.0496019035416857|
|(30,[2,4,12,15,16...|    0|0.0496019035416857|
|(30,[2,7,11,15,16...|    0|0.0496019035416857|
|(30,[2,7,9,14,16,...|    0|0.0496019035416857|
|(30,[1,7,8,13,16,...|    0|0.0496019035416857|
|(30,[1,7,10,14,16...|    0|0.0496019035416857|
|(30,[1,7,11,14,16...|    0|0.0496019035416857|
|(30,[2,6,8,15,16,...|    0|0.0496019035416857|
|(30,[1,7,11,14,16...|    0|0.0496019035416857|
+--------------------+-----+------------------+
only showing top 10 rows



In [10]:
# Split the rows 80/20 into training and test frames using a fixed seed.
train, test = data.randomSplit([.8, .2], seed=1234)

## Apply ML

In [11]:
# The frame is laid out as (features, label, weights), so its first three
# column names are handed to the estimator in that order.
features_col, label_col, weight_col = data.columns[:3]

lr = LogisticRegression(
    featuresCol=features_col,
    labelCol=label_col,
    weightCol=weight_col,
    maxIter=100,
)

In [12]:
# Wrap the logistic regression in a single-stage pipeline; this pipeline is
# what the tuner below receives as its estimator.
pipeline = Pipeline(stages=[lr])

In [13]:
# Both hyper-parameters are searched over the same four candidate values,
# giving a 4 x 4 grid of parameter maps.
search_values = [0.001, 0.01, 0.1, 1]

paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, search_values)
    .addGrid(lr.elasticNetParam, search_values)
    .build()
)

In [14]:
# Tune the pipeline over paramGrid, using 80% of the rows passed to fit()
# for training and the rest for validation, scored by area under the
# precision-recall curve.
# The seed pins the tuner's internal train/validation split so repeated
# runs compare parameter maps on the same partition; 1234 matches the seed
# used for the train/test split.
model_tune = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
    trainRatio=0.8,
    seed=1234,
)

In [15]:
# Run the parameter search on the training split; `model` is the fitted
# result returned by the tuner.
model = model_tune.fit(train)

In [16]:
# Score the held-out test split; evaluate_model prints the metrics table
# and returns it as a frame.
metrics = evaluate_model(model, test, spark)

+------+----------+-------------+------------+-------------+-------------+--------------+--------------+
|   auc|recall_pct|precision_pct|accuracy_pct|true_positive|true_negative|false_positive|false_negative|
+------+----------+-------------+------------+-------------+-------------+--------------+--------------+
|73.172|    63.259|        9.851|      70.627|         1250|        28000|         11439|           726|
+------+----------+-------------+------------+-------------+-------------+--------------+--------------+

