In [1]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (an existing session is reused if
# one is available); the bare `spark` on the last line displays the session.
spark = (
    SparkSession.builder
    .appName('Logistic Regression')
    .getOrCreate()
)
spark

In [3]:
# Load the loan data set. The first row supplies the column names and Spark
# infers each column's type from the data.
# The path uses a forward slash: the earlier "Data\loan_data.csv" spelling
# contained the unrecognised escape sequence "\l" (a SyntaxWarning on recent
# Python versions) and relied on the backslash being a directory separator,
# which is only true on Windows.
training = spark.read.csv("Data/loan_data.csv", header=True, inferSchema=True)
training.printSchema()


root
 |-- credit_policy: integer (nullable = true)
 |-- purpose: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- log_annual_inc: double (nullable = true)
 |-- dti: double (nullable = true)
 |-- fico: integer (nullable = true)
 |-- days_with_cr_line: double (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- revol_util: double (nullable = true)
 |-- inq_last_6mths: integer (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- not_fully_paid: integer (nullable = true)



In [4]:
# Peek at the first three records, returned as a list of Row objects.
training.head(n=3)

[Row(credit_policy=1, purpose='debt_consolidation', int_rate=0.1189, installment=829.1, log_annual_inc=11.35040654, dti=19.48, fico=737, days_with_cr_line=5639.958333, revol_bal=28854, revol_util=52.1, inq_last_6mths=0, delinq_2yrs=0, pub_rec=0, not_fully_paid=0),
 Row(credit_policy=1, purpose='credit_card', int_rate=0.1071, installment=228.22, log_annual_inc=11.08214255, dti=14.29, fico=707, days_with_cr_line=2760.0, revol_bal=33623, revol_util=76.7, inq_last_6mths=0, delinq_2yrs=0, pub_rec=0, not_fully_paid=0),
 Row(credit_policy=1, purpose='debt_consolidation', int_rate=0.1357, installment=366.86, log_annual_inc=10.37349118, dti=11.63, fico=682, days_with_cr_line=4710.0, revol_bal=3511, revol_util=25.6, inq_last_6mths=1, delinq_2yrs=0, pub_rec=0, not_fully_paid=0)]

In [5]:
# Remove the string-typed `purpose` column so that only numeric columns remain,
# then preview the resulting DataFrame.
data = training.drop("purpose")
data.show()

+-------------+--------+-----------+--------------+-----+----+-----------------+---------+----------+--------------+-----------+-------+--------------+
|credit_policy|int_rate|installment|log_annual_inc|  dti|fico|days_with_cr_line|revol_bal|revol_util|inq_last_6mths|delinq_2yrs|pub_rec|not_fully_paid|
+-------------+--------+-----------+--------------+-----+----+-----------------+---------+----------+--------------+-----------+-------+--------------+
|            1|  0.1189|      829.1|   11.35040654|19.48| 737|      5639.958333|    28854|      52.1|             0|          0|      0|             0|
|            1|  0.1071|     228.22|   11.08214255|14.29| 707|           2760.0|    33623|      76.7|             0|          0|      0|             0|
|            1|  0.1357|     366.86|   10.37349118|11.63| 682|           4710.0|     3511|      25.6|             1|          0|      0|             0|
|            1|  0.1008|     162.34|   11.35040654|  8.1| 712|      2699.958333|    3366

In [6]:
# List the column names left after dropping `purpose`.
data.columns

['credit_policy',
 'int_rate',
 'installment',
 'log_annual_inc',
 'dti',
 'fico',
 'days_with_cr_line',
 'revol_bal',
 'revol_util',
 'inq_last_6mths',
 'delinq_2yrs',
 'pub_rec',
 'not_fully_paid']

In [34]:
# Predictor columns fed to the model. The label column `not_fully_paid` is
# intentionally excluded.
feature_columns = [
    'credit_policy',
    'int_rate',
    'installment',
    'log_annual_inc',
    'dti',
    'fico',
    'days_with_cr_line',
    'revol_bal',
    'revol_util',
    'inq_last_6mths',
    'delinq_2yrs',
    'pub_rec',
]

# Assemble the predictor columns into a single vector column named `features`.
# VectorAssembler is imported in the notebook's top import cell.
featureassembler = VectorAssembler(inputCols=feature_columns, outputCol='features')


In [35]:
# Apply the assembler to `data`, which adds the `features` vector column, and
# preview the result.
output = featureassembler.transform(data)
output.show()

+-------------+--------+-----------+--------------+-----+----+-----------------+---------+----------+--------------+-----------+-------+--------------+--------------------+
|credit_policy|int_rate|installment|log_annual_inc|  dti|fico|days_with_cr_line|revol_bal|revol_util|inq_last_6mths|delinq_2yrs|pub_rec|not_fully_paid|            features|
+-------------+--------+-----------+--------------+-----+----+-----------------+---------+----------+--------------+-----------+-------+--------------+--------------------+
|            1|  0.1189|      829.1|   11.35040654|19.48| 737|      5639.958333|    28854|      52.1|             0|          0|      0|             0|[1.0,0.1189,829.1...|
|            1|  0.1071|     228.22|   11.08214255|14.29| 707|           2760.0|    33623|      76.7|             0|          0|      0|             0|[1.0,0.1071,228.2...|
|            1|  0.1357|     366.86|   10.37349118|11.63| 682|           4710.0|     3511|      25.6|             1|          0|      0

In [40]:
# Keep only the two columns the model needs: the assembled feature vector and
# the `not_fully_paid` label.
finalized_data = output.select(["features", "not_fully_paid"])
finalized_data.show()

+--------------------+--------------+
|            features|not_fully_paid|
+--------------------+--------------+
|[1.0,0.1189,829.1...|             0|
|[1.0,0.1071,228.2...|             0|
|[1.0,0.1357,366.8...|             0|
|[1.0,0.1008,162.3...|             0|
|[1.0,0.1426,102.9...|             0|
|[1.0,0.0788,125.1...|             0|
|[1.0,0.1496,194.0...|             1|
|[1.0,0.1114,131.2...|             1|
|[1.0,0.1134,87.19...|             0|
|[1.0,0.1221,84.12...|             0|
|[1.0,0.1347,360.4...|             0|
|[1.0,0.1324,253.5...|             0|
|[1.0,0.0859,316.1...|             0|
|[1.0,0.0714,92.82...|             0|
|[1.0,0.0863,209.5...|             0|
|[1.0,0.1103,327.5...|             0|
|[1.0,0.1317,77.69...|             0|
|[1.0,0.0894,476.5...|             0|
|[1.0,0.1039,584.1...|             0|
|[1.0,0.1513,173.6...|             0|
+--------------------+--------------+
only showing top 20 rows



In [42]:
# Split the rows into training and test sets with 0.7 / 0.3 weights, using a
# fixed seed for the random sampling.
train, test = finalized_data.randomSplit(weights=[0.7, 0.3], seed=12345)

In [44]:
# Configure a logistic regression with elastic-net regularisation on the
# assembled `features` column, using `not_fully_paid` as the label.
# NOTE(review): the recorded run of this notebook reports an empty coefficient
# vector and an areaUnderROC of 0.5, which suggests regParam=0.3 combined with
# elasticNetParam=0.8 may be shrinking every coefficient to zero. TODO confirm
# and tune the regularisation before relying on this model.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="not_fully_paid",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit the model on the training split.
lrModel = lr.fit(train)


In [45]:
# Print the fitted coefficients and intercept.
# f-strings are used here: the earlier `"..." + {str(...)}` form tried to
# concatenate a str with a set literal, which raises TypeError.
print(f"Coefficients: {lrModel.coefficients}")
print(f"Intercept: {lrModel.intercept}")

Coefficients: (12,[],[])
Intercept: -1.637456012683671


### Logistic Regression Training Summary

In [50]:
# Grab the summary object attached to the fitted model; the cells below read
# the objective history, ROC curve and areaUnderROC from it.
trainingSummary = lrModel.summary

In [51]:
# Show the objective value recorded for each training iteration.
objectiveHistory = trainingSummary.objectiveHistory
print("ObjectiveHistory")
if objectiveHistory:
    # One value per line, exactly as printing them one by one would produce.
    print(*objectiveHistory, sep="\n")

ObjectiveHistory
0.4443027529093511


In [52]:
# Display the receiver-operating-characteristic curve as a DataFrame, then
# report the area under that curve.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))



+---+---+
|FPR|TPR|
+---+---+
|0.0|0.0|
|1.0|1.0|
|1.0|1.0|
+---+---+

areaUnderROC: 0.5
