In [1]:
from pyspark.sql import *

In [2]:
spark = SparkSession.builder.appName('myfinishedproj7').getOrCreate()

In [3]:
data = spark.read.csv('gs://imcbucketbbb/loan_select7.csv',inferSchema=True,header=True)

In [6]:
data.printSchema()

root
 |-- loan_outcome: integer (nullable = true)
 |-- loan_amnt: integer (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- term: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- dti: double (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- tot_cur_bal: double (nullable = true)
 |-- avg_cur_bal: double (nullable = true)
 |-- home_ownership2: string (nullable = true)
 |-- purpose2: string (nullable = true)



In [5]:
# data = data.drop('_c0')

In [65]:
data.show()

+------------+---------+--------+-----+----------+--------------+----------+---------+---------+------------------+-----+--------------------+-----------+-----------+---------------+--------+
|loan_outcome|loan_amnt|int_rate|grade|emp_length|home_ownership|annual_inc|     term|sub_grade|           purpose|  dti|           emp_title|tot_cur_bal|avg_cur_bal|home_ownership2|purpose2|
+------------+---------+--------+-----+----------+--------------+----------+---------+---------+------------------+-----+--------------------+-----------+-----------+---------------+--------+
|           0|     4500|   11.31|    B| 10+ years|          RENT|   38500.0|36 months|       B3|       credit_card| 4.64|Accounts Examiner...|    29137.0|     2428.0|        Not OWN|debt_con|
|           0|    20000|   17.97|    D|   4 years|          RENT|   57000.0|60 months|       D1|debt_consolidation|22.18|   Front office Lead|    33356.0|     3336.0|        Not OWN|debt_con|
|           0|     6600|   11.31|    B| 

In [7]:
data.printSchema()

root
 |-- loan_outcome: integer (nullable = true)
 |-- loan_amnt: integer (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- term: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- dti: double (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- tot_cur_bal: double (nullable = true)
 |-- avg_cur_bal: double (nullable = true)
 |-- home_ownership2: string (nullable = true)
 |-- purpose2: string (nullable = true)



In [8]:
data1 = data.select('loan_outcome','home_ownership2','annual_inc','term','loan_amnt','grade' )

In [9]:
data1.show()

+------------+---------------+----------+---------+---------+-----+
|loan_outcome|home_ownership2|annual_inc|     term|loan_amnt|grade|
+------------+---------------+----------+---------+---------+-----+
|           0|        Not OWN|   38500.0|36 months|     4500|    B|
|           0|        Not OWN|   57000.0|60 months|    20000|    D|
|           0|        Not OWN|   45000.0|36 months|     6600|    B|
|           0|        Not OWN|   42000.0|36 months|     2500|    C|
|           0|        Not OWN|   60000.0|36 months|     4000|    D|
|           0|            OWN|   24000.0|36 months|     2700|    A|
|           0|        Not OWN|   98000.0|36 months|     5000|    A|
|           0|        Not OWN|   55000.0|36 months|    12950|    A|
|           0|        Not OWN|   58000.0|60 months|    10450|    B|
|           0|        Not OWN|   60000.0|60 months|    10000|    B|
|           0|        Not OWN|   68000.0|36 months|    29050|    B|
|           0|        Not OWN|   42140.0|36 mont

In [10]:
data1.printSchema()

root
 |-- loan_outcome: integer (nullable = true)
 |-- home_ownership2: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- term: string (nullable = true)
 |-- loan_amnt: integer (nullable = true)
 |-- grade: string (nullable = true)



In [11]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [102]:
purpose_indexer = StringIndexer(inputCol='purpose',outputCol='purposeIndex')
purpose_encoder = OneHotEncoder(inputCol='purposeIndex',outputCol='purposeVec')

In [12]:
grade_indexer = StringIndexer(inputCol='grade',outputCol='gradeIndex')
grade_encoder = OneHotEncoder(inputCol='gradeIndex',outputCol='gradeVec')

In [12]:
emp_length_indexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndex')
emp_length_encoder = OneHotEncoder(inputCol='emp_lengthIndex',outputCol='emp_lengthVec')

In [13]:
home_ownership_indexer = StringIndexer(inputCol='home_ownership2',outputCol='home_ownershipIndex')
home_ownership_encoder = OneHotEncoder(inputCol='home_ownershipIndex',outputCol='home_ownershipVec')

In [14]:
term_indexer = StringIndexer(inputCol='term',outputCol='termIndex')
term_encoder = OneHotEncoder(inputCol='termIndex',outputCol='termVec')

In [15]:
assembler = VectorAssembler(inputCols=['loan_amnt' , 'gradeVec' , 'home_ownershipVec' , 'annual_inc' , 'termVec'],outputCol='features')

In [16]:
from pyspark.ml.classification import LogisticRegression

In [17]:
from pyspark.ml.classification import *

## Pipelines 

In [18]:
from pyspark.ml import Pipeline

In [19]:
log_reg_loan = LogisticRegression(featuresCol='features',labelCol='loan_outcome')

In [20]:
pipeline = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                           assembler,log_reg_loan])

In [21]:
train_data, test_data = data1.randomSplit([0.7,0.3])

In [22]:
fit_model = pipeline.fit(train_data)

In [23]:
results = fit_model.transform(test_data)

In [24]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [25]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')

In [26]:
results.select('loan_outcome','prediction').show()

+------------+----------+
|loan_outcome|prediction|
+------------+----------+
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
|           0|       0.0|
+------------+----------+
only showing top 20 rows



In [27]:
AUC = my_eval.evaluate(results)

In [28]:
AUC

0.5131513358172877

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")

In [31]:
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")

In [32]:
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")

In [33]:
# Select (prediction, true label) and compute test error
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

In [34]:
results_acc = acc_evaluator.evaluate(results)
results_f1 = f1_evaluator.evaluate(results)

In [35]:
results_precision = precision_evaluator.evaluate(results)
results_recall = recall_evaluator.evaluate(results)

In [36]:
print("Here are the results!")
print('-'*80)
print('A Logistic Regression Model had an accuracy of: {0:2.2f}%'.format(results_acc*100))
print('A Logistic Regression Model had an precision score of: {0:2.2f}%'.format(results_precision*100))
print('A Logistic Regression Model had an recall score of: {0:2.2f}%'.format(results_recall*100))
print('A Logistic Regression Model had an f1 score of: {0:2.2f}%'.format(results_f1*100))

Here are the results!
--------------------------------------------------------------------------------
A Logistic Regression Model had an accuracy of: 79.45%
A Logistic Regression Model had an precision score of: 74.07%
A Logistic Regression Model had an recall score of: 79.45%
A Logistic Regression Model had an f1 score of: 71.58%


In [38]:
data1 = data.select('loan_outcome','home_ownership2','annual_inc','term','loan_amnt','grade','int_rate','purpose2' )
data1.show()
data1.printSchema()
purpose2_indexer = StringIndexer(inputCol='purpose2',outputCol='purpose2Index')
purpose2_encoder = OneHotEncoder(inputCol='purpose2Index',outputCol='purpose2Vec')

grade_indexer = StringIndexer(inputCol='grade',outputCol='gradeIndex')
grade_encoder = OneHotEncoder(inputCol='gradeIndex',outputCol='gradeVec')

emp_length_indexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndex')
emp_length_encoder = OneHotEncoder(inputCol='emp_lengthIndex',outputCol='emp_lengthVec')

home_ownership_indexer = StringIndexer(inputCol='home_ownership2',outputCol='home_ownershipIndex')
home_ownership_encoder = OneHotEncoder(inputCol='home_ownershipIndex',outputCol='home_ownershipVec')

term_indexer = StringIndexer(inputCol='term',outputCol='termIndex')
term_encoder = OneHotEncoder(inputCol='termIndex',outputCol='termVec')

assembler = VectorAssembler(inputCols=['loan_amnt' , 'gradeVec' , 'home_ownershipVec' , 'annual_inc' , 'termVec','int_rate','purpose2Vec'],outputCol='features')
log_reg_loan = LogisticRegression(featuresCol='features',labelCol='loan_outcome')
pipeline = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,log_reg_loan])

train_data, test_data = data1.randomSplit([0.7,0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')
AUC = my_eval.evaluate(results)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(results)
results_f1 = f1_evaluator.evaluate(results)

results_precision = precision_evaluator.evaluate(results)
results_recall = recall_evaluator.evaluate(results)

print("The results for Logistic Regression Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC*100))

+------------+---------------+----------+---------+---------+-----+--------+--------+
|loan_outcome|home_ownership2|annual_inc|     term|loan_amnt|grade|int_rate|purpose2|
+------------+---------------+----------+---------+---------+-----+--------+--------+
|           0|        Not OWN|   38500.0|36 months|     4500|    B|   11.31|debt_con|
|           0|        Not OWN|   57000.0|60 months|    20000|    D|   17.97|debt_con|
|           0|        Not OWN|   45000.0|36 months|     6600|    B|   11.31|debt_con|
|           0|        Not OWN|   42000.0|36 months|     2500|    C|   13.56|   other|
|           0|        Not OWN|   60000.0|36 months|     4000|    D|   17.97|   other|
|           0|            OWN|   24000.0|36 months|     2700|    A|    8.19|debt_con|
|           0|        Not OWN|   98000.0|36 months|     5000|    A|    7.56|debt_con|
|           0|        Not OWN|   55000.0|36 months|    12950|    A|    7.56|debt_con|
|           0|        Not OWN|   58000.0|60 months|   

In [40]:
data1 = data.select('loan_outcome','home_ownership2','annual_inc','term','loan_amnt','grade','int_rate','purpose2','tot_cur_bal' )
data1.show()
data1.printSchema()
purpose2_indexer = StringIndexer(inputCol='purpose2',outputCol='purpose2Index')
purpose2_encoder = OneHotEncoder(inputCol='purpose2Index',outputCol='purpose2Vec')

grade_indexer = StringIndexer(inputCol='grade',outputCol='gradeIndex')
grade_encoder = OneHotEncoder(inputCol='gradeIndex',outputCol='gradeVec')

emp_length_indexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndex')
emp_length_encoder = OneHotEncoder(inputCol='emp_lengthIndex',outputCol='emp_lengthVec')

home_ownership_indexer = StringIndexer(inputCol='home_ownership2',outputCol='home_ownershipIndex')
home_ownership_encoder = OneHotEncoder(inputCol='home_ownershipIndex',outputCol='home_ownershipVec')

term_indexer = StringIndexer(inputCol='term',outputCol='termIndex')
term_encoder = OneHotEncoder(inputCol='termIndex',outputCol='termVec')

assembler = VectorAssembler(inputCols=['loan_amnt' , 'gradeVec' , 'home_ownershipVec' , 'annual_inc' , 'termVec','int_rate','purpose2Vec','tot_cur_bal'],outputCol='features')
log_reg_loan = LogisticRegression(featuresCol='features',labelCol='loan_outcome')
pipeline = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,log_reg_loan])

train_data, test_data = data1.randomSplit([0.7,0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')
AUC = my_eval.evaluate(results)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(results)
results_f1 = f1_evaluator.evaluate(results)

results_precision = precision_evaluator.evaluate(results)
results_recall = recall_evaluator.evaluate(results)

print("The results for Logistic Regression Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC*100))

+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+
|loan_outcome|home_ownership2|annual_inc|     term|loan_amnt|grade|int_rate|purpose2|tot_cur_bal|
+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+
|           0|        Not OWN|   38500.0|36 months|     4500|    B|   11.31|debt_con|    29137.0|
|           0|        Not OWN|   57000.0|60 months|    20000|    D|   17.97|debt_con|    33356.0|
|           0|        Not OWN|   45000.0|36 months|     6600|    B|   11.31|debt_con|    26836.0|
|           0|        Not OWN|   42000.0|36 months|     2500|    C|   13.56|   other|    18649.0|
|           0|        Not OWN|   60000.0|36 months|     4000|    D|   17.97|   other|   106556.0|
|           0|            OWN|   24000.0|36 months|     2700|    A|    8.19|debt_con|    75363.0|
|           0|        Not OWN|   98000.0|36 months|     5000|    A|    7.56|debt_con|    50831.0|
|           0|      

In [41]:
data1 = data.select('loan_outcome','home_ownership2','annual_inc','term','loan_amnt','grade','int_rate','purpose2','avg_cur_bal' )
data1.show()
data1.printSchema()
purpose2_indexer = StringIndexer(inputCol='purpose2',outputCol='purpose2Index')
purpose2_encoder = OneHotEncoder(inputCol='purpose2Index',outputCol='purpose2Vec')

grade_indexer = StringIndexer(inputCol='grade',outputCol='gradeIndex')
grade_encoder = OneHotEncoder(inputCol='gradeIndex',outputCol='gradeVec')

emp_length_indexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndex')
emp_length_encoder = OneHotEncoder(inputCol='emp_lengthIndex',outputCol='emp_lengthVec')

home_ownership_indexer = StringIndexer(inputCol='home_ownership2',outputCol='home_ownershipIndex')
home_ownership_encoder = OneHotEncoder(inputCol='home_ownershipIndex',outputCol='home_ownershipVec')

term_indexer = StringIndexer(inputCol='term',outputCol='termIndex')
term_encoder = OneHotEncoder(inputCol='termIndex',outputCol='termVec')

assembler = VectorAssembler(inputCols=['loan_amnt' , 'gradeVec' , 'home_ownershipVec' , 'annual_inc' , 'termVec','int_rate','purpose2Vec','avg_cur_bal'],outputCol='features')
log_reg_loan = LogisticRegression(featuresCol='features',labelCol='loan_outcome')
pipeline = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,log_reg_loan])

train_data, test_data = data1.randomSplit([0.7,0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')
AUC = my_eval.evaluate(results)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(results)
results_f1 = f1_evaluator.evaluate(results)

results_precision = precision_evaluator.evaluate(results)
results_recall = recall_evaluator.evaluate(results)

print("The results for Logistic Regression Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC*100))

+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+
|loan_outcome|home_ownership2|annual_inc|     term|loan_amnt|grade|int_rate|purpose2|avg_cur_bal|
+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+
|           0|        Not OWN|   38500.0|36 months|     4500|    B|   11.31|debt_con|     2428.0|
|           0|        Not OWN|   57000.0|60 months|    20000|    D|   17.97|debt_con|     3336.0|
|           0|        Not OWN|   45000.0|36 months|     6600|    B|   11.31|debt_con|     4473.0|
|           0|        Not OWN|   42000.0|36 months|     2500|    C|   13.56|   other|     6216.0|
|           0|        Not OWN|   60000.0|36 months|     4000|    D|   17.97|   other|     9687.0|
|           0|            OWN|   24000.0|36 months|     2700|    A|    8.19|debt_con|     3589.0|
|           0|        Not OWN|   98000.0|36 months|     5000|    A|    7.56|debt_con|     8472.0|
|           0|      

In [42]:
data1 = data.select('loan_outcome','home_ownership2','annual_inc','term','loan_amnt','grade','int_rate','purpose2','avg_cur_bal', 'dti' )
data1.show()
data1.printSchema()
purpose2_indexer = StringIndexer(inputCol='purpose2',outputCol='purpose2Index')
purpose2_encoder = OneHotEncoder(inputCol='purpose2Index',outputCol='purpose2Vec')

grade_indexer = StringIndexer(inputCol='grade',outputCol='gradeIndex')
grade_encoder = OneHotEncoder(inputCol='gradeIndex',outputCol='gradeVec')

emp_length_indexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndex')
emp_length_encoder = OneHotEncoder(inputCol='emp_lengthIndex',outputCol='emp_lengthVec')

home_ownership_indexer = StringIndexer(inputCol='home_ownership2',outputCol='home_ownershipIndex')
home_ownership_encoder = OneHotEncoder(inputCol='home_ownershipIndex',outputCol='home_ownershipVec')

term_indexer = StringIndexer(inputCol='term',outputCol='termIndex')
term_encoder = OneHotEncoder(inputCol='termIndex',outputCol='termVec')

assembler = VectorAssembler(inputCols=['loan_amnt' , 'gradeVec' , 'home_ownershipVec' , 'annual_inc' , 'termVec','int_rate','purpose2Vec','avg_cur_bal','dti'],outputCol='features')
log_reg_loan = LogisticRegression(featuresCol='features',labelCol='loan_outcome')
pipeline = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,log_reg_loan])

train_data, test_data = data1.randomSplit([0.7,0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')
AUC = my_eval.evaluate(results)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(results)
results_f1 = f1_evaluator.evaluate(results)

results_precision = precision_evaluator.evaluate(results)
results_recall = recall_evaluator.evaluate(results)

print("The results for Logistic Regression Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC*100))

+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+-----+
|loan_outcome|home_ownership2|annual_inc|     term|loan_amnt|grade|int_rate|purpose2|avg_cur_bal|  dti|
+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+-----+
|           0|        Not OWN|   38500.0|36 months|     4500|    B|   11.31|debt_con|     2428.0| 4.64|
|           0|        Not OWN|   57000.0|60 months|    20000|    D|   17.97|debt_con|     3336.0|22.18|
|           0|        Not OWN|   45000.0|36 months|     6600|    B|   11.31|debt_con|     4473.0|16.21|
|           0|        Not OWN|   42000.0|36 months|     2500|    C|   13.56|   other|     6216.0|15.09|
|           0|        Not OWN|   60000.0|36 months|     4000|    D|   17.97|   other|     9687.0| 19.1|
|           0|            OWN|   24000.0|36 months|     2700|    A|    8.19|debt_con|     3589.0|24.95|
|           0|        Not OWN|   98000.0|36 months|     5000|   

In [45]:
data1 = data.select('loan_outcome','home_ownership2','annual_inc','term','loan_amnt','grade','int_rate','purpose2','avg_cur_bal', 'dti' ,'emp_length')
data1.show()
data1.printSchema()
purpose2_indexer = StringIndexer(inputCol='purpose2',outputCol='purpose2Index')
purpose2_encoder = OneHotEncoder(inputCol='purpose2Index',outputCol='purpose2Vec')

grade_indexer = StringIndexer(inputCol='grade',outputCol='gradeIndex')
grade_encoder = OneHotEncoder(inputCol='gradeIndex',outputCol='gradeVec')

emp_length_indexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndex')
emp_length_encoder = OneHotEncoder(inputCol='emp_lengthIndex',outputCol='emp_lengthVec')

home_ownership_indexer = StringIndexer(inputCol='home_ownership2',outputCol='home_ownershipIndex')
home_ownership_encoder = OneHotEncoder(inputCol='home_ownershipIndex',outputCol='home_ownershipVec')

term_indexer = StringIndexer(inputCol='term',outputCol='termIndex')
term_encoder = OneHotEncoder(inputCol='termIndex',outputCol='termVec')

assembler = VectorAssembler(inputCols=['loan_amnt' , 'gradeVec' , 'home_ownershipVec' , 'annual_inc' , 'termVec','int_rate','purpose2Vec','avg_cur_bal','dti','emp_lengthVec'],outputCol='features')
log_reg_loan = LogisticRegression(featuresCol='features',labelCol='loan_outcome')
pipeline = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                            emp_length_indexer,emp_length_encoder,
                           assembler,log_reg_loan])

train_data, test_data = data1.randomSplit([0.7,0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')
AUC = my_eval.evaluate(results)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(results)
results_f1 = f1_evaluator.evaluate(results)

results_precision = precision_evaluator.evaluate(results)
results_recall = recall_evaluator.evaluate(results)

print("The results for Logistic Regression Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC*100))

+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+-----+----------+
|loan_outcome|home_ownership2|annual_inc|     term|loan_amnt|grade|int_rate|purpose2|avg_cur_bal|  dti|emp_length|
+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+-----+----------+
|           0|        Not OWN|   38500.0|36 months|     4500|    B|   11.31|debt_con|     2428.0| 4.64| 10+ years|
|           0|        Not OWN|   57000.0|60 months|    20000|    D|   17.97|debt_con|     3336.0|22.18|   4 years|
|           0|        Not OWN|   45000.0|36 months|     6600|    B|   11.31|debt_con|     4473.0|16.21| 10+ years|
|           0|        Not OWN|   42000.0|36 months|     2500|    C|   13.56|   other|     6216.0|15.09|   5 years|
|           0|        Not OWN|   60000.0|36 months|     4000|    D|   17.97|   other|     9687.0| 19.1|   5 years|
|           0|            OWN|   24000.0|36 months|     2700|    A|    8.19|debt

In [55]:
rdd = sc.parallelize(
    [
        (0., 1.), 
        (0., 0.), 
        (0., 0.), 
        (1., 1.), 
        (1.,0.), 
        (1.,0.),
        (1.,1.),
        (1.,1.)
    ]
)
df = sqlContext.createDataFrame(,["prediction", "loan_outcome"])
df.show()


SyntaxError: invalid syntax (<ipython-input-55-cb2884c51a39>, line 13)

In [46]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

In [47]:
data1 = data.select('loan_outcome','home_ownership2','annual_inc','term','loan_amnt','grade','int_rate','purpose2','avg_cur_bal', 'dti' )
data1.show()
data1.printSchema()
purpose2_indexer = StringIndexer(inputCol='purpose2',outputCol='purpose2Index')
purpose2_encoder = OneHotEncoder(inputCol='purpose2Index',outputCol='purpose2Vec')

grade_indexer = StringIndexer(inputCol='grade',outputCol='gradeIndex')
grade_encoder = OneHotEncoder(inputCol='gradeIndex',outputCol='gradeVec')

emp_length_indexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndex')
emp_length_encoder = OneHotEncoder(inputCol='emp_lengthIndex',outputCol='emp_lengthVec')

home_ownership_indexer = StringIndexer(inputCol='home_ownership2',outputCol='home_ownershipIndex')
home_ownership_encoder = OneHotEncoder(inputCol='home_ownershipIndex',outputCol='home_ownershipVec')

term_indexer = StringIndexer(inputCol='term',outputCol='termIndex')
term_encoder = OneHotEncoder(inputCol='termIndex',outputCol='termVec')

assembler = VectorAssembler(inputCols=['loan_amnt' , 'gradeVec' , 'home_ownershipVec' , 'annual_inc' , 'termVec','int_rate','purpose2Vec','avg_cur_bal','dti'],outputCol='features')
#log_reg_loan = LogisticRegression(featuresCol='features',labelCol='loan_outcome')
# Use mostly defaults to make this comparison "fair"



+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+-----+
|loan_outcome|home_ownership2|annual_inc|     term|loan_amnt|grade|int_rate|purpose2|avg_cur_bal|  dti|
+------------+---------------+----------+---------+---------+-----+--------+--------+-----------+-----+
|           0|        Not OWN|   38500.0|36 months|     4500|    B|   11.31|debt_con|     2428.0| 4.64|
|           0|        Not OWN|   57000.0|60 months|    20000|    D|   17.97|debt_con|     3336.0|22.18|
|           0|        Not OWN|   45000.0|36 months|     6600|    B|   11.31|debt_con|     4473.0|16.21|
|           0|        Not OWN|   42000.0|36 months|     2500|    C|   13.56|   other|     6216.0|15.09|
|           0|        Not OWN|   60000.0|36 months|     4000|    D|   17.97|   other|     9687.0| 19.1|
|           0|            OWN|   24000.0|36 months|     2700|    A|    8.19|debt_con|     3589.0|24.95|
|           0|        Not OWN|   98000.0|36 months|     5000|   

In [60]:
dtc = DecisionTreeClassifier(labelCol='loan_outcome',featuresCol='features')
#rfc = RandomForestClassifier(labelCol='loan_outcome',featuresCol='features')
#gbt = GBTClassifier(labelCol='loan_outcome',featuresCol='features')

pipeline_dtc = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,dtc])
'''
pipeline_rfc = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,rfc])

pipeline_gbt = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,gbt])'''

train_data, test_data = data1.randomSplit([0.7,0.3])

# Train the models (its three models, so it might take some time)
dtc_model = pipeline_dtc.fit(train_data)
#rfc_model = pipeline_rfc.fit(train_data)
#gbt_model = pipeline_gbt.fit(train_data)

dtc_predictions = dtc_model.transform(test_data)
#rfc_predictions = rfc_model.transform(test_data)
#gbt_predictions = gbt_model.transform(test_data)





#fit_model = pipeline.fit(train_data)
#results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')
AUC1 = my_eval.evaluate(dtc_predictions)
#AUC2 = my_eval.evaluate(rfc_predictions)
#AUC3 = my_eval.evaluate(gbt_predictions)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(dtc_predictions)
results_f1 = f1_evaluator.evaluate(dtc_predictions)
results_precision = precision_evaluator.evaluate(dtc_predictions)
results_recall = recall_evaluator.evaluate(dtc_predictions)

print("The results for Decision Tree Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC1*100))

The results for Decision Tree Model
--------------------------------------------------------------------------------
Accuracy: 79.48%
Precision score: 74.33%
Recall score: 79.48%
F1 score: 73.09%
Area Under the Curve: 53.07%


In [66]:
rfc = RandomForestClassifier(labelCol='loan_outcome',featuresCol='features',numTrees=50)


pipeline_rfc = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,rfc])
train_data, test_data = data1.randomSplit([0.7,0.3])
rfc_model = pipeline_rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')
AUC2 = my_eval.evaluate(rfc_predictions)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(rfc_predictions)
results_f1 = f1_evaluator.evaluate(rfc_predictions)
results_precision = precision_evaluator.evaluate(rfc_predictions)
results_recall = recall_evaluator.evaluate(rfc_predictions)

print("The results for Random Forest Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC2*100))

The results for Random Forest Model
--------------------------------------------------------------------------------
Accuracy: 79.40%
Precision score: 75.23%
Recall score: 79.40%
F1 score: 70.42%
Area Under the Curve: 50.17%


In [62]:
gbt = GBTClassifier(labelCol='loan_outcome',featuresCol='features')
pipeline_gbt = Pipeline(stages=[grade_indexer,grade_encoder,   
                            home_ownership_indexer,home_ownership_encoder,
                            term_indexer,term_encoder,
                            purpose2_indexer,purpose2_encoder,
                           assembler,gbt])
train_data, test_data = data1.randomSplit([0.7,0.3])
gbt_model = pipeline_gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='loan_outcome')

AUC3 = my_eval.evaluate(gbt_predictions)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="loan_outcome", predictionCol="prediction", metricName="f1")

results_acc = acc_evaluator.evaluate(gbt_predictions)
results_f1 = f1_evaluator.evaluate(gbt_predictions)
results_precision = precision_evaluator.evaluate(gbt_predictions)
results_recall = recall_evaluator.evaluate(gbt_predictions)

print("The results for Gradient Boosting Model")
print('-'*80)
print('Accuracy: {0:2.2f}%'.format(results_acc*100))
print('Precision score: {0:2.2f}%'.format(results_precision*100))
print('Recall score: {0:2.2f}%'.format(results_recall*100))
print('F1 score: {0:2.2f}%'.format(results_f1*100))
print('Area Under the Curve: {0:2.2f}%'.format(AUC3*100))

The results for Gradient Boosting Model
--------------------------------------------------------------------------------
Accuracy: 79.49%
Precision score: 74.62%
Recall score: 79.49%
F1 score: 72.68%
Area Under the Curve: 52.70%
