# Random Forest : Prosper Loan Dataset

A decision tree is a learned set of rules that allows us to make decisions on data.  A random forest trains many such trees and combines their predictions.

We are going to look at the Prosper loan dataset.  This dataset shows a history of loans made by Prosper.

In [None]:
# initialize Spark Session
import os
import sys

# Put the parent of the current working directory on the import path (only once),
# presumably because that is where the `init_spark` helper module lives.
top_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if top_dir not in sys.path:
    sys.path.append(top_dir)

from init_spark import init_spark

# Create the session and end the cell with it so the notebook displays it.
spark = init_spark()
spark

## Step 1: Load the Data

In [None]:
## Input file for the rest of the notebook.
## Small sample file -- start with this one.
datafile = "/data/prosper-loan/prosper-loan-data-sample.csv"

## Large file (gzip-compressed). Uncomment the next line to use it instead.
#datafile = "/data/prosper-loan/prosper-loan-data.csv.gz"

In [None]:
%%time

data = spark.read. \
          option("header", "true"). \
          option("inferSchema", "true").  \
          csv(datafile)

In [None]:
# Number of rows that were loaded (count() runs a Spark job).
record_count = data.count()
print("read {:,} records".format(record_count))

# Column names and inferred types.
data.printSchema()

In [None]:
## Preview the first 10 rows; converting to pandas gives a nicely rendered table.
first_rows = data.limit(10)
first_rows.toPandas()

In [None]:
## Columns to keep from the raw data: the label ('LoanStatus') plus candidate features.
## More columns can be added later.
select_columns = [
    'LoanStatus',
    'ProsperScore',
    'EmploymentStatus',
    'CreditScore',
    'StatedMonthlyIncome',
    'ListingCategory',
]

## Note : vector columns can only hold numbers, so don't include categorical columns here.
## 'EmpIndex' is the numeric index of 'EmploymentStatus' that gets created in Step 3.
## And definitely not 'LoanStatus' (if you are curious, include it and see what happens!)
vector_columns = [
    'ProsperScore',
    'EmpIndex',
    'CreditScore',
    'StatedMonthlyIncome',
]



In [None]:
## Keep only the chosen columns, then display the schema and a 10-row preview.
prosper = data.select(*select_columns)
prosper.printSchema()

preview_rows = prosper.limit(10)
preview_rows.toPandas()

## Step 2 : Clean Data

In [None]:
# Drop any NA values.  Using `dataframe.na.drop()`
prosper_clean = prosper.na.drop()

# count() is an action that launches a Spark job, so compute each count exactly
# once and reuse it for the "dropped" figure.
original_count = prosper.count()
clean_count = prosper_clean.count()
print("Original record count {:,}, cleaned records count {:,},  dropped {:,}"\
      .format(original_count, clean_count, original_count - clean_count))

# Show the first rows of the cleaned data.
prosper_clean.show()


### Look at some summary data

In [None]:
# Row counts per distinct value for each of the two categorical columns.
for categorical_column in ('LoanStatus', 'EmploymentStatus'):
    prosper_clean.groupBy(categorical_column).count().show()

**=> What does that say about the cardinality of these categorical columns?**



## Step 3: Converting Categorical columns 

Convert categorical columns to numeric.   
Here let's convert **EmploymentStatus** column

In [None]:
from pyspark.ml.feature import StringIndexer

# Map each distinct 'EmploymentStatus' string to a number in a new 'EmpIndex' column.
strIndexer_employment = StringIndexer(inputCol="EmploymentStatus", outputCol="EmpIndex")

# fit() learns the string -> index mapping; transform() appends the index column.
employment_index_model = strIndexer_employment.fit(prosper_clean)
prosper_indexed = employment_index_model.transform(prosper_clean)

prosper_indexed.limit(10).toPandas()


## Step 4: Build feature vectors using VectorAssembler.

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric columns listed in `vector_columns` into a single 'features' vector column.
assembler = VectorAssembler(inputCols=vector_columns, outputCol="features")
assembled = assembler.transform(prosper_indexed)

# The value to predict is 'LoanStatus'; copy it into a column named 'label'.
feature_vector = assembled.withColumn("label", assembled["LoanStatus"])

feature_vector.limit(10).toPandas()

## Step 5: Split Data into training and test.

We will split the data up into training and test sets.  (You know the drill by now).

**=> TODO: Split dataset into 70% training, 30% test**


In [None]:
# Split the data into training and test sets (30% held out for testing)
# TODO: replace each ??? with a split weight -- training weight first, test weight second.
# The split is random, so the counts below change between runs unless a seed is passed.
(training, test) =  feature_vector.randomSplit([???,???])
# Report how many rows landed in each split.
print("training set = " , training.count())
print("testing set = " , test.count())

## Step 6: Random Forest

### 6.1 Create RF

In [2]:
from pyspark.ml.classification import RandomForestClassifier

## TODO : Create a RandomForest with numTrees=20  and maxBins=10000
## Hint : uncomment the two lines below. The class must be RandomForestClassifier
## (the one imported above), and the following cells expect the variable name `rf`.

# rf = RandomForestClassifier(labelCol="label", featuresCol="features", \
#                              numTrees=20, maxBins=10000)

ModuleNotFoundError: No module named 'pyspark'

### 6.2 Train the RF

In [None]:
%%time
print ("training starting...")
## TODO : train on the training data (replace ??? with the training DataFrame)
rf_model = rf.fit(???)
print ("training done.")

## TODO : Notice the time it took for training (reported by %%time)
## Is it more or less than decision trees?

### 6.3 Print RF
**==> Q: How many nodes does each tree in the forest have?**

In [None]:
## Print the model's one-line summary, a blank line, then the full debug dump.
print(rf_model, "", rf_model.toDebugString, sep="\n")

### 6.4 Create Predictions

In [None]:
## TODO : predict on test data (replace ??? with the test DataFrame)
predictions = rf_model.transform(???)

# Show the predictions without the 'rawPrediction' and 'probability' columns.
predictions2= predictions.drop('rawPrediction', 'probability')
predictions2.show()


## Step 7: Evaluate the model.

Let us check to see how the model did, using accuracy as a measure.

In [None]:
# Score both splits with the trained model; the evaluation cells below use
# `predictions_train` and `predictions_test` to compare training and test metrics.
# TODO : replace each ??? using the hint on its line.
predictions_test = rf_model.transform(???)  # Hint : test
predictions_train = rf_model.transform(???)  # Hint : training

### 7.1 Accuracy

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator: compares the 'prediction' column against the 'label' column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Evaluate on the training predictions first, then on the held-out test predictions.
train_accuracy = evaluator.evaluate(predictions_train)
print("Training set accuracy = " , train_accuracy)

test_accuracy = evaluator.evaluate(predictions_test)
print("Test set accuracy = " , test_accuracy)

#### Is RF more stable than DTs ?
Do a few runs, and see the accuracy above.   
Does it vary a lot like Decision Trees before?   
Probably not.  Why do you think that is?

### 7.2 Confusion Matrix

In [None]:
# Confusion matrix: one row per actual 'LoanStatus', one column per predicted
# value (0 and 1); combinations that never occur are filled in as 0.
cm = (
    predictions_test
    .groupBy('LoanStatus')
    .pivot('prediction', [0, 1])
    .count()
    .na.fill(0)
    .orderBy('LoanStatus')
)
cm.show()

In [None]:
import seaborn as sns

# Bring the confusion matrix into pandas and index it by the actual 'LoanStatus',
# so heatmap rows are actual values and columns are predicted values.
cm_pd = cm.toPandas().set_index("LoanStatus")
# print(cm_pd)

# colormaps : cmap="YlGnBu" , cmap="Greens", cmap="Blues",  cmap="Reds"
sns.heatmap(cm_pd, annot=True, fmt=',', cmap="Blues")

### 7.3 - AUC

For skewed data, the 'Area Under Precision-Recall' curve might be a better indicator.  
https://stats.stackexchange.com/questions/90779/area-under-the-roc-curve-or-area-under-the-pr-curve-for-imbalanced-data

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve.
# ('areaUnderROC' is also the default metric of BinaryClassificationEvaluator.)
evaluator1 = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName='areaUnderROC')

auc_train = evaluator1.evaluate(predictions_train)
print("AUC for training: " , auc_train)
auc_test = evaluator1.evaluate(predictions_test)
print("AUC for test : " , auc_test)

# Area under the precision-recall curve.
evaluator2 = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName='areaUnderPR')

pr_train = evaluator2.evaluate(predictions_train)
print("Area under PR for training: " , pr_train)
pr_test = evaluator2.evaluate(predictions_test)
print("Area under PR for test : " , pr_test)


## Step 8: Improve Accuracy

### Add more data
In Step-1 change the 'datafile' to the full dataset.  
And see how the accuracy above changes

### Add more features
Look at the schema of the full dataset.  Are there any columns you want to add?