# Random Forest : Prosper Loan Dataset

A decision tree is a learned set of rules that allows us to make decisions on data. A random forest combines the predictions of many such trees.

We are going to look at the prosper loan dataset.  This dataset shows a history of loans made by Prosper.

In [None]:
%matplotlib inline
import time
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Print where the Spark UI can be reached: the third ':'-separated piece of
# sc.uiWebUrl is appended after the placeholder host name.
print('Spark UI running on http://YOURIPADDRESS:{}'.format(sc.uiWebUrl.split(':')[2]))

## Step 1: Load the Data

In [None]:
## small file, start with this
datafile = "/data/prosper-loan/prosper-loan-data-simplified.csv"
## this is a large file
#datafile = "/data/prosper-loan/prosper-loan-data.csv.gz"

# Read the CSV with the header and inferSchema options enabled.
# Only the read call sits between the two timer samples; the count() used in
# the report below runs after t2 and is therefore not part of the timing.
t1 = time.perf_counter()
reader = spark.read.option("header", "true").option("inferSchema", "true")
data = reader.csv(datafile)
t2 = time.perf_counter()

elapsed_ms = (t2 - t1) * 1000
print("read {:,} records in {:,.2f} ms".format(data.count(), elapsed_ms))
# schema
data.printSchema()

In [None]:
## Columns pulled from the raw data.
## We start with a handful and add more later.
select_columns = [
    'LoanStatus',
    'EmploymentStatus',
    'CreditScore',
    'StatedMonthlyIncome',
    'ListingCategory',
]

## Columns assembled into the feature vector.
## Note: a feature vector can only hold numbers, so categorical columns are
## represented here by their indexed counterparts ('EmpIndex', 'CategoryIndex').
## And definitely do not include 'LoanStatus' (if you are curious, include it and see what happens!)
vector_columns = [
    'EmpIndex',
    'CreditScore',
    'StatedMonthlyIncome',
    'CategoryIndex',
]



In [None]:
## Display

# Keep only the columns named in select_columns, then show the schema,
# a sample of rows, and the total row count.
prosper = data.select(*select_columns)
prosper.printSchema()
prosper.show()
row_count = prosper.count()
print(row_count)

## Step 2 : Clean Data

In [None]:
# Drop any NA values.  Using `dataframe.na.drop()`
prosper_clean = prosper.na.drop()

# count() is a Spark action that runs a job every time it is called, so each
# count is computed once and reused instead of being evaluated twice.
original_count = prosper.count()
cleaned_count = prosper_clean.count()
print("Original record count {:,}, cleaned records count {:,},  dropped {:,}"
      .format(original_count, cleaned_count, original_count - cleaned_count))
prosper_clean.show()


## Look at some summary data

In [None]:
# Row counts per distinct value for each categorical column.
# The second item is the maximum number of rows to print; ListingCategory
# is given a larger limit (60) than the other two (20).
for column_name, max_rows in (('LoanStatus', 20),
                              ('EmploymentStatus', 20),
                              ('ListingCategory', 60)):
    prosper_clean.groupBy(column_name).count().show(max_rows)

**=> What does that say about the cardinality of these categorical columns?**



## Step 3: Converting Categorical columns 

Convert categorical columns to numeric.   
Here let's convert the **EmploymentStatus** and **ListingCategory** columns.

In [None]:
# EmploymentStatus -> numeric "EmpIndex" column.
strIndexer_employment = StringIndexer(inputCol="EmploymentStatus", outputCol="EmpIndex")
employment_model = strIndexer_employment.fit(prosper_clean)
prosper_emp_indexed = employment_model.transform(prosper_clean)

# ListingCategory -> numeric "CategoryIndex" column, applied on top of the
# frame that already carries EmpIndex.
strIndexer_category = StringIndexer(inputCol="ListingCategory", outputCol="CategoryIndex")
category_model = strIndexer_category.fit(prosper_emp_indexed)
prosper_indexed = category_model.transform(prosper_emp_indexed)

prosper_indexed.show()


## Step 4: Build feature vectors using VectorAssembler.

In [None]:
# Pack the columns listed in vector_columns into a single "features" column.
assembler = VectorAssembler(inputCols=vector_columns, outputCol="features")
assembled = assembler.transform(prosper_indexed)

# The classifier reads its target from "label", so copy LoanStatus into it.
feature_vector = assembled.withColumn("label", assembled["LoanStatus"])
feature_vector.show(n=10, truncate=False)

## Step 5: Split Data into training and test.

We will split our data up into training and test sets.  (You know the drill by now).

**=> TODO: Split dataset into 70% training, 30% validation**


In [None]:

# Split the data into training and test sets (30% held out for testing).
# A fixed seed is passed so the split -- and therefore the accuracy reported
# later -- does not change from one run of the notebook to the next.
(training, test) = feature_vector.randomSplit([.7, .3], seed=42)
print("training set = " , training.count())
print("testing set = " , test.count())

## Step 6: Random Forest

In [None]:
## Create a RandomForest with numTrees=20 and maxBins=10000
## (the `???` placeholders that stood here are not valid Python, so the cell
## could not run until they were filled in with the values from the exercise).
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20, maxBins=10000)

# Time only the fit() call.
t1 = time.perf_counter()
rf_model = rf.fit(training)
t2 = time.perf_counter()

## TODO : Notice the time it took for training
## Is it more or less than decision trees?
print("trained on {:,} records using {:,} features in {:,.2f} ms".
      format(training.count(), len(vector_columns), (t2 - t1) * 1000))

# Last expression of the cell: display the fitted model.
rf_model

**Q: How many nodes do the trees in the forest have?**

In [None]:
# Run the trained model over the held-out test set.
predictions = rf_model.transform(test)

# Drop the rawPrediction and probability columns before displaying the result.
columns_to_hide = ('rawPrediction', 'probability')
predictions2 = predictions.drop(*columns_to_hide)
predictions2.show()


## Step 7: Evaluate the model.

Let us check to see how the model did, using accuracy as a measure.

In [None]:
# Compare the "prediction" column against the "label" column and report
# the accuracy metric on the test-set predictions.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("accuracy ", accuracy)


## Step 8: Improve Accuracy

### Add more data
In Step-1 change the 'datafile' to the full dataset.  
And see how the accuracy above changes

### Add more features
Look at the schema of the full dataset.  Are there any columns you want to add?