# Decision Tree Learning: Prosper Loan Dataset

Let us construct a decision tree classifier using Spark ML (`pyspark.ml`).

We will use the Prosper loan dataset, which is described in the slides.

In [139]:
%matplotlib inline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [191]:
# Read the gzipped Prosper loan CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
dataset = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../../datasets/prosper-loan/prosper-loan-data.csv.gz")
)


In [192]:
# Preview the first 20 rows of the raw DataFrame.
dataset.show(20)

+----+----------+------------+-----------------------+------------+---------------+-------------+----------------+------------------------+-------------------+-----------+------------------+---------------+--------------------------+---------------------+---------------------------+--------------------+--------------+--------------------+----------------+-----------------------+------------------------+-------------------------+----------------------+-------------------+-----------------------+-----------+----------------------------------+-----------------------+-----------------+----------------+-------------------+-----------------+--------------------------+---------------------+-----------------------------------+-------------------------------+------------------------+---------------------------+------------------+------------------+---------------+--------------------------+---------------------------+---------+---------------+
|Term|LoanStatus|BorrowerRate|ProsperRating (numeri

In [221]:


# Columns passed to the model without string indexing, in assembly order.
columns = [
    "Term", "BorrowerRate", "ProsperRating (numeric)", "ProsperScore",
    "EmploymentStatusDuration", "IsBorrowerHomeowner", "CreditScore",
    "CurrentCreditLines", "OpenCreditLines", "TotalCreditLinespast7years",
    "OpenRevolvingAccounts", "OpenRevolvingMonthlyPayment",
    "InquiriesLast6Months", "TotalInquiries",
    "CurrentDelinquencies", "AmountDelinquent", "DelinquenciesLast7Years",
    "PublicRecordsLast10Years", "PublicRecordsLast12Months",
    "RevolvingCreditBalance", "BankcardUtilization", "AvailableBankcardCredit",
    "TotalTrades", "TradesNeverDelinquent (percentage)", "TradesOpenedLast6Months",
    "DebtToIncomeRatio", "IncomeVerifiable", "StatedMonthlyIncome",
    "TotalProsperLoans", "TotalProsperPaymentsBilled", "OnTimeProsperPayments",
    "ProsperPaymentsLessThanOneMonthLate", "ProsperPaymentsOneMonthPlusLate",
    "ProsperPrincipalBorrowed", "ProsperPrincipalOutstanding",
    "LoanOriginalAmount", "MonthlyLoanPayment",
    "Recommendations", "InvestmentFromFriendsCount", "InvestmentFromFriendsAmount",
    "Investors", "YearsWithCredit",
]

# String-valued columns that are run through a StringIndexer before assembly.
# (An earlier variant of this cell also listed "BorrowerState"; it is not
# indexed here.)
categorical_columns = ["EmploymentStatus", "ListingCategory"]

# Output column names of the indexers: "<column>_index" per categorical column.
categorical_indexers = [name + "_index" for name in categorical_columns]

# Columns flagged as boolean; this list is not referenced by the other
# visible cells.
boolean_columns = ["IsBorrowerHomeowner", "CurrentlyInGroup", "IncomeVerifiable"]



In [208]:
# Peek at one row of the string-valued categorical columns.
dataset.select(categorical_columns).show(1)

+----------------+---------------+
|EmploymentStatus|ListingCategory|
+----------------+---------------+
|   Self-employed|        Unknown|
+----------------+---------------+
only showing top 1 row



In [209]:
# Preview the first 10 rows restricted to the columns listed in `columns`.
dataset.select(columns).show(10)

+----+------------+-----------------------+------------+------------------------+-------------------+-----------+------------------+---------------+--------------------------+---------------------+---------------------------+--------------------+--------------+--------------------+----------------+-----------------------+------------------------+-------------------------+----------------------+-------------------+-----------------------+-----------+----------------------------------+-----------------------+-----------------+----------------+-------------------+-----------------+--------------------------+---------------------+-----------------------------------+-------------------------------+------------------------+---------------------------+------------------+------------------+---------------+--------------------------+---------------------------+---------+---------------+
|Term|BorrowerRate|ProsperRating (numeric)|ProsperScore|EmploymentStatusDuration|IsBorrowerHomeowner|CreditSc

In [210]:
# Show the raw DataFrame again; show() without an argument prints the first 20 rows.
dataset.show()

+----+----------+------------+-----------------------+------------+---------------+-------------+----------------+------------------------+-------------------+-----------+------------------+---------------+--------------------------+---------------------+---------------------------+--------------------+--------------+--------------------+----------------+-----------------------+------------------------+-------------------------+----------------------+-------------------+-----------------------+-----------+----------------------------------+-----------------------+-----------------+----------------+-------------------+-----------------+--------------------------+---------------------+-----------------------------------+-------------------------------+------------------------+---------------------------+------------------+------------------+---------------+--------------------------+---------------------------+---------+---------------+
|Term|LoanStatus|BorrowerRate|ProsperRating (numeri

In [228]:
# Row count per LoanStatus value, i.e. the class balance of the label column.
dataset.groupBy('LoanStatus').count().show()

+----------+-----+
|LoanStatus|count|
+----------+-----+
|         1|33530|
|         0|16194|
+----------+-----+



In [229]:
# Row count per EmploymentStatus category.
dataset.groupBy('EmploymentStatus').count().show()

+----------------+-----+
|EmploymentStatus|count|
+----------------+-----+
|        Employed|18393|
|       Part-time| 1060|
|   Self-employed| 3045|
|    Not employed|  583|
|           Other|  924|
|       Full-time|25016|
|         Retired|  703|
+----------------+-----+



In [232]:
# Row count per ListingCategory; ask for up to 60 rows so that no category is
# cut off by show()'s default 20-row limit.
dataset.groupBy('ListingCategory').count().show(60)

+---------------+-----+
|ListingCategory|count|
+---------------+-----+
|        Student|  756|
|       Personal| 2392|
|     Motorcycle|  103|
|  LargePurchase|  224|
|           Baby|   46|
|      Household|  675|
|        Unknown| 9335|
|          Other| 6272|
|           Auto| 1596|
|          Green|   23|
|           Debt|19107|
|           Reno| 3468|
|       Cosmetic|   47|
|       Business| 4449|
|     Engagement|   72|
|       Vacation|  225|
|          Taxes|  246|
|           Boat|   30|
|             RV|   18|
|        Medical|  444|
|        Wedding|  196|
+---------------+-----+



In [233]:
# One StringIndexer per categorical column, each writing "<column>_index".
# handleInvalid="keep" places invalid values in one extra index instead of
# raising an error.
# The indexers are deliberately left unfitted: Pipeline.fit() below fits every
# stage, so fitting them here as well would leave the pipeline nothing to do.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in categorical_columns
]

# Fit all indexers in one pipeline and append the indexed columns to the data.
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(dataset).transform(dataset)

df_r.show()

+----+----------+------------+-----------------------+------------+---------------+-------------+----------------+------------------------+-------------------+-----------+------------------+---------------+--------------------------+---------------------+---------------------------+--------------------+--------------+--------------------+----------------+-----------------------+------------------------+-------------------------+----------------------+-------------------+-----------------------+-----------+----------------------------------+-----------------------+-----------------+----------------+-------------------+-----------------+--------------------------+---------------------+-----------------------------------+-------------------------------+------------------------+---------------------------+------------------+------------------+---------------+--------------------------+---------------------------+---------+---------------+----------------------+---------------------+
|Term|

In [234]:
# Keep only the model inputs plus the LoanStatus label, then discard every row
# that has a missing value in any of those columns.
na_dropped = (
    df_r
    .select(columns + categorical_indexers + ["LoanStatus"])
    .dropna()
)
na_dropped.show()

+----+------------+-----------------------+------------+------------------------+-------------------+-----------+------------------+---------------+--------------------------+---------------------+---------------------------+--------------------+--------------+--------------------+----------------+-----------------------+------------------------+-------------------------+----------------------+-------------------+-----------------------+-----------+----------------------------------+-----------------------+-----------------+----------------+-------------------+-----------------+--------------------------+---------------------+-----------------------------------+-------------------------------+------------------------+---------------------------+------------------+------------------+---------------+--------------------------+---------------------------+---------+---------------+----------------------+---------------------+----------+
|Term|BorrowerRate|ProsperRating (numeric)|ProsperSco

In [235]:
# Combine the plain feature columns and the indexed categorical columns into
# a single vector column named "features".
assembler = VectorAssembler(
    inputCols=[*columns, *categorical_indexers],
    outputCol="features",
)
fv = assembler.transform(na_dropped)

In [236]:
# Map LoanStatus to an indexed label column that carries label metadata.
# The indexer is fitted on the full assembled DataFrame (before the train/test
# split) so that every label value receives an index.
# NOTE(review): StringIndexer orders labels by descending frequency by default,
# so indexedLabel 0.0 is the most frequent LoanStatus value, not necessarily
# LoanStatus 0 — confirm before interpreting predictions.
labelIndexer = StringIndexer(
    inputCol="LoanStatus",
    outputCol="indexedLabel",
).fit(fv)


In [238]:
# Detect categorical features inside the "features" vector and index them.
# With maxCategories=4, any feature that has more than 4 distinct values is
# treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(fv)


In [240]:

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the reported test error,
# repeatable when the notebook is re-run on the same input.
(trainingData, testData) = fv.randomSplit([0.7, 0.3], seed=42)


In [243]:

# Decision tree classifier that reads the indexed label and indexed features.
dt = DecisionTreeClassifier(
    featuresCol="indexedFeatures",
    labelCol="indexedLabel",
)

# Pipeline order: label indexer, then feature indexer, then the tree itself.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])



In [244]:
# Fit the whole pipeline (both indexers followed by the tree) on the training split.
model = pipeline.fit(trainingData)

# Score the held-out test split with the fitted pipeline.
predictions = model.transform(testData)

# Display a few rows: predicted class, indexed true label and the raw feature vector.
predictions.select("prediction", "indexedLabel", "features").show(n=5)



+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(44,[0,1,2,3,4,6,...|
|       0.0|         0.0|(44,[0,1,2,3,4,5,...|
|       0.0|         0.0|(44,[0,1,2,3,4,6,...|
|       0.0|         0.0|[12.0,0.0499,7.0,...|
|       0.0|         0.0|(44,[0,1,2,3,4,6,...|
+----------+------------+--------------------+
only showing top 5 rows



In [245]:
# Compare predictions with the indexed true label using accuracy; the test
# error is reported as the complement of that accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g " % test_error)


Test Error = 0.322478 


In [246]:

# The fitted decision tree is the final stage of the three-stage pipeline model.
treeModel = model.stages[-1]
# Printing the model yields only a one-line summary (depth and node count).
print(treeModel)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4ae7a37b1f768448bee5) of depth 5 with 63 nodes
