# Preliminary Steps

In [None]:
# Install PySpark into the notebook environment (IPython shell escape)
!pip install pyspark

In [None]:
## Import necessary packages
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sk
import seaborn as sns
from matplotlib import pyplot as plt
from pyspark.ml.regression import LinearRegression 
from pyspark.sql.functions import *
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer

In [None]:
## Create (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [None]:
## Read the CSV file: the first row holds the column names and the
## column types are inferred from the data
loans = (
    spark.read.format("csv")
    .options(inferSchema="true", header="true")
    .load("LoanStats.csv")
)

In [None]:
## First look at the data - check mainly the categories
# display() renders its argument itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" line to the output).
display(loans)
print(loans.head(1))

In [None]:
## Preview the first five rows as a pandas DataFrame.
# Limit on the Spark side first so only five rows are collected to the driver
# instead of converting the whole dataset just to show its head.
loans.limit(5).toPandas()

# Data Understanding


- **int_rate**: Dependent Variable  - Interest Rate on the loan

- **loan_amnt**: Independent Variable - The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.

- **term**: Independent Variable - The number of payments on the loan. Values are in months and can be either 36 or 60.
- **grade**: Independent Variable - LC assigned loan grade
- **home_ownership**: Independent Variable - The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER
- **annual_inc**: Independent Variable - The self-reported annual income provided by the borrower during registration.
- **purpose**: Independent Variable - A category provided by the borrower for the loan request.

In [None]:
## Print the column names and the data types of the loans DataFrame
loans.printSchema()

In [None]:
## Summary statistics computed by Spark, converted to pandas for display
loans.describe().toPandas()

In [None]:
## Descriptive summary of the object/category columns, one row per column
loans.toPandas().describe(include=['object', 'category']).T

In [None]:
## Frequency tables:

# Bar chart of the number of loans per grade. countplot() returns the Axes
# object, so printing it only emits the Axes repr; render the figure instead.
sns.countplot(y="grade", data=loans.toPandas(), color="c")
plt.show()

In [None]:
# Bar chart of the number of loans per home ownership status. countplot()
# returns the Axes object, so printing it only emits the Axes repr; render
# the figure instead.
sns.countplot(y="home_ownership", data=loans.toPandas(), color="c")
plt.show()

In [None]:
# Bar chart of the number of loans per purpose. countplot() returns the Axes
# object, so printing it only emits the Axes repr; render the figure instead.
sns.countplot(y="purpose", data=loans.toPandas(), color="c")
plt.show()

In [None]:
## Correlation Matrix - check the correlation between the numerical variables
# Restrict the frame to numeric (and boolean) columns explicitly: it also holds
# categorical columns such as grade, and recent pandas versions raise on
# non-numeric input to corr() instead of silently dropping those columns.
loans.toPandas().select_dtypes(include=["number", "bool"]).corr()

In [None]:
## Distributions of the numerical variables and their pairwise relationships,
## coloured by home ownership status
sns.pairplot(data=loans.toPandas(), hue="home_ownership")

In [None]:
## Pairwise plots of the numerical variables, coloured by loan purpose
sns.pairplot(data=loans.toPandas(), hue="purpose")

In [None]:
## Pairwise plots of the numerical variables, coloured by loan grade
sns.pairplot(data=loans.toPandas(), hue="grade")

# Data Preparation

In [None]:
# Register the DataFrame as a temporary view named "loans" for Spark SQL
loans.createOrReplaceTempView("loans")

In [None]:
## Total number of missing values across all columns
loans.toPandas().isna().sum().sum()

In [None]:
## Show the DataFrame in the notebook output
display(loans)

In [None]:
## Loan amount and annual income are not normally distributed, so a log
## transformation is applied to move them towards normality
loans = loans.withColumn("loan_amnt_ln", log(loans.loan_amnt))

In [None]:
## Log-transform the annual income to move it towards a normal distribution
loans = loans.withColumn("annual_inc_ln", log(loans.annual_inc))

In [None]:
## Check the distributions of the two log-transformed variables
sns.pairplot(loans.select('annual_inc_ln', 'loan_amnt_ln').toPandas())

In [None]:
## Preview the first five rows, now including the log-transformed columns.
# Limit on the Spark side first so only five rows are collected to the driver
# instead of converting the whole dataset just to show its head.
loans.limit(5).toPandas()

In [None]:
# Set up one StringIndexer per categorical (string) column; each writes the
# numeric index of the column's labels to "<column>_num"
strInd_g, strInd_o, strInd_p = (
    StringIndexer(inputCol=column, outputCol=column + "_num")
    for column in ("grade", "home_ownership", "purpose")
)

In [None]:
# Fit each indexer on the loans data to obtain the corresponding model
strInd_gl, strInd_ol, strInd_pl = (
    indexer.fit(loans) for indexer in (strInd_g, strInd_o, strInd_p)
)

In [None]:
# Apply the fitted indexers in turn (grade, home ownership, purpose) so that
# loans gains the three "*_num" columns
loans = strInd_pl.transform(
    strInd_ol.transform(
        strInd_gl.transform(loans)
    )
)

In [None]:
## Preview the first five rows, now including the indexed "*_num" columns.
# Limit on the Spark side first so only five rows are collected to the driver
# instead of converting the whole dataset just to show its head.
loans.limit(5).toPandas()

In [None]:
# Set up one OneHotEncoder per indexed column; each reads "<column>_num" and
# writes the encoded vector to "<column>_cat"
encoderInd_gl, encoderInd_ol, encoderInd_pl = (
    OneHotEncoder(inputCol=column + "_num", outputCol=column + "_cat")
    for column in ("grade", "home_ownership", "purpose")
)

In [None]:
# Fit each encoder on the loans data to obtain the corresponding model
encoderInd_gl_loans_fit, encoderInd_ol_loans_fit, encoderInd_pl_loans_fit = (
    encoder.fit(loans)
    for encoder in (encoderInd_gl, encoderInd_ol, encoderInd_pl)
)

In [None]:
# Apply the fitted encoders in turn (grade, home ownership, purpose) so that
# loans gains the three "*_cat" columns
loans = encoderInd_pl_loans_fit.transform(
    encoderInd_ol_loans_fit.transform(
        encoderInd_gl_loans_fit.transform(loans)
    )
)

In [None]:
## Preview the first five rows, now including the encoded "*_cat" columns.
# Limit on the Spark side first so only five rows are collected to the driver
# instead of converting the whole dataset just to show its head.
loans.limit(5).toPandas()

In [None]:
## Correlation matrix of the numerical columns after data preparation
# Restrict the frame to numeric (and boolean) columns explicitly: it also holds
# categorical and encoded "*_cat" columns, and recent pandas versions raise on
# non-numeric input to corr() instead of silently dropping those columns.
loans.toPandas().select_dtypes(include=["number", "bool"]).corr()

In [None]:
## Specification of the Vector Assembler: combine the log-transformed loan
## amount, the term, the log-transformed annual income and the encoded grade
## into a single 'features' column
# NOTE(review): home_ownership_cat and purpose_cat are created earlier in this
# notebook but are not listed in inputCols - confirm this is intentional.
vectorAssembler = VectorAssembler(inputCols = ['loan_amnt_ln', 'term', 'annual_inc_ln', 'grade_cat'], 
                                  outputCol = 'features')

In [None]:
## Run the assembler on loans; the result carries the 'features' column
loans_v = vectorAssembler.transform(loans)

In [None]:
## Keep only the feature vector and the target column
loans_v = loans_v.select('features', 'int_rate')

In [None]:
## Preview the first five rows of the modelling DataFrame.
# Limit on the Spark side first so only five rows are collected to the driver
# instead of converting the whole dataset just to show its head.
loans_v.limit(5).toPandas()

# Modeling

In [None]:
## Random 70/30 split into training and test data, with a fixed seed
training_data, test_data = loans_v.randomSplit([0.7, 0.3], seed=10)

In [None]:
## Preview the first three rows of the training data.
# Limit on the Spark side first so only three rows are collected to the driver
# instead of converting the whole training set just to show its head.
training_data.limit(3).toPandas()

In [None]:
## Configure the linear regression:
##   featuresCol - column holding the independent variables
##   labelCol    - target variable
##   maxIter     - maximum number of iterations
lr = LinearRegression(featuresCol='features', labelCol='int_rate', maxIter=20)

In [None]:
## Fit the model on the training data and report the estimated parameters
linearModel = lr.fit(training_data)
print("Slope: ", linearModel.coefficients)
print("Intercept: ", linearModel.intercept)

# Model Evaluation

In [None]:
## R² taken from the fitted model's summary
print("R²:", linearModel.summary.r2)

In [None]:
## Adjusted R² taken from the fitted model's summary.
# Labelled explicitly so it cannot be confused with the plain R² printed in
# the previous cell (the original used the same "R²:" label for both).
print("Adjusted R²:", linearModel.summary.r2adj)

In [None]:
## Summary statistics of the target variable (int_rate).
# The original referenced an undefined name `target`, which raises NameError;
# select the target column from the modelling DataFrame instead.
loans_v.select('int_rate').describe().show()

In [None]:
## Show the first rows of the residuals from the model summary
linearModel.summary.residuals.show()

In [None]:
## Distribution of the residuals (histogram with a kernel density estimate).
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" is the documented replacement for its default output.
# The residuals DataFrame has a single column, taken here by position.
sns.histplot(linearModel.summary.residuals.toPandas().iloc[:, 0],
             kde=True, stat="density")

In [None]:
## Report the fit metrics from the model summary.
# The value printed is the root mean squared error (RMSE); "RMSEA" names a
# different statistic, so the label is corrected.
trainingSummary = linearModel.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("R²:%f" % trainingSummary.r2)

In [None]:
## Apply the fitted model to the held-out test data
prediction = linearModel.transform(test_data)

In [None]:
## Inspect the first row of the model output on the test data
prediction.head()

In [None]:
## Evaluate the test-set predictions against int_rate; the evaluator's
## default prediction column and metric (RMSE) are spelled out explicitly
RegressionEvaluator(
    labelCol='int_rate',
    predictionCol='prediction',
    metricName='rmse',
).evaluate(prediction)