# Linear Regression using Pyspark

In [3]:
from pathlib import Path
home = "dbfs:/mnt/data"

In [4]:
#import Linear Regression from spark's MLlib
from pyspark.ml.regression import LinearRegression

In [5]:
df = spark.read.parquet(f"{home}/data/10-processed-data.parquet")


In [6]:
df.columns

['Client_Education',
 'Employed_Days',
 'Age_Days',
 'Client_Income_Type',
 'Client_Gender',
 'Car_Owned',
 'ID_Days',
 'Score_Source_2',
 'Phone_Change',
 'Default',
 'Client_Education_index',
 'Client_Income_Type_index',
 'Client_Gender_index',
 'Client_Education_index_OHE',
 'Client_Income_Type_index_OHE',
 'Client_Gender_index_OHE',
 'features',
 'features_scaled']

In [7]:
df = df.selectExpr("features_scaled as features","Default")

In [8]:
df.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|features                                                                                                                                                                           |Default|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|[0.0029076532609796767,0.35840228245363764,0.0,0.053216618035292484,0.0047878169393908475,0.015053763440860216,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0]|0.0    |
|[0.011304802556106483,0.37009985734664763,1.0,0.0029178824510212586,0.002150633517531676,0.22988627268703973,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0]  |0.0    |
|[0.013968782427041722,0.5200570613409415,0.0,0.04

In [9]:
#split the data into 70/30 ratio for train test purpose
train_df,test_df=df.randomSplit([0.7,0.3])

## Build Linear Regression Model 

In [10]:
#Build Linear Regression model 
lin_Reg=LinearRegression(labelCol='Default')

In [11]:
#fit the linear regression model on training data set 
lr_model=lin_Reg.fit(train_df)

In [12]:
lr_model.intercept

0.0

In [13]:
print(lr_model.coefficients)

[0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]


In [14]:
train_df

DataFrame[features: vector, Default: double]

In [15]:
training_predictions=lr_model.evaluate(train_df)

In [16]:
training_predictions.meanSquaredError

0.0

In [18]:
#make predictions on test data 
test_results=lr_model.evaluate(test_df)

In [19]:
#view the residual errors based on predictions 
test_results.residuals.show(10)

+---------+
|residuals|
+---------+
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
+---------+
only showing top 10 rows

