In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.regression import LinearRegression

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
spark

In [6]:
# reading the data:
filename = "data/bank-full.csv"

In [7]:
# the CSV is ';'-separated with a header row; let Spark infer the column types:
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')

In [11]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [17]:
# assembling individual columns to one column - 'features'
def assemble_vector(df, features_list, target_variable_name):
    """Pack the feature columns of ``df`` into one vector column named 'features'.

    Args:
        df: Spark DataFrame holding the target column and every feature column.
        features_list: list of column names to combine into the vector.
        target_variable_name: name of the target column to carry through.

    Returns:
        A DataFrame with the target column first, then the assembled
        'features' column, then each of the individual feature columns.
    """
    vector_stage = VectorAssembler(inputCols=features_list, outputCol='features')

    # output layout: target, assembled vector, then the raw feature columns
    output_columns = [target_variable_name, 'features'] + features_list

    # a one-stage pipeline whose only step is the vector assembler
    fitted_pipeline = Pipeline(stages=[vector_stage]).fit(df)
    return fitted_pipeline.transform(df).select(output_columns)

In [12]:
# select the columns used for the regression (the integer columns of the schema);
# 'balance' is the value to predict:
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])
target_variable = 'balance'

In [13]:
# exclude target variable and select all the other features:
features_list = linear_df.columns
features_list.remove(target_variable)

In [14]:
features_list

['age', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [18]:
# applying the vector assembler on our dataframe:
df = assemble_vector(linear_df, features_list, target_variable)

In [19]:
df.printSchema()

root
 |-- balance: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)



In [25]:
# creating the linear regression model; the label column comes from
# target_variable so the target is defined in a single place:
linreg = LinearRegression(featuresCol='features', labelCol=target_variable)

In [26]:
# fitting the model:
linreg_model = linreg.fit(df)

In [27]:
# building a table of the assembled features (vector index and column name):
# the metadata may hold several attribute groups (one per key), so collect every
# group instead of keeping only the last one, then order the rows by vector
# index so they line up with the model coefficients
attr_groups = df.schema['features'].metadata['ml_attr']['attrs']
features_df = (
    pd.concat([pd.DataFrame(group) for group in attr_groups.values()], ignore_index=True)
    .sort_values('idx')
    .reset_index(drop=True)
)

In [29]:
# print the coefficient and intercepts:
print(f"Coefficients: \n{linreg_model.coefficients}")
print(f"Intercept: {linreg_model.intercept}")

Coefficients: 
[28.08397290892997,3.3055463619496286,0.24882841970901756,-14.142676297161454,-0.08248810233032043,23.462992800762525]
Intercept: 124.92130092818479


In [31]:
features_df['coefficients'] = linreg_model.coefficients

In [32]:
# looking at the coefficients:
features_df

Unnamed: 0,idx,name,coefficients
0,0,age,28.083973
1,1,day,3.305546
2,2,duration,0.248828
3,3,campaign,-14.142676
4,4,pdays,-0.082488
5,5,previous,23.462993


In [33]:
# prediction result:
pred_result = linreg_model.transform(df)

In [35]:
pred_result.show(5)

+-------+--------------------+---+---+--------+--------+-----+--------+------------------+
|balance|            features|age|day|duration|campaign|pdays|previous|        prediction|
+-------+--------------------+---+---+--------+--------+-----+--------+------------------+
|   2143|[58.0,5.0,261.0,1...| 58|  5|     261|       1|   -1|       0|1821.2034908050935|
|     29|[44.0,5.0,151.0,1...| 44|  5|     151|       1|   -1|       0| 1400.656743912082|
|      2|[33.0,5.0,76.0,1....| 33|  5|      76|       1|   -1|       0| 1073.070910435676|
|   1506|[47.0,5.0,92.0,1....| 47|  5|      92|       1|   -1|       0|  1470.22778587604|
|      1|[33.0,5.0,198.0,1...| 33|  5|     198|       1|   -1|       0|1103.4279776401763|
+-------+--------------------+---+---+--------+--------+-----+--------+------------------+
only showing top 5 rows



#### Multicollinearity <br>
- Multicollinearity happens when independent variables in the regression model are highly correlated with each other.
- It makes the interpretation of the model difficult.
- It can also create an overfitting problem.

##### We will use VIF - Variance Inflation Factor.
- A commonly used rule-of-thumb cutoff for VIF is 10.
- A VIF of 1 means the feature is completely uncorrelated with the other features.
- In our final model, we should only include the variables with a VIF value below 10.

In [36]:
# function to calculate the VIF:
def calculate_vif(df, features_list):
    """Compute the variance inflation factor (VIF) of every feature.

    Each feature in turn is used as the label of a linear regression on the
    remaining features; its VIF is ``1 / (1 - R^2)`` of that regression.

    Args:
        df: Spark DataFrame containing every column named in ``features_list``.
        features_list: names of the feature columns to evaluate.

    Returns:
        A list of VIF values in the same order as ``features_list``. A feature
        that is perfectly explained by the others (R^2 of 1) gets ``inf``; a
        feature with no other features to compare against gets ``1.0``.
    """
    vif_list = []
    for target_feature in features_list:
        # every other feature acts as a predictor of the current one
        predictors = [name for name in features_list if name != target_feature]

        if not predictors:
            # nothing to be collinear with: VIF is 1 by definition
            vif_list.append(1.0)
            continue

        assembler = VectorAssembler(inputCols=predictors, outputCol='features')
        temp_df = assembler.transform(df)
        reg = LinearRegression(featuresCol='features', labelCol=target_feature)
        r2 = reg.fit(temp_df).summary.r2

        # calculating the vif; guard against division by zero on perfect collinearity
        vif_list.append(float('inf') if r2 >= 1 else 1 / (1 - r2))
    return vif_list

In [37]:
# appending the vif in the features_df
features_df['vif'] = calculate_vif(linear_df, features_list)

In [38]:
# looking at the features_df:
features_df

Unnamed: 0,idx,name,coefficients,vif
0,0,age,28.083973,1.000917
1,1,day,3.305546,1.03435
2,2,duration,0.248828,1.007627
3,3,campaign,-14.142676,1.039907
4,4,pdays,-0.082488,1.276182
5,5,previous,23.462993,1.261321


*Since the VIF values of all the features are less than 10, there are no multicollinearity issues among our model features.*

In [39]:
# displaying the fitted model's repr (note: nothing is written to disk in this cell):
linreg_model

LinearRegressionModel: uid=LinearRegression_c61aff27dbfb, numFeatures=6