# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the dataset

In [2]:
# Read the credit-card CSV; the path is relative to the working directory.
dataset = pd.read_csv('excel/credit_card.csv')

In [3]:
# Preview the first five rows to check that the file parsed as expected.
dataset.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1


In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
 15  Approved        690 non-null    int64  
dtypes: float64(3), int64(10), object(3)
memory usage: 86.4+ KB


In [57]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,ZipCode,Income,Approved
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,0.695652,31.514116,4.758725,0.76087,0.763768,2.223406,0.523188,0.427536,2.4,0.457971,180.547826,1017.385507,0.444928
std,0.460464,11.860245,4.978163,0.426862,0.425074,3.346513,0.499824,0.49508,4.86294,0.498592,173.970323,5210.102598,0.497318
min,0.0,13.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,22.67,1.0,1.0,1.0,0.165,0.0,0.0,0.0,0.0,60.0,0.0,0.0
50%,1.0,28.46,2.75,1.0,1.0,1.0,1.0,0.0,0.0,0.0,160.0,5.0,0.0
75%,1.0,37.7075,7.2075,1.0,1.0,2.625,1.0,1.0,3.0,1.0,272.0,395.5,1.0
max,1.0,80.25,28.0,1.0,1.0,28.5,1.0,1.0,67.0,1.0,2000.0,100000.0,1.0


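#### Observations from the summary statistics above
#### The middle half of the applicants are between about 23 and 38 years old (median age about 28.5)
#### The middle half of the debt values fall between 1.0 and about 7.2 (median 2.75)
#### Slightly more than half of the people (about 52%) have a prior default on their credit cards (PriorDefault = 1)
#### A small majority (about 55.5%) does not get approved (the mean of Approved is about 0.44)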

## Clean up the data

In [5]:
# Count the missing entries in every column (isna is the same check as isnull).
dataset.isna().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
Industry          0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
Approved          0
dtype: int64

### There are no missing values

### Need to drop columns that are not necessary

In [7]:
# List the column names so the ones to drop can be picked by name.
dataset.columns

Index(['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'Industry',
       'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore',
       'DriversLicense', 'Citizen', 'ZipCode', 'Income', 'Approved'],
      dtype='object')

In [10]:
# DataFrame.drop returns a new frame and leaves `dataset` untouched, so the
# result is bound to a name here instead of being thrown away after display.
# `dataset` itself still holds every original column.
COLUMNS_TO_DROP = ['YearsEmployed', 'CreditScore', 'DriversLicense', 'Citizen', 'ZipCode']
dataset_reduced = dataset.drop(columns=COLUMNS_TO_DROP)
dataset_reduced

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,PriorDefault,Employed,Income,Approved
0,1,30.83,0.000,1,1,Industrials,White,1,1,0,1
1,0,58.67,4.460,1,1,Materials,Black,1,1,560,1
2,0,24.50,0.500,1,1,Materials,Black,1,0,824,1
3,1,27.83,1.540,1,1,Industrials,White,1,1,3,1
4,1,20.17,5.625,1,1,Industrials,White,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
685,1,21.08,10.085,0,0,Education,Black,0,0,0,0
686,0,22.67,0.750,1,1,Energy,White,0,1,394,0
687,0,25.25,13.500,0,0,Healthcare,Latino,0,1,1,0
688,1,17.92,0.205,1,1,ConsumerStaples,White,0,0,750,0


## Split the dataset into independent and Dependent variables

In [12]:
# List the available column names before choosing the features and the target.
dataset.columns

Index(['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'Industry',
       'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore',
       'DriversLicense', 'Citizen', 'ZipCode', 'Income', 'Approved'],
      dtype='object')

In [13]:
# Independent variables: the predictor columns, selected by name.
feature_columns = [
    'Gender', 'Age', 'Debt', 'Married', 'BankCustomer',
    'Industry', 'Ethnicity', 'PriorDefault', 'Income',
]
X = dataset.loc[:, feature_columns]

# Dependent variable: kept as a one-column DataFrame (2-D) rather than a Series.
y = dataset.loc[:, ['Approved']]

## Handling Categorical Variables

In [14]:
# One-hot encode the text columns (Industry, Ethnicity); the numeric columns
# pass through unchanged. dtype=int keeps the indicator columns as 0/1 integers
# on every pandas version (newer releases default to bool), so the feature
# matrix handed to the regressor is purely numeric.
X = pd.get_dummies(
    dataset[['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'Industry',
             'Ethnicity', 'PriorDefault', 'Income']],
    dtype=int,
)

In [15]:
# Preview the encoded feature matrix, including the new Industry_* / Ethnicity_* columns.
X.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,PriorDefault,Income,Industry_CommunicationServices,Industry_ConsumerDiscretionary,Industry_ConsumerStaples,...,Industry_Materials,Industry_Real Estate,Industry_Research,Industry_Transport,Industry_Utilities,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Latino,Ethnicity_Other,Ethnicity_White
0,1,30.83,0.0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,58.67,4.46,1,1,1,560,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0,24.5,0.5,1,1,1,824,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,27.83,1.54,1,1,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,20.17,5.625,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Experiment 1

## Splitting the dataset into the Training set, Test set and validation set

In [26]:
from sklearn.model_selection import train_test_split

# First split: 90% of the rows go to training, 10% are held out as "rest".
# The fixed random_state makes the shuffle repeatable.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.1, random_state=42
)

Using a test size of .1: 10% of the rows are held out of training (`X_rest`, `y_rest`).

In [27]:
# Second split, applied to the held-out rows only: the first returned part
# (90% of them) is the test set, the test_size part (10%) is the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_rest, y_rest, test_size=0.1, random_state=42
)

Using a test size of .1 again: 10% of the held-out rows become the validation set and the other 90% the test set.

## Training the Multiple Linear Regression model on the Training set

In [28]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training rows. The features are
# passed as a bare array (no column names), as the predict cells do too.
regressor = LinearRegression()
regressor.fit(X_train.to_numpy(), y_train)

LinearRegression()

## Validation set

In [29]:
# Predict the target for the validation rows.
y_predval = regressor.predict(X_val.to_numpy())

In [30]:
# Actual validation targets as a NumPy array, for comparison with the predictions.
y_validate = y_val.to_numpy()

In [31]:
# Side-by-side comparison, rounded to whole numbers:
# column 0 = predicted value, column 1 = actual value.
np.round(np.hstack((y_predval, y_validate)))

array([[-0.,  0.],
       [-0.,  0.],
       [-0.,  0.],
       [-0.,  1.],
       [ 0.,  1.],
       [ 0.,  0.],
       [ 0.,  0.]])

## Predicting the Test set results

In [32]:
# Predict the target for the test rows; these predictions are scored below.
y_pred = regressor.predict(X_test.to_numpy())

### RMSE and R Square

In [33]:
from sklearn.metrics import mean_squared_error, r2_score
import math

In [34]:
# Score the test-set predictions; the MSE is computed once and reused for the RMSE.
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print(f"r-square: {r_squared: .2f}")
print(f"MSE: {mse: .2f}")
print(f"RMSE: {rmse: .2f}")

r-square:  0.48
MSE:  0.13
RMSE:  0.36


# Experiment 2

## Splitting the dataset into the Training set, Test set and validation set

In [35]:
from sklearn.model_selection import train_test_split

# First split: 85% of the rows go to training, 15% are held out as "rest".
# The fixed random_state makes the shuffle repeatable.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.15, random_state=42
)

Using a test size of .15: 15% of the rows are held out of training (`X_rest`, `y_rest`).

In [36]:
# Second split, applied to the held-out rows only: the first returned part
# (85% of them) is the test set, the test_size part (15%) is the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_rest, y_rest, test_size=0.15, random_state=42
)

Using a test size of .15 again: 15% of the held-out rows become the validation set and the other 85% the test set.

## Training the Multiple Linear Regression model on the Training set

In [37]:
from sklearn.linear_model import LinearRegression

# Refit a fresh ordinary least-squares model on this experiment's training rows.
# The features are passed as a bare array (no column names).
regressor = LinearRegression()
regressor.fit(X_train.to_numpy(), y_train)

LinearRegression()

## Validation Set

In [38]:
# Predict the target for this experiment's validation rows.
y_predval = regressor.predict(X_val.to_numpy())

In [39]:
# Actual validation targets as a NumPy array, for comparison with the predictions.
y_validate = y_val.to_numpy()

In [40]:
# Side-by-side comparison, rounded to whole numbers:
# column 0 = predicted value, column 1 = actual value.
np.round(np.hstack((y_predval, y_validate)))

array([[ 1.,  1.],
       [ 1.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  1.],
       [ 1.,  0.],
       [ 1.,  1.],
       [-0.,  0.],
       [ 0.,  0.],
       [-0.,  0.],
       [ 0.,  0.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [-0.,  1.]])

## Predicting the Test set results

In [41]:
# Predict the target for the test rows; these predictions are scored below.
y_pred = regressor.predict(X_test.to_numpy())

### RMSE and R Square

In [42]:
from sklearn.metrics import mean_squared_error, r2_score
import math

In [43]:
# Score the test-set predictions; the MSE is computed once and reused for the RMSE.
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print(f"r-square: {r_squared: .2f}")
print(f"MSE: {mse: .2f}")
print(f"RMSE: {rmse: .2f}")

r-square:  0.46
MSE:  0.13
RMSE:  0.37


#### Compared with Experiment 1, the R-square decreased slightly (0.48 to 0.46), the MSE stayed at 0.13 (to two decimals) and the RMSE rose slightly (0.36 to 0.37) after the change in split ratio

# Experiment 3

## Splitting the dataset into the Training set, Test set and validation set

In [58]:
from sklearn.model_selection import train_test_split

# First split: 80% of the rows go to training, 20% are held out as "rest".
# The fixed random_state makes the shuffle repeatable.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Using a test size of .2: 20% of the rows are held out of training (`X_rest`, `y_rest`).

In [59]:
# Second split, applied to the held-out rows only: the first returned part
# (80% of them) is the test set, the test_size part (20%) is the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=42
)

Using a test size of .2 again: 20% of the held-out rows become the validation set and the other 80% the test set.

## Training the Multiple Linear Regression model on the Training set

In [60]:
from sklearn.linear_model import LinearRegression

# Refit a fresh ordinary least-squares model on this experiment's training rows.
# The features are passed as a bare array (no column names).
regressor = LinearRegression()
regressor.fit(X_train.to_numpy(), y_train)

LinearRegression()

## Validation Set

In [61]:
# Predict the target for this experiment's validation rows.
y_predval = regressor.predict(X_val.to_numpy())

In [62]:
# Actual validation targets as a NumPy array, for comparison with the predictions.
y_validate = y_val.to_numpy()

In [63]:
# Side-by-side comparison, rounded to whole numbers:
# column 0 = predicted value, column 1 = actual value.
np.round(np.hstack((y_predval, y_validate)))

array([[ 1.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  1.],
       [ 0.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 0.,  0.],
       [ 1.,  0.],
       [ 0.,  0.],
       [ 1.,  1.],
       [ 0.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 0.,  0.],
       [ 1.,  1.],
       [ 0.,  0.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [-0.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 2.,  1.],
       [ 1.,  1.],
       [-0.,  0.],
       [ 0.,  0.]])

## Predicting the Test set results

In [64]:
# Predict the target for the test rows; these predictions are scored below.
y_pred = regressor.predict(X_test.to_numpy())

### RMSE and R Square

In [65]:
from sklearn.metrics import mean_squared_error, r2_score
import math

In [66]:
# Score the test-set predictions; the MSE is computed once and reused for the RMSE.
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print(f"r-square: {r_squared: .2f}")
print(f"MSE: {mse: .2f}")
print(f"RMSE: {rmse: .2f}")

r-square:  0.28
MSE:  0.18
RMSE:  0.42


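#### Compared with Experiment 2, the r-square decreased significantly (0.46 to 0.28) while the MSE (0.13 to 0.18) and RMSE (0.37 to 0.42) both increased after the change in split ratio, so the fit on the test set got worse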