# RIDGE REGRESSION LINEAR REGRESSION

The following notebook is my code for my linear regression with ridge regression of the concrete data. First, I imported all necessary packages.

In [69]:
# importing pandas
import pandas as pd
#importing numpy
import numpy as np
#importing matplotlib
import matplotlib.pyplot as plt
#importing seaborn
import seaborn as sb
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling

This is to confirm the data is in the correct directory.

In [70]:
from os.path import exists
data_dir = 'data'
#data_dir = 'Documents/Users/kobo/514-assignment-1/data'
assert exists(f'{data_dir}/Concrete_Data.csv'), 'concrete data file is missing.'
data = pd.read_csv(f'{data_dir}/Concrete_Data.csv')

The following reads the data file and prints out all the feature attributes in the dataset.

In [71]:
data.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

The following cell replaces any values that are missing or nan with the mean of that feature. 

In [36]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 
imputer = imputer.fit(data.iloc[:, 1:])
data.iloc[:, 1:] = imputer.transform(data.iloc[:, 1:])

The following cell separates the dataset into its input features (X) and its output feature (Y). In addition, I standardized all features by subtracting each feature's value from the feature's mean and then dividing that by the feature's standard deviation. When running my code for the univariate ridge regression models, I would just switch our the feature name from X and run the rest of my code from there. 

In [72]:
X=data[['Cement (component 1)(kg in a m^3 mixture)']]
Y=data['Concrete compressive strength(MPa, megapascals) ']
py=np.array((Y-Y.mean())/Y.std())
pX=X.apply(lambda rec:(rec-rec.mean())/rec.std(),axis=0)

The following cell shows the standardization of my feature values. 

In [73]:
pX

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture)
0,2.476712
1,2.476712
2,0.491187
3,0.491187
4,-0.790075
...,...
1025,-0.045623
1026,0.392628
1027,-1.269472
1028,-1.168042


The following cells splits the dataset into its training and testing sets for BOTH the processed and raw data. This way we can compare the two later on.

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.126, random_state=4)

In [75]:
pX_train, pX_test, py_train, py_test = train_test_split(pX, py, test_size=0.126, random_state=4)

In [76]:
X_test

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture)
522,284.0
701,288.0
563,210.7
678,288.0
98,475.0
...,...
157,362.6
910,144.0
102,531.3
170,388.6


The following cells define the cost function and gradient descent for my ridge regression. 

In [78]:
class ridgeRegression():
    def initialization(self, learning_rate, iterations, l2_penality) :
        self.learning_rate = learning_rate        
        self.iterations = iterations        
        self.l2_penality = l2_penality
    def fit(self, X, Y):     
        self.m, self.n = X.shape     
        self.W = np.zeros(self.n)
        self.b = 0        
        self.X = X        
        self.Y = Y                
        for i in range(self.iterations):            
            self.update_weights()            
        return self 
    def update_weights(self):           
        Y_pred = self.predict(self.X)  
        dw = (-(2*(self.X.T).dot(self.Y - Y_pred)) + (2*self.l2_penality*self.W))/self.m     
        db = - 2*np.sum(self.Y - Y_pred)/self.m    
        self.W = self.W - self.learning_rate*dw    
        self.b = self.b - self.learning_rate*db        
        return self
    def predict(self,X):    
        return X.dot(self.W) + self.b

In [82]:
pX_train_model = RidgeRegression(iterations = 1000, learning_rate = 0.001, l2_penality = 1)
pX_test_model = RidgeRegression(iterations = 1000, learning_rate = 0.001, l2_penality = 1)
X_train_model = RidgeRegression(iterations = 1000, learning_rate = 0.001, l2_penality = 1)
X_test_model = RidgeRegression(iterations = 1000, learning_rate = 0.001, l2_penality = 1)
pX_train_model.fit(pX_train, py_train)
pX_test_model.fit(pX_test, py_test)
X_train_model.fit(X_train, y_train)
X_test_model.fit(X_test, y_test)
      
pX_train_pred = model.predict(pX_train)    
print( "Pre-Processed Trained Predicted values ", pX_train_pred[:3] )     
print( "Pre-Processed Trained Real values      ", py_train[:3] )    
print( "Pre-Processed Trained Trained W        ", pX_train_model.W[0], 2 )    
print( "Pre-Processed Trained Trained b        ", pX_train_model.b, 2)       

pX_test_pred = model.predict(pX_test)    
print( "Pre-Processed Test Predicted values ", pX_test_pred[:3] )     
print( "Pre-Processed Test Real values      ", py_test[:3] )    
print( "Pre-Processed Test Trained W        ", pX_test_model.W[0], 2 )    
print( "Pre-Processed Test Trained b        ", pX_test_model.b, 2) 

X_train_pred = model.predict(X_train)    
print( "Raw Trained Predicted values ", X_train_pred[:3] )     
print( "Raw Trained Real values      ", y_train[:3] )    
print( "Raw Trained Trained W        ", X_train_model.W[0], 2 )    
print( "Raw Trained Trained b        ", X_train_model.b, 2)   

X_test_pred = model.predict(X_test)    
print( "Raw Test Predicted values ", X_test_pred[:3] )     
print( "Raw Test Real values      ", y_test[:3] )    
print( "Raw Test Trained W        ", X_test_model.W[0], 2 )    
print( "Raw Test Trained b        ", X_test_model.b, 2)   
      

Pre-Processed Trained Predicted values  936   -0.204849
611   -0.013439
766    0.502078
dtype: float64
Pre-Processed Trained Real values       [-0.4302689  -0.87143458 -0.47276925]
Pre-Processed Trained Trained W         0.43331667965248877 2
Pre-Processed Trained Trained b         0.004342333787404326 2
Pre-Processed Test Predicted values  522    0.019974
701    0.039067
563   -0.329910
dtype: float64
Pre-Processed Test Real values       [ 0.52090107  0.88065761 -0.83791317]
Pre-Processed Test Trained W         0.4052481279557295 2
Pre-Processed Test Trained b         -0.028528753727753296 2
Raw Trained Predicted values  936    118.181855
611    138.185373
766    192.060186
dtype: float64
Raw Trained Real values       936    28.63
611    21.26
766    27.92
Name: Concrete compressive strength(MPa, megapascals) , dtype: float64
Raw Trained Trained W         nan 2
Raw Trained Trained b         -inf 2
Raw Test Predicted values  522    141.677259
701    143.672623
563    105.112224
dtype: 

The following cell is the variance explained function. 

In [83]:
def r2(X, y, y_pred):
    SSE = sum((y-y_pred)**2) # Sum of squared error
    SST = sum((y-np.mean(y))**2) # Sum of squared total
    n = len(X)
    MSE = SSE/n # Mean Squared Error
    R_squared = 1-(SSE/SST) # R Square
    return R_squared

The printed out variance explained values are in the order of (processed trained split, processed test split, raw train split, raw test split)

In [84]:
pX_train_r2 = r2(pX_train, py_train, pX_train_pred)
pX_test_r2 = r2(pX_test, py_test, pX_test_pred)
X_train_r2 = r2(X_train, y_train, X_train_pred)
X_test_r2 = r2(X_test, y_test, X_test_pred)
pX_train_r2, pX_test_r2, X_train_r2, X_test_r2

(0.25486999506420316,
 0.20157944853970822,
 -45.81717082401786,
 -45.38250060934351)