# Ridge Regressor

In [2]:
import pandas as pd 

data = pd.read_csv("../data/2008_births.csv")
data


Unnamed: 0,INST,RPLACE,RCOUNTY,PLURAL,BDATE,BMONTH,BDAY,BYEAR,SEX,RACE,...,MOTHERTR,IANEMIA,BINJURY,FAS,HYALINE,ASPIRATE,VENTLESS,VENTMORE,ISEIZURE,OTHINF
0,1,6800,68,1,2008-01-01,1,1,2008,2,1,...,2,0,0,0,0,0,0,0,0,0
1,1,160,1,1,2008-01-02,1,2,2008,2,2,...,2,0,0,0,0,0,0,0,0,0
2,1,190,1,1,2008-01-02,1,2,2008,1,1,...,2,0,0,0,0,0,0,0,0,0
3,1,4100,41,1,2008-01-03,1,3,2008,2,1,...,2,0,0,0,0,0,0,0,0,0
4,1,160,1,1,2008-01-03,1,3,2008,2,1,...,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133417,1,2000,20,1,2008-12-19,12,19,2008,1,1,...,2,9,9,9,9,9,9,9,9,9
133418,1,2000,20,1,2008-12-22,12,22,2008,2,1,...,2,9,9,9,9,9,9,9,9,9
133419,1,2600,26,1,2008-12-26,12,26,2008,1,1,...,2,9,9,9,9,9,9,9,9,9
133420,1,2000,20,1,2008-12-30,12,30,2008,2,1,...,2,9,9,9,9,9,9,9,9,9


In [3]:
import numpy as np 
from sklearn.model_selection import train_test_split


In [4]:
#birthweight is what we want to predict - change this to single target 
birth_weight = data[['BPOUND', 'BOUNCE']] 
birth_weight = birth_weight.assign(total_weight = lambda x: birth_weight['BPOUND'] + (birth_weight['BOUNCE']/16))
birth_weight = birth_weight.drop(['BPOUND', 'BOUNCE'], axis = 1) 
birth_weight

Unnamed: 0,total_weight
0,4.0625
1,8.1875
2,9.0000
3,7.3750
4,9.4375
...,...
133417,6.5000
133418,9.1250
133419,8.4375
133420,5.8125


In [5]:
#PCA might be a good technique to select predictors 

#note that PCA performs best when data is normalized (range b/w 0 and 1)

#It is possible to use categorical and continuous predictors 
#for a regression problem. My understanding is you need to make 
#dummy variables for the binary predictors. 

#Variables that we will need to deal with: 
# BDATE, HISPMOM, HISPDAD

In [6]:
#Attempting PCA on data
#for now I drop the BDATE, HISPMOM AND HISPDAD
data_drop = data.drop(["BDATE", "HISPMOM", "HISPDAD", "BOUNCE", "BPOUND"], axis = 1) #axis = 1 means to drop column not row

In [7]:
#get a list of columns in pandas object 
names_of_data = data_drop.columns.tolist()

#shuffle = false prevents data split being different everytime
X_train, X_test, y_train, y_test = train_test_split(data_drop, birth_weight, test_size = 0.3, shuffle = False)

#split test into validate and test, again making sure the data is always the same for consistency
##X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle = False)

#Normalizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#running the actual PCA
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

#relief f algorithm - sorting features 

In [8]:
explained_variance = pca.explained_variance_ratio_
print(len(explained_variance))
print(explained_variance)

120
[3.87288924e-02 3.55339310e-02 3.03713410e-02 2.81691759e-02
 2.41216889e-02 2.07180942e-02 1.81372550e-02 1.70972049e-02
 1.69290075e-02 1.60150783e-02 1.56583072e-02 1.35738394e-02
 1.33303908e-02 1.13091674e-02 1.11939876e-02 1.09928757e-02
 1.06871775e-02 1.03259159e-02 1.02796676e-02 1.01730148e-02
 1.00345282e-02 9.84745605e-03 9.83152893e-03 9.62060504e-03
 9.52270822e-03 9.42600068e-03 9.25986763e-03 9.21933586e-03
 9.20086644e-03 9.14330761e-03 9.06696393e-03 9.01414255e-03
 8.90688724e-03 8.86024117e-03 8.82981455e-03 8.79801322e-03
 8.72969271e-03 8.69514787e-03 8.66374597e-03 8.62414125e-03
 8.61977642e-03 8.56876251e-03 8.54604923e-03 8.53030183e-03
 8.49909388e-03 8.49553427e-03 8.41774283e-03 8.37568421e-03
 8.33155392e-03 8.32606065e-03 8.30919675e-03 8.26475489e-03
 8.23807403e-03 8.20821670e-03 8.17471653e-03 8.15917081e-03
 8.10378417e-03 8.08035458e-03 8.05437293e-03 8.04166826e-03
 7.96726476e-03 7.95313153e-03 7.92987833e-03 7.87519591e-03
 7.81298857e-03 7.78

In [9]:
#Explained variance prints the variance each principal component contributes.
#As we can see, the last 5 contribute very little (maybe we can get rid of?)

#We also want to check for linearity between the input predictors and the output 
#If there is high colinearity, then we want to use ridge regression - A variant of lin regression that has regulatization

#Correlation indicates strength and direction of a linear relationship. let's use this on the predictors 

In [10]:
from sklearn.linear_model import Ridge

rr = Ridge(alpha = 1.0)

In [11]:
rr.fit(X_train, y_train)
print(type(X_train))

<class 'numpy.ndarray'>


In [32]:
y_pred = rr.predict(X_test)
y_pred

array([[ 7.22753045],
       [ 7.20978981],
       [ 7.21724354],
       ...,
       [12.7359363 ],
       [10.53378616],
       [11.65627366]])

In [33]:
y_test

Unnamed: 0,total_weight
26684,7.5000
26685,7.0000
26686,7.3750
26687,8.5625
26688,4.8750
...,...
133417,6.5000
133418,9.1250
133419,8.4375
133420,5.8125


In [34]:
from sklearn.metrics import mean_squared_error as mse
mse(y_test, y_pred)

0.1412431166781336