In [1]:
import pandas as pd 
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

data = pd.read_csv("2008_births-Copy1.csv")
data


Unnamed: 0,INST,RPLACE,RCOUNTY,PLURAL,BDATE,BMONTH,BDAY,BYEAR,SEX,RACE,...,MOTHERTR,IANEMIA,BINJURY,FAS,HYALINE,ASPIRATE,VENTLESS,VENTMORE,ISEIZURE,OTHINF
0,1,6800,68,1,2008-01-01,1,1,2008,2,1,...,2,0,0,0,0,0,0,0,0,0
1,1,160,1,1,2008-01-02,1,2,2008,2,2,...,2,0,0,0,0,0,0,0,0,0
2,1,190,1,1,2008-01-02,1,2,2008,1,1,...,2,0,0,0,0,0,0,0,0,0
3,1,4100,41,1,2008-01-03,1,3,2008,2,1,...,2,0,0,0,0,0,0,0,0,0
4,1,160,1,1,2008-01-03,1,3,2008,2,1,...,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133417,1,2000,20,1,2008-12-19,12,19,2008,1,1,...,2,9,9,9,9,9,9,9,9,9
133418,1,2000,20,1,2008-12-22,12,22,2008,2,1,...,2,9,9,9,9,9,9,9,9,9
133419,1,2600,26,1,2008-12-26,12,26,2008,1,1,...,2,9,9,9,9,9,9,9,9,9
133420,1,2000,20,1,2008-12-30,12,30,2008,2,1,...,2,9,9,9,9,9,9,9,9,9


In [2]:
import numpy as np 
from sklearn.model_selection import train_test_split


In [3]:
#birthweight is what we want to predict - change this to single target 
birth_weight = data[['BPOUND', 'BOUNCE']] 
birth_weight

Unnamed: 0,BPOUND,BOUNCE
0,4,1
1,8,3
2,9,0
3,7,6
4,9,7
...,...,...
133417,6,8
133418,9,2
133419,8,7
133420,5,13


In [4]:
#PCA might be a good technique to select predictors 

#note that PCA performs best when data is normalized (range b/w 0 and 1)

#It is possible to use categorical and continuous predictors 
#for a regression problem. My understanding is you need to make 
#dummy variables for the binary predictors. 

#Variables that we will need to deal with: 
# BDATE, HISPMOM, HISPDAD

In [5]:
#Attempting PCA on data
#for now I drop the BDATE, HISPMOM AND HISPDAD
data_drop = data.drop(["BDATE", "HISPMOM", "HISPDAD", "BOUNCE", "BPOUND"], axis = 1) #axis = 1 means to drop column not row
data_drop.head()

Unnamed: 0,INST,RPLACE,RCOUNTY,PLURAL,BMONTH,BDAY,BYEAR,SEX,RACE,FAGE,...,MOTHERTR,IANEMIA,BINJURY,FAS,HYALINE,ASPIRATE,VENTLESS,VENTMORE,ISEIZURE,OTHINF
0,1,6800,68,1,1,1,2008,2,1,23,...,2,0,0,0,0,0,0,0,0,0
1,1,160,1,1,1,2,2008,2,2,23,...,2,0,0,0,0,0,0,0,0,0
2,1,190,1,1,1,2,2008,1,1,31,...,2,0,0,0,0,0,0,0,0,0
3,1,4100,41,1,1,3,2008,2,1,21,...,2,0,0,0,0,0,0,0,0,0
4,1,160,1,1,1,3,2008,2,1,26,...,2,0,0,0,0,0,0,0,0,0


In [6]:
#get a list of columns in pandas object 
names_of_data = data_drop.columns.tolist()

#shuffle = false prevents data split being different everytime
X_train, X_test, y_train, y_test = train_test_split(data_drop, birth_weight, test_size = 0.2, shuffle = False)

#split test into validate and test, again making sure the data is always the same for consistency
#X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.25, shuffle = False)

#Normalizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#running the actual PCA
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

#relief f algorithm - sorting features 

In [7]:
explained_variance = pca.explained_variance_ratio_
#print(len(explained_variance))
#print(explained_variance)

In [8]:
#Explained variance prints the variance each principal component contributes.
#As we can see, the last 5 contribute very little (maybe we can get rid of?)

#We also want to check for linearity between the input predictors and the output 
#If there is high colinearity, then we want to use ridge regression - A variant of lin regression that has regulatization

#Correlation indicates strength and direction of a linear relationship. let's use this on the predictors 
y_train

Unnamed: 0,BPOUND,BOUNCE
0,4,1
1,8,3
2,9,0
3,7,6
4,9,7
...,...,...
106732,6,8
106733,7,12
106734,6,10
106735,7,6


In [9]:
#First type of model on this data! 
#add up pounds and ounces

y_train = y_train['BPOUND'] + y_train['BOUNCE']*0.0625
y_test = y_test['BPOUND'] + y_test['BOUNCE']*0.0625

#lets use a neural network for this example
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(120, input_dim=120, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model
# evaluate model with standardized dataset
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X_test, y_test, cv=kfold)
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Standardized: -151119399564005733311264587776.00 (453358198692017164749421674496.00) MSE


In [10]:
predicted_y = lin_reg_mod.predict(X_test)


NameError: name 'lin_reg_mod' is not defined

In [None]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
predicted_y = ridge.predict(X_test)

In [None]:
mean_squared_error(y_test, predicted_y)