### Import Libs

In [1]:
#Standards
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chisquare, zscore
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 70)
%matplotlib inline

#Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn import metrics


### Import data and inspect dataframe

In [2]:
df_raw = pd.read_csv('crx.csv')
df_raw.head(100)

Unnamed: 0,a1,a2,s3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16
0,b,30.83,0.000,u,g,w,v,1.250,t,t,1,f,g,00202,0,+
1,a,58.67,4.460,u,g,q,h,3.040,t,t,6,f,g,00043,560,+
2,a,24.50,0.500,u,g,q,h,1.500,t,f,0,f,g,00280,824,+
3,b,27.83,1.540,u,g,w,v,3.750,t,t,5,t,g,00100,3,+
4,b,20.17,5.625,u,g,w,v,1.710,t,f,0,f,s,00120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,a,28.58,3.540,u,g,i,bb,0.500,t,f,0,t,g,00171,0,-
96,b,23.00,0.625,y,p,aa,v,0.125,t,f,0,f,g,00180,1,-
97,b,?,0.500,u,g,c,bb,0.835,t,f,0,t,s,00320,0,-
98,a,22.50,11.000,y,p,q,v,3.000,t,f,0,t,g,00268,0,-


In [3]:
#Getting feel for dataframe
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
a1     690 non-null object
a2     690 non-null object
s3     690 non-null float64
a4     690 non-null object
a5     690 non-null object
a6     690 non-null object
a7     690 non-null object
a8     690 non-null float64
a9     690 non-null object
a10    690 non-null object
a11    690 non-null int64
a12    690 non-null object
a13    690 non-null object
a14    690 non-null object
a15    690 non-null int64
a16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [4]:
#Getting feel for dataframe
df_raw.describe()

Unnamed: 0,s3,a8,a11,a15
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [5]:
#Instructions for module say to only use these columns in training: 's3','a8','a9','a10','a11','a12','a13','a15'
#Modifying the dataframe accordingly
df_raw = df_raw[['s3','a8','a9','a10','a11','a12','a13','a15', 'a2', 'a14']]
df_raw.head()

Unnamed: 0,s3,a8,a9,a10,a11,a12,a13,a15,a2,a14
0,0.0,1.25,t,t,1,f,g,0,30.83,202
1,4.46,3.04,t,t,6,f,g,560,58.67,43
2,0.5,1.5,t,f,0,f,g,824,24.5,280
3,1.54,3.75,t,t,5,t,g,3,27.83,100
4,5.625,1.71,t,f,0,f,s,0,20.17,120


In [6]:
#How many missing a2 and a14 rows are there?
a2_missing = df_raw[df_raw['a2'] == '?'].count()[0]
a14_missing = df_raw[df_raw['a14'] == '?'].count()[0]

print('There are ' + str(a2_missing) + ' a2 values missing and ' + str(a14_missing) + ' a14 values missing.')

There are 12 a2 values missing and 13 a14 values missing.


### Data Cleanup / Formatting
Converting categorical variables into dummies

In [7]:
#Create dataframe to be used in neural networks
df_model = df_raw

# Generate dummies for a9
df_model = pd.concat([df_model,pd.get_dummies(df_model['a9'],prefix="a9")],axis=1)
df_model.drop('a9', axis=1, inplace=True)

# Generate dummies for a10
df_model = pd.concat([df_model,pd.get_dummies(df_model['a10'],prefix="a10")],axis=1)
df_model.drop('a10', axis=1, inplace=True)

# Generate dummies for a12
df_model = pd.concat([df_model,pd.get_dummies(df_model['a12'],prefix="a12")],axis=1)
df_model.drop('a12', axis=1, inplace=True)

# Generate dummies for a13
df_model = pd.concat([df_model,pd.get_dummies(df_model['a13'],prefix="a13")],axis=1)
df_model.drop('a13', axis=1, inplace=True)

df_model.head()

Unnamed: 0,s3,a8,a11,a15,a2,a14,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s
0,0.0,1.25,1,0,30.83,202,0,1,0,1,1,0,1,0,0
1,4.46,3.04,6,560,58.67,43,0,1,0,1,1,0,1,0,0
2,0.5,1.5,0,824,24.5,280,0,1,1,0,1,0,1,0,0
3,1.54,3.75,5,3,27.83,100,0,1,0,1,0,1,1,0,0
4,5.625,1.71,0,0,20.17,120,0,1,1,0,1,0,0,0,1


In [8]:
#Creating my primary and holdout sets
a2_X_primary = pd.DataFrame(df_model[df_model['a2'] != '?'].drop(columns= ['a2', 'a14']))
a2_Y_primary = pd.DataFrame(df_model[df_model['a2'] != '?']['a2'])
a2_x_holdout = pd.DataFrame(df_model[df_model['a2'] == '?'].drop(columns= ['a2', 'a14']))
a2_y_holdout = pd.DataFrame(df_model[df_model['a2'] == '?']['a2'])

a14_X_primary = pd.DataFrame(df_model[df_model['a14'] != '?'].drop(columns= ['a2', 'a14']))
a14_Y_primary = pd.DataFrame(df_model[df_model['a14'] != '?']['a14'])
a14_x_holdout = pd.DataFrame(df_model[df_model['a14'] == '?'].drop(columns= ['a2', 'a14']))
a14_y_holdout = pd.DataFrame(df_model[df_model['a14'] == '?']['a14'])

#Removing question mark values from target holdout dataframes
a2_y_holdout = a2_y_holdout.replace('?', '')
a14_y_holdout = a14_y_holdout.replace('?', '')



Standardizing quantitative features here.



In [9]:
#Standardize all these variables
a2_X_primary['s3'] = zscore(a2_X_primary['s3'])
a2_X_primary['a8'] = zscore(a2_X_primary['a8'])
a2_X_primary['a11'] = zscore(a2_X_primary['a11'])
a2_X_primary['a15'] = zscore(a2_X_primary['a15'])

#Inspect updated model dataframe
# a2_X_primary.head()

#Standardize all these variables
a2_x_holdout['s3'] = zscore(a2_x_holdout['s3'])
a2_x_holdout['a8'] = zscore(a2_x_holdout['a8'])
a2_x_holdout['a11'] = zscore(a2_x_holdout['a11'])
a2_x_holdout['a15'] = zscore(a2_x_holdout['a15'])

#Inspect updated model dataframe
# a2_x_holdout.head()

#Standardize all these variables
a14_X_primary['s3'] = zscore(a14_X_primary['s3'])
a14_X_primary['a8'] = zscore(a14_X_primary['a8'])
a14_X_primary['a11'] = zscore(a14_X_primary['a11'])
a14_X_primary['a15'] = zscore(a14_X_primary['a15'])

#Inspect updated model dataframe
# a14_X_primary.head()

#Standardize all these variables
a14_x_holdout['s3'] = zscore(a14_x_holdout['s3'])
a14_x_holdout['a8'] = zscore(a14_x_holdout['a8'])
a14_x_holdout['a11'] = zscore(a14_x_holdout['a11'])
a14_x_holdout['a15'] = zscore(a14_x_holdout['a15'])

#Inspect updated model dataframe
a14_x_holdout.head()

Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s
71,-0.048743,3.340367,-0.5,-0.351284,0,1,1,0,0,1,1,0,0
202,-0.316334,0.264533,2.5,-0.26709,0,1,0,1,1,0,1,0,0
206,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0
243,0.700513,0.402571,2.0,3.399006,0,1,0,1,1,0,1,0,0
270,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0


In [10]:
#Train test split
a2_x_train, a2_x_test, a2_y_train, a2_y_test = train_test_split(    
    a2_X_primary, a2_Y_primary, test_size=0.25, random_state=42)

a14_x_train, a14_x_test, a14_y_train, a14_y_test = train_test_split(    
    a14_X_primary, a14_Y_primary, test_size=0.25, random_state=42)

### Building Modeling

We begin with a2 first

In [11]:
# Build neural network: a2
model_a2 = Sequential()
model_a2.add(Dense(25, input_dim=a2_X_primary.shape[1], activation='relu')) # Hidden 1
model_a2.add(Dense(10, activation='relu')) # Hidden 2
model_a2.add(Dense(1)) # Output
model_a2.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto', 
                        restore_best_weights=True)
model_a2.fit(a2_x_train,a2_y_train,validation_data=(a2_x_test,a2_y_test),
          callbacks=[monitor],verbose=2,epochs=1000)

Train on 508 samples, validate on 170 samples
Epoch 1/1000
508/508 - 0s - loss: 1136.3227 - val_loss: 1102.7641
Epoch 2/1000
508/508 - 0s - loss: 1110.7952 - val_loss: 1074.7449
Epoch 3/1000
508/508 - 0s - loss: 1082.1012 - val_loss: 1042.9210
Epoch 4/1000
508/508 - 0s - loss: 1048.5339 - val_loss: 1005.5005
Epoch 5/1000
508/508 - 0s - loss: 1007.6701 - val_loss: 959.9692
Epoch 6/1000
508/508 - 0s - loss: 957.6908 - val_loss: 903.0574
Epoch 7/1000
508/508 - 0s - loss: 895.9024 - val_loss: 835.3571
Epoch 8/1000
508/508 - 0s - loss: 822.3426 - val_loss: 756.5160
Epoch 9/1000
508/508 - 0s - loss: 738.0366 - val_loss: 667.2033
Epoch 10/1000
508/508 - 0s - loss: 645.0674 - val_loss: 570.5864
Epoch 11/1000
508/508 - 0s - loss: 546.0407 - val_loss: 472.7677
Epoch 12/1000
508/508 - 0s - loss: 448.8121 - val_loss: 378.3871
Epoch 13/1000
508/508 - 0s - loss: 357.6690 - val_loss: 295.6578
Epoch 14/1000
508/508 - 0s - loss: 280.2615 - val_loss: 229.1490
Epoch 15/1000
508/508 - 0s - loss: 221.4535 

<tensorflow.python.keras.callbacks.History at 0x1a4348f7d0>

In [12]:
# Measure RMSE error for a2 prediction.  RMSE is common for regression.
pred_a2 = model_a2.predict(a2_x_test)
score_a2 = np.sqrt(metrics.mean_squared_error(pred_a2,a2_y_test))
print(f"Final score (RMSE): {score_a2}")

Final score (RMSE): 10.7324627259521


We'll now model a14 here.

In [13]:
# Build neural network: a14
model_a14 = Sequential()
model_a14.add(Dense(25, input_dim=a2_X_primary.shape[1], activation='relu')) # Hidden 1
model_a14.add(Dense(10, activation='relu')) # Hidden 2
model_a14.add(Dense(1)) # Output
model_a14.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto', 
                        restore_best_weights=True)
model_a14.fit(a2_x_train,a2_y_train,validation_data=(a2_x_test,a2_y_test),
          callbacks=[monitor],verbose=2,epochs=1000)

Train on 508 samples, validate on 170 samples
Epoch 1/1000
508/508 - 0s - loss: 1137.6778 - val_loss: 1108.8598
Epoch 2/1000
508/508 - 0s - loss: 1120.9323 - val_loss: 1087.9014
Epoch 3/1000
508/508 - 0s - loss: 1096.2357 - val_loss: 1055.4495
Epoch 4/1000
508/508 - 0s - loss: 1058.1519 - val_loss: 1008.3428
Epoch 5/1000
508/508 - 0s - loss: 1004.8621 - val_loss: 946.5835
Epoch 6/1000
508/508 - 0s - loss: 936.2861 - val_loss: 868.3352
Epoch 7/1000
508/508 - 0s - loss: 851.3974 - val_loss: 771.6925
Epoch 8/1000
508/508 - 0s - loss: 747.8267 - val_loss: 660.8182
Epoch 9/1000
508/508 - 0s - loss: 630.5491 - val_loss: 540.3999
Epoch 10/1000
508/508 - 0s - loss: 507.0199 - val_loss: 418.9196
Epoch 11/1000
508/508 - 0s - loss: 388.0316 - val_loss: 308.0667
Epoch 12/1000
508/508 - 0s - loss: 284.0918 - val_loss: 220.4796
Epoch 13/1000
508/508 - 0s - loss: 206.7725 - val_loss: 163.2542
Epoch 14/1000
508/508 - 0s - loss: 159.8121 - val_loss: 134.5930
Epoch 15/1000
508/508 - 0s - loss: 138.0988 

<tensorflow.python.keras.callbacks.History at 0x1a45486bd0>

In [14]:
# Measure RMSE error for a14 prediction.  RMSE is common for regression.
pred_a14 = model_a14.predict(a14_x_test)
score_a14 = np.sqrt(metrics.mean_squared_error(pred_a14,a14_y_test))
print(f"Final score (RMSE): {score_a14}")

Final score (RMSE): 274.58457055613917


In [15]:
#How good/bad are 10 and 220 for a2 and a14 RMSE's? Summary stats can give us context
a2_Y_primary.join(a14_Y_primary).astype(float).describe()

Unnamed: 0,a2,a14
count,678.0,666.0
mean,31.568171,182.115616
std,11.957862,171.477919
min,13.75,0.0
25%,22.6025,75.25
50%,28.46,160.0
75%,38.23,271.0
max,80.25,2000.0


### Applying Model
Let's now take this value to predict the missing a2 and a14 values

In [16]:
#Let's predict missing a2/a14 values first
pred_a2_final = model_a2.predict(a2_x_holdout)
pred_a14_final = model_a14.predict(a14_x_holdout)



In [17]:
#Incorporating predictions
a2_y_holdout['a2'] = pred_a2_final
a14_y_holdout['a14'] = pred_a14_final
a2_y_holdout.head()

Unnamed: 0,a2
83,32.481018
86,27.70105
92,41.50758
97,27.668497
254,26.73666


In [18]:
#What does a2 holdout set look like?
a2_x_holdout.join(a2_y_holdout).head(15)

Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s,a2
83,-0.053162,-0.00822,-0.436852,-0.517813,0,1,1,0,0,1,1,0,0,32.481018
86,-0.923717,-0.718802,-0.436852,-0.517813,0,1,1,0,0,1,0,0,1,27.70105
92,0.364704,1.830931,-0.436852,-0.517813,0,1,1,0,1,0,1,0,0,41.50758
97,-0.888894,-0.732177,-0.436852,-0.517813,0,1,1,0,0,1,0,0,1,27.668497
254,-0.854072,-0.927796,-0.436852,0.78387,1,0,1,0,1,0,1,0,0,26.73666
286,-0.610317,-1.011394,1.660038,-0.449815,1,0,0,1,0,1,1,0,0,26.204466
329,0.086127,-0.98297,-0.436852,-0.517813,1,0,1,0,0,1,1,0,0,26.681377
445,2.105814,-1.011394,-0.436852,2.849725,1,0,1,0,1,0,1,0,0,25.632582
450,-0.192451,1.329344,-0.436852,-0.517165,1,0,1,0,1,0,1,0,0,37.017677
500,0.086127,0.660562,2.708483,0.958075,0,1,0,1,0,1,1,0,0,33.883579


In [19]:
#What does a14 holdout set look like?
a14_x_holdout.join(a14_y_holdout).head(15)

Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s,a14
71,-0.048743,3.340367,-0.5,-0.351284,0,1,1,0,0,1,1,0,0,43.431469
202,-0.316334,0.264533,2.5,-0.26709,0,1,0,1,1,0,1,0,0,34.309357
206,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,26.539133
243,0.700513,0.402571,2.0,3.399006,0,1,0,1,1,0,1,0,0,26.705475
270,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,26.539133
278,1.984951,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,1,0,0,31.764843
330,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,26.539133
406,0.834309,-0.361136,0.5,-0.348758,1,0,0,1,1,0,1,0,0,29.815767
445,1.503287,-0.41065,-0.5,0.378399,1,0,1,0,1,0,1,0,0,28.716883
456,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,26.539133
