### Import Libs

In [1]:
#Standards
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chisquare, zscore
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 70)
%matplotlib inline

#Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn import metrics

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])



### Import data and inspect dataframe

In [2]:
df_raw = pd.read_csv('crx.csv')
df_raw.head(100)

Unnamed: 0,a1,a2,s3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16
0,b,30.83,0.000,u,g,w,v,1.250,t,t,1,f,g,00202,0,+
1,a,58.67,4.460,u,g,q,h,3.040,t,t,6,f,g,00043,560,+
2,a,24.50,0.500,u,g,q,h,1.500,t,f,0,f,g,00280,824,+
3,b,27.83,1.540,u,g,w,v,3.750,t,t,5,t,g,00100,3,+
4,b,20.17,5.625,u,g,w,v,1.710,t,f,0,f,s,00120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,a,28.58,3.540,u,g,i,bb,0.500,t,f,0,t,g,00171,0,-
96,b,23.00,0.625,y,p,aa,v,0.125,t,f,0,f,g,00180,1,-
97,b,?,0.500,u,g,c,bb,0.835,t,f,0,t,s,00320,0,-
98,a,22.50,11.000,y,p,q,v,3.000,t,f,0,t,g,00268,0,-


In [3]:
#Getting feel for dataframe
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
a1     690 non-null object
a2     690 non-null object
s3     690 non-null float64
a4     690 non-null object
a5     690 non-null object
a6     690 non-null object
a7     690 non-null object
a8     690 non-null float64
a9     690 non-null object
a10    690 non-null object
a11    690 non-null int64
a12    690 non-null object
a13    690 non-null object
a14    690 non-null object
a15    690 non-null int64
a16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [4]:
#Getting feel for dataframe
df_raw.describe()

Unnamed: 0,s3,a8,a11,a15
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [5]:
#Instructions for module say to only use these columns in training: 's3','a8','a9','a10','a11','a12','a13','a15'
#Modifying the dataframe accordingly
df_raw = df_raw[['s3','a8','a9','a10','a11','a12','a13','a15', 'a2', 'a14']]
df_raw.head()

Unnamed: 0,s3,a8,a9,a10,a11,a12,a13,a15,a2,a14
0,0.0,1.25,t,t,1,f,g,0,30.83,202
1,4.46,3.04,t,t,6,f,g,560,58.67,43
2,0.5,1.5,t,f,0,f,g,824,24.5,280
3,1.54,3.75,t,t,5,t,g,3,27.83,100
4,5.625,1.71,t,f,0,f,s,0,20.17,120


In [6]:
#How many missing a2 and a14 rows are there?
a2_missing = df_raw[df_raw['a2'] == '?'].count()[0]
a14_missing = df_raw[df_raw['a14'] == '?'].count()[0]

print('There are ' + str(a2_missing) + ' a2 values missing and ' + str(a14_missing) + ' a14 values missing.')

There are 12 a2 values missing and 13 a14 values missing.


### Data Cleanup / Formatting
Converting categorical variables into dummies

In [7]:
#Create dataframe to be used in neural networks
df_model = df_raw

# Generate dummies for a9
df_model = pd.concat([df_model,pd.get_dummies(df_model['a9'],prefix="a9")],axis=1)
df_model.drop('a9', axis=1, inplace=True)

# Generate dummies for a10
df_model = pd.concat([df_model,pd.get_dummies(df_model['a10'],prefix="a10")],axis=1)
df_model.drop('a10', axis=1, inplace=True)

# Generate dummies for a12
df_model = pd.concat([df_model,pd.get_dummies(df_model['a12'],prefix="a12")],axis=1)
df_model.drop('a12', axis=1, inplace=True)

# Generate dummies for a13
df_model = pd.concat([df_model,pd.get_dummies(df_model['a13'],prefix="a13")],axis=1)
df_model.drop('a13', axis=1, inplace=True)

df_model.head()

Unnamed: 0,s3,a8,a11,a15,a2,a14,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s
0,0.0,1.25,1,0,30.83,202,0,1,0,1,1,0,1,0,0
1,4.46,3.04,6,560,58.67,43,0,1,0,1,1,0,1,0,0
2,0.5,1.5,0,824,24.5,280,0,1,1,0,1,0,1,0,0
3,1.54,3.75,5,3,27.83,100,0,1,0,1,0,1,1,0,0
4,5.625,1.71,0,0,20.17,120,0,1,1,0,1,0,0,0,1


In [8]:
#Creating my primary and holdout sets
a2_X_primary = pd.DataFrame(df_model[df_model['a2'] != '?'].drop(columns= ['a2', 'a14']))
a2_Y_primary = pd.DataFrame(df_model[df_model['a2'] != '?']['a2'])
a2_x_holdout = pd.DataFrame(df_model[df_model['a2'] == '?'].drop(columns= ['a2', 'a14']))
a2_y_holdout = pd.DataFrame(df_model[df_model['a2'] == '?']['a2'])

a14_X_primary = pd.DataFrame(df_model[df_model['a14'] != '?'].drop(columns= ['a2', 'a14']))
a14_Y_primary = pd.DataFrame(df_model[df_model['a14'] != '?']['a14'])
a14_x_holdout = pd.DataFrame(df_model[df_model['a14'] == '?'].drop(columns= ['a2', 'a14']))
a14_y_holdout = pd.DataFrame(df_model[df_model['a14'] == '?']['a14'])

#Removing question mark values from target holdout dataframes
a2_y_holdout = a2_y_holdout.replace('?', '')
a14_y_holdout = a14_y_holdout.replace('?', '')

Standardizing quantitative features here.

_(Aside: Should look into different ways of standardizing/normalizing and how they compare)_

In [9]:
#Standardize all these variables
a2_X_primary['s3'] = zscore(a2_X_primary['s3'])
a2_X_primary['a8'] = zscore(a2_X_primary['a8'])
a2_X_primary['a11'] = zscore(a2_X_primary['a11'])
a2_X_primary['a15'] = zscore(a2_X_primary['a15'])

#Inspect updated model dataframe
# a2_X_primary.head()

#Standardize all these variables
a2_x_holdout['s3'] = zscore(a2_x_holdout['s3'])
a2_x_holdout['a8'] = zscore(a2_x_holdout['a8'])
a2_x_holdout['a11'] = zscore(a2_x_holdout['a11'])
a2_x_holdout['a15'] = zscore(a2_x_holdout['a15'])

#Inspect updated model dataframe
# a2_x_holdout.head()

#Standardize all these variables
a14_X_primary['s3'] = zscore(a14_X_primary['s3'])
a14_X_primary['a8'] = zscore(a14_X_primary['a8'])
a14_X_primary['a11'] = zscore(a14_X_primary['a11'])
a14_X_primary['a15'] = zscore(a14_X_primary['a15'])

#Inspect updated model dataframe
# a14_X_primary.head()

#Standardize all these variables
a14_x_holdout['s3'] = zscore(a14_x_holdout['s3'])
a14_x_holdout['a8'] = zscore(a14_x_holdout['a8'])
a14_x_holdout['a11'] = zscore(a14_x_holdout['a11'])
a14_x_holdout['a15'] = zscore(a14_x_holdout['a15'])

#Inspect updated model dataframe
a14_x_holdout.head()

Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s
71,-0.048743,3.340367,-0.5,-0.351284,0,1,1,0,0,1,1,0,0
202,-0.316334,0.264533,2.5,-0.26709,0,1,0,1,1,0,1,0,0
206,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0
243,0.700513,0.402571,2.0,3.399006,0,1,0,1,1,0,1,0,0
270,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0


In [10]:
#Train test split
a2_x_train, a2_x_test, a2_y_train, a2_y_test = train_test_split(    
    a2_X_primary, a2_Y_primary, test_size=0.25, random_state=42)

a14_x_train, a14_x_test, a14_y_train, a14_y_test = train_test_split(    
    a14_X_primary, a14_Y_primary, test_size=0.25, random_state=42)

### Building Modeling

We begin with a2 first

In [11]:
# Build neural network: a2
model_a2 = Sequential()
model_a2.add(Dense(25, input_dim=a2_X_primary.shape[1], activation='relu')) # Hidden 1
model_a2.add(Dense(10, activation='relu')) # Hidden 2
model_a2.add(Dense(1)) # Output
model_a2.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto', 
                        restore_best_weights=True)
model_a2.fit(a2_x_train,a2_y_train,validation_data=(a2_x_test,a2_y_test),
          callbacks=[monitor],verbose=2,epochs=1000)

Train on 508 samples, validate on 170 samples
Epoch 1/1000
508/508 - 0s - loss: 1114.6464 - val_loss: 1076.5874
Epoch 2/1000
508/508 - 0s - loss: 1082.4063 - val_loss: 1040.3066
Epoch 3/1000
508/508 - 0s - loss: 1043.0990 - val_loss: 994.3759
Epoch 4/1000
508/508 - 0s - loss: 992.2993 - val_loss: 935.9515
Epoch 5/1000
508/508 - 0s - loss: 927.2479 - val_loss: 862.2510
Epoch 6/1000
508/508 - 0s - loss: 845.3450 - val_loss: 770.3979
Epoch 7/1000
508/508 - 0s - loss: 744.1441 - val_loss: 659.5172
Epoch 8/1000
508/508 - 0s - loss: 625.1078 - val_loss: 536.4623
Epoch 9/1000
508/508 - 0s - loss: 496.6584 - val_loss: 413.0061
Epoch 10/1000
508/508 - 0s - loss: 374.4203 - val_loss: 303.0995
Epoch 11/1000
508/508 - 0s - loss: 269.3188 - val_loss: 223.9401
Epoch 12/1000
508/508 - 0s - loss: 198.9026 - val_loss: 176.3962
Epoch 13/1000
508/508 - 0s - loss: 159.9352 - val_loss: 156.1394
Epoch 14/1000
508/508 - 0s - loss: 143.2192 - val_loss: 149.1034
Epoch 15/1000
508/508 - 0s - loss: 135.8050 - va

<tensorflow.python.keras.callbacks.History at 0x1a383a1748>

In [12]:
# Measure RMSE error for a2 prediction.  RMSE is common for regression.
pred_a2 = model_a2.predict(a2_x_test)
score_a2 = np.sqrt(metrics.mean_squared_error(pred_a2,a2_y_test))
print(f"Final score (RMSE): {score_a2}")

Final score (RMSE): 11.226337556740939


Need to fill in missing values here

We'll now model a14 here.

In [13]:
# Build neural network: a14
model_a14 = Sequential()
model_a14.add(Dense(25, input_dim=a2_X_primary.shape[1], activation='relu')) # Hidden 1
model_a14.add(Dense(10, activation='relu')) # Hidden 2
model_a14.add(Dense(1)) # Output
model_a14.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto', 
                        restore_best_weights=True)
model_a14.fit(a2_x_train,a2_y_train,validation_data=(a2_x_test,a2_y_test),
          callbacks=[monitor],verbose=2,epochs=1000)

Train on 508 samples, validate on 170 samples
Epoch 1/1000
508/508 - 0s - loss: 1167.7754 - val_loss: 1135.4847
Epoch 2/1000
508/508 - 0s - loss: 1147.3871 - val_loss: 1117.2529
Epoch 3/1000
508/508 - 0s - loss: 1130.1058 - val_loss: 1099.2595
Epoch 4/1000
508/508 - 0s - loss: 1111.7541 - val_loss: 1077.6355
Epoch 5/1000
508/508 - 0s - loss: 1088.4955 - val_loss: 1049.8007
Epoch 6/1000
508/508 - 0s - loss: 1057.6020 - val_loss: 1012.4457
Epoch 7/1000
508/508 - 0s - loss: 1016.1465 - val_loss: 963.9630
Epoch 8/1000
508/508 - 0s - loss: 963.0915 - val_loss: 901.9601
Epoch 9/1000
508/508 - 0s - loss: 895.8084 - val_loss: 826.0473
Epoch 10/1000
508/508 - 0s - loss: 814.2906 - val_loss: 735.9754
Epoch 11/1000
508/508 - 0s - loss: 719.8552 - val_loss: 633.7847
Epoch 12/1000
508/508 - 0s - loss: 614.7442 - val_loss: 526.1752
Epoch 13/1000
508/508 - 0s - loss: 505.6020 - val_loss: 419.4463
Epoch 14/1000
508/508 - 0s - loss: 401.1586 - val_loss: 320.1300
Epoch 15/1000
508/508 - 0s - loss: 308.3

<tensorflow.python.keras.callbacks.History at 0x1a38e7af28>

In [14]:
# Measure RMSE error for a14 prediction.  RMSE is common for regression.
pred_a14 = model_a14.predict(a14_x_test)
score_a14 = np.sqrt(metrics.mean_squared_error(pred_a14,a14_y_test))
print(f"Final score (RMSE): {score_a14}")

Final score (RMSE): 274.4149545741255


In [15]:
#How good/bad are 10 and 220 for a2 and a14 RMSE's?
a2_Y_primary.join(a14_Y_primary).astype(float).describe()

Unnamed: 0,a2,a14
count,678.0,666.0
mean,31.568171,182.115616
std,11.957862,171.477919
min,13.75,0.0
25%,22.6025,75.25
50%,28.46,160.0
75%,38.23,271.0
max,80.25,2000.0


### Applying Model
Let's now take this value to predict the missing a2 and a14 values

In [16]:
#Let's predict missing a2/a14 values first
pred_a2_final = model_a2.predict(a2_x_holdout)
pred_a14_final = model_a14.predict(a14_x_holdout)

In [17]:
#Incorporating predictions
a2_y_holdout['a2'] = pred_a2_final
a14_y_holdout['a14'] = pred_a14_final
a2_y_holdout.head()

Unnamed: 0,a2
83,32.917084
86,29.065464
92,40.548641
97,28.973915
254,28.080544


In [18]:
#What does a2 holdout set look like?
a2_x_holdout.join(a2_y_holdout).head(15)

Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s,a2
83,-0.053162,-0.00822,-0.436852,-0.517813,0,1,1,0,0,1,1,0,0,32.917084
86,-0.923717,-0.718802,-0.436852,-0.517813,0,1,1,0,0,1,0,0,1,29.065464
92,0.364704,1.830931,-0.436852,-0.517813,0,1,1,0,1,0,1,0,0,40.548641
97,-0.888894,-0.732177,-0.436852,-0.517813,0,1,1,0,0,1,0,0,1,28.973915
254,-0.854072,-0.927796,-0.436852,0.78387,1,0,1,0,1,0,1,0,0,28.080544
286,-0.610317,-1.011394,1.660038,-0.449815,1,0,0,1,0,1,1,0,0,25.188103
329,0.086127,-0.98297,-0.436852,-0.517813,1,0,1,0,0,1,1,0,0,27.21743
445,2.105814,-1.011394,-0.436852,2.849725,1,0,1,0,1,0,1,0,0,31.81731
450,-0.192451,1.329344,-0.436852,-0.517165,1,0,1,0,1,0,1,0,0,35.391869
500,0.086127,0.660562,2.708483,0.958075,0,1,0,1,0,1,1,0,0,35.29208


In [19]:
#What does a14 holdout set look like?
a14_x_holdout.join(a14_y_holdout).head(15)

Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s,a14
71,-0.048743,3.340367,-0.5,-0.351284,0,1,1,0,0,1,1,0,0,45.510384
202,-0.316334,0.264533,2.5,-0.26709,0,1,0,1,1,0,1,0,0,32.980476
206,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,27.943163
243,0.700513,0.402571,2.0,3.399006,0,1,0,1,1,0,1,0,0,30.249559
270,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,27.943163
278,1.984951,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,1,0,0,31.867796
330,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,27.943163
406,0.834309,-0.361136,0.5,-0.348758,1,0,0,1,1,0,1,0,0,31.025723
445,1.503287,-0.41065,-0.5,0.378399,1,0,1,0,1,0,1,0,0,30.662334
456,-0.905035,-0.41065,-0.5,-0.351284,1,0,1,0,1,0,0,1,0,27.943163


### Appendix 

Chi square distribution is appropriate here for standardization. The level of measure of all the variables is nominal and the distribution of the data was seriously skewed for each variable (see [Introduction](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3900058/) of this abstract). 

Because a2 and a14 are the response variables that we're trying to fill in missing values for, this is probably a good time to begin partitioning our data into training and test data sets. I'm using the learnings from [this article](https://towardsdatascience.com/handling-missing-values-in-machine-learning-part-2-222154b4b58e) to define my train/test splits here.