In [61]:



%matplotlib inline
import csv
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.preprocessing import (MinMaxScaler, RobustScaler)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.optimizers import Adam

from fancyimpute import KNN,  SoftImpute, IterativeImputer, BiScaler, MatrixFactorization
import util.data
# Shared, explicitly seeded generator handed to estimators that accept a random_state.
rng = np.random.RandomState(seed=42)


In [16]:
# Unpack the training and test frames returned by the project's loader.
# (The loader also appears to print the short summary shown in this cell's output.)
train_data, test_data = util.data.load()

Training Data: 
  Amount of features: 888
  Amount of observations: 1212
  Min age: 42.0 Max age: 96.0

Test Data: 
  Amount of observations: 776


# Impute the missing values (NaNs) with several different methods

In [17]:
# Training feature matrix without the identifier and target columns; gaps are still NaN.
X_incomplete = train_data.drop(["id", "y"], axis=1).values

# Number of neighbouring rows the KNN imputer consults.
k = 12

# Imputed copies of the training matrix, keyed by imputation method.
# The statement order is kept as-is because the imputers log progress and may draw random numbers.
X = {}
X["KNN"] = KNN(k=k).fit_transform(X_incomplete)

# SoftImpute is fitted on the BiScaler-normalised matrix rather than on the raw one.
X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)
X["SoftImpute"] = SoftImpute().fit_transform(X_incomplete_normalized)

X["MatrixFactorization"] = MatrixFactorization(
    learning_rate=0.001,
    rank=50,
).fit_transform(X_incomplete)

# Column-mean imputation; the training means are kept because the test set is filled with them too.
train_mean_values = train_data.mean()
train_data_mean = train_data.fillna(train_mean_values)
X["Mean"] = train_data_mean.drop(["id", "y"], axis=1).values






Imputing row 1/1212 with 66 missing, elapsed time: 7.865
Imputing row 101/1212 with 60 missing, elapsed time: 7.944
Imputing row 201/1212 with 68 missing, elapsed time: 8.027
Imputing row 301/1212 with 68 missing, elapsed time: 8.117
Imputing row 401/1212 with 63 missing, elapsed time: 8.201
Imputing row 501/1212 with 66 missing, elapsed time: 8.279
Imputing row 601/1212 with 77 missing, elapsed time: 8.364
Imputing row 701/1212 with 65 missing, elapsed time: 8.445
Imputing row 801/1212 with 61 missing, elapsed time: 8.529
Imputing row 901/1212 with 73 missing, elapsed time: 8.620
Imputing row 1001/1212 with 58 missing, elapsed time: 8.702
Imputing row 1101/1212 with 73 missing, elapsed time: 8.784
Imputing row 1201/1212 with 53 missing, elapsed time: 8.871
[BiScaler] Initial log residual value = 81.892772
[BiScaler] Iter 1: log residual = 1.651141, log improvement ratio=80.241632
[BiScaler] Iter 2: log residual = 0.995476, log improvement ratio=0.655665
[BiScaler] Iter 3: log residual

Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoch 68/10000
Epoch 69/10000
Epoch 70/10000
Epoch 71/10000
Epoch 72/10000
Epoch 73/10000
Epoch 74/10000
Epoch 75/10000
Epoch 76/10000
Epoch 77/10000
Epoch 78/10000
Epoch 79/10000
Epoch 80/10000
Epoch 81/10000
Epoch 82/10000
Epoch 83/10000
Epoch 84/10000
Epoch 85/10000
Epoch 86/10000
Epoch 87/10000
Epoch 88/10000
Epoch 89/10000
Epoch 90/10000
Epoch 91/10000
Epoch 92/10000
Epoch 93/10000
Epoch 94/10000
Epoch 95/10000
Epoch 96/10000
Epoch 97/1

Epoch 106/10000
Epoch 107/10000
Epoch 108/10000
Epoch 109/10000
Epoch 110/10000
Epoch 111/10000
Epoch 112/10000
Epoch 113/10000
Epoch 114/10000
Epoch 115/10000
Epoch 116/10000
Epoch 117/10000
Epoch 118/10000
Epoch 119/10000
Epoch 120/10000
Epoch 121/10000
Epoch 122/10000
Epoch 123/10000
Epoch 124/10000
Epoch 125/10000
Epoch 126/10000
Epoch 127/10000
Epoch 128/10000
Epoch 129/10000
Epoch 130/10000
Epoch 131/10000
Epoch 132/10000
Epoch 133/10000
Epoch 134/10000
Epoch 135/10000
Epoch 136/10000
Epoch 137/10000
Epoch 138/10000
Epoch 139/10000
Epoch 140/10000
Epoch 141/10000
Epoch 142/10000
Epoch 143/10000
Epoch 144/10000
Epoch 145/10000
Epoch 146/10000
Epoch 147/10000
Epoch 148/10000
Epoch 149/10000
Epoch 150/10000
Epoch 151/10000
Epoch 152/10000
Epoch 153/10000
Epoch 154/10000
Epoch 155/10000
Epoch 156/10000
Epoch 157/10000
Epoch 158/10000
Epoch 159/10000
Epoch 160/10000
Epoch 161/10000
Epoch 162/10000
Epoch 163/10000
Epoch 164/10000
Epoch 165/10000


In [37]:
# All imputed matrices start out sharing the same target vector;
# the outlier-removal cell later rebinds each entry to its own filtered copy.
y = train_data["y"].values
Y = {
    imputer: y
    for imputer in ("KNN", "SoftImpute", "MatrixFactorization", "Mean")
}


# Remove the outliers

In [26]:
# Fit the isolation forest on each imputed matrix and record its per-row labels
# (the removal cell keeps the rows whose label is positive).
isoForest = IsolationForest(behaviour='new', max_samples=100, random_state=rng, contamination='auto')
outliers = {key: isoForest.fit_predict(X[key]) for key in X}
print(outliers)


{'KNN': array([ 1, -1,  1, ...,  1,  1,  1]), 'SoftImpute': array([1, 1, 1, ..., 1, 1, 1]), 'MatrixFactorization': array([1, 1, 1, ..., 1, 1, 1]), 'Mean': array([1, 1, 1, ..., 1, 1, 1])}


In [38]:
# Drop every row the isolation forest did not label positive, in features and targets alike.
for key in X:
    keep_rows = outliers[key] > 0
    X[key] = X[key][keep_rows]
    Y[key] = Y[key][keep_rows]
    

# Scale the data 

In [41]:
# Robust-scale each imputed training matrix with a scaler fitted on that same matrix.
for key, features in X.items():
    transformer = RobustScaler()
    transformer.fit(features)
    X[key] = transformer.transform(features)

# Compute the feature ranking for each imputer

In [43]:
# One (initially empty) {method name -> feature scores} mapping per imputed dataset.
ranks = {key: {} for key in X}

def ranking(ranks, names, order=1):
    """Min-max scale a vector of raw feature scores and key it by feature name.

    Parameters
    ----------
    ranks : sequence of float
        One raw score per feature (coefficients, importances, RFE ranks, ...).
    names : iterable
        Feature names, in the same order as ``ranks``.
    order : int, default 1
        Multiplier applied before scaling; pass ``-1`` when a smaller raw value
        should end up with the larger scaled score.

    Returns
    -------
    dict
        ``{name: scaled score rounded to two decimals}``.
    """
    # MinMaxScaler works column-wise, so present the scores as a single column.
    score_column = order * np.array([ranks]).T
    scaled_scores = MinMaxScaler().fit_transform(score_column).T[0]
    rounded_scores = (round(score, 2) for score in scaled_scores)
    return dict(zip(names, rounded_scores))

In [45]:
# Feature names, in the column order of every imputed matrix.
colnames = train_data.drop(["id", "y"], axis=1).columns

# Estimators whose scores, coefficients or importances are used as relevance signals.
rlasso = RandomizedLasso(alpha=0.04)
lr = LinearRegression(normalize=True)
ridge = Ridge(alpha=7)
lasso = Lasso(alpha=.05)
rf = RandomForestRegressor(n_jobs=-1, n_estimators=50)

for key in X:
    features, targets = X[key], Y[key]
    method_ranks = ranks[key]
    print("Computing for {}".format(key))

    # Stability-selection scores from the randomized lasso.
    rlasso.fit(features, targets)
    method_ranks["rlasso/Stability"] = ranking(np.abs(rlasso.scores_), colnames)
    print("rLasso done")

    # Absolute coefficients of a plain linear regression.
    lr.fit(features, targets)
    method_ranks["LinReg"] = ranking(np.abs(lr.coef_), colnames)
    print("LinReg done")

    # Recursive feature elimination down to a single feature; rank 1 is best,
    # hence order=-1 so that it receives the highest scaled score.
    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(features, targets)
    method_ranks["RFE"] = ranking(rfe.ranking_.astype(float), colnames, order=-1)
    print("RFE done")

    ridge.fit(features, targets)
    method_ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)
    print("Ridge done")

    lasso.fit(features, targets)
    method_ranks["Lasso"] = ranking(np.abs(lasso.coef_), colnames)
    print("Lasso done")

    rf.fit(features, targets)
    method_ranks["RF"] = ranking(rf.feature_importances_, colnames)
    print("RF done")

print("Done")



Computing for KNN




Computing for SoftImpute
Computing for MatrixFactorization




Computing for Mean




Done


In [46]:
# Average the per-method scores of every feature, per imputer, and keep the result
# both inside `ranks` (under "Mean") and as a sorted DataFrame in `meanplot`.
r = {}
meanplot = {}
for key in X:
    # Ignore a "Mean" entry left behind by an earlier execution of this cell;
    # otherwise re-running the cell would average the previous average in with
    # the real methods and silently change the ranking.
    scoring_methods = [method for method in ranks[key] if method != "Mean"]

    r[key] = {}
    for name in colnames:
        r[key][name] = round(np.mean([ranks[key][method][name] for method in scoring_methods]), 2)

    # Alphabetical list of method names with the aggregate appended last.
    methods = sorted(scoring_methods)
    ranks[key]["Mean"] = r[key]
    methods.append("Mean")

    # Put the mean scores into a DataFrame, best-ranked feature first.
    meanplot[key] = pd.DataFrame(list(r[key].items()), columns=['Feature', 'Mean Ranking'])
    meanplot[key] = meanplot[key].sort_values('Mean Ranking', ascending=False)

In [60]:
# Keep only the top-ranked features of each imputer and export the reduced training frames.
n_features_to_use = 50
combined_feature_index = set()   # union of the features selected for any imputer
X_reduced_dim = {}
for key, ranking_frame in meanplot.items():
    # meanplot frames are sorted best-first, so head() yields the top features.
    feature_list = ranking_frame.head(n_features_to_use)["Feature"].tolist()
    combined_feature_index |= set(feature_list)

    reduced = pd.DataFrame(X[key], columns=colnames)[feature_list]
    reduced["y"] = Y[key]
    X_reduced_dim[key] = reduced
    reduced.to_csv("preprocessed/{}.csv".format(key), index=False)

# NOTE(review): the random-forest cell further down reads `train_subset`, which was once
# built in this loop as train_data_mean[feature_list + ["id", "y"]] and is no longer defined.
    


In [62]:
# Build the regression network: three dense layers with LeakyReLU activations
# followed by a single sigmoid output unit.
model = Sequential()
# `kernel_initializer` is the Keras 2 name of the legacy `init` argument, and the
# input width follows n_features_to_use instead of a second hard-coded 50.
model.add(Dense(n_features_to_use, input_dim=n_features_to_use, kernel_initializer='RandomUniform'))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(50, kernel_initializer='RandomUniform'))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(100, kernel_initializer='RandomUniform'))
model.add(LeakyReLU(alpha=0.1))
# The sigmoid output lies in (0, 1); the training cell divides the targets by 100 to match.
model.add(Dense(1, kernel_initializer='RandomUniform', activation='sigmoid'))

# Compile with a very small Adam learning rate and mean absolute error as the loss.
optimizer = Adam(lr=0.000005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model.compile(loss='mean_absolute_error', optimizer=optimizer)



  This is separate from the ipykernel package so we can avoid doing imports until
  """
  import sys
  if __name__ == '__main__':


# Load the test data and apply the same preprocessing steps

In [64]:
# Test feature matrix without the identifier column; gaps are still NaN.
X_test_incomplete = test_data.drop(["id"], axis=1).values
X_test_reduced_dim = {}

# Number of neighbouring rows the KNN imputer consults.
k = 12

# Same three fitted imputers as for the training data, this time fitted on the test matrix.
# Dict entries are evaluated top to bottom, so the imputers run in the original order.
X_test = {
    "KNN": KNN(k=k).fit_transform(X_test_incomplete),
    "SoftImpute": SoftImpute().fit_transform(BiScaler().fit_transform(X_test_incomplete)),
    "MatrixFactorization": MatrixFactorization(learning_rate=0.001, rank=50).fit_transform(X_test_incomplete),
}

# Mean imputation reuses the column means computed on the TRAINING data.
test_data_mean = test_data.fillna(train_mean_values)
X_test["Mean"] = test_data_mean.drop(["id"], axis=1).values


Imputing row 1/776 with 63 missing, elapsed time: 3.032
Imputing row 101/776 with 54 missing, elapsed time: 3.100
Imputing row 201/776 with 63 missing, elapsed time: 3.167
Imputing row 301/776 with 54 missing, elapsed time: 3.241
Imputing row 401/776 with 65 missing, elapsed time: 3.326
Imputing row 501/776 with 53 missing, elapsed time: 3.406
Imputing row 601/776 with 71 missing, elapsed time: 3.491
Imputing row 701/776 with 70 missing, elapsed time: 3.559
[BiScaler] Initial log residual value = 81.909061
[BiScaler] Iter 1: log residual = 0.999783, log improvement ratio=80.909279
[BiScaler] Iter 2: log residual = -0.076421, log improvement ratio=1.076204
[BiScaler] Iter 3: log residual = -1.423854, log improvement ratio=1.347433
[BiScaler] Iter 4: log residual = -2.871885, log improvement ratio=1.448031
[BiScaler] Iter 5: log residual = -4.353028, log improvement ratio=1.481143
[BiScaler] Iter 6: log residual = -5.845094, log improvement ratio=1.492065
[BiScaler] Iter 7: log residual 

Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoch 68/10000
Epoch 69/10000
Epoch 70/10000
Epoch 71/10000
Epoch 72/10000
Epoch 73/10000
Epoch 74/10000
Epoch 75/10000
Epoch 76/10000
Epoch 77/10000
Epoch 78/10000
Epoch 79/10000
Epoch 80/10000
Epoch 81/10000
Epoch 82/10000
Epoch 83/10000
Epoch 84/10000
Epoch 85/10000
Epoch 86/10000
Epoch 87/10000
Epoch 88/10000
Epoch 89/10000
Epoch 90/10000
Epoch 91/10000
Epoch 92/10000
Epoch 93/10000
Epoch 94/10000
Epoch 95/10000
Epoch 96/10000
Epoch 97/10000
Epoch 98/10000
Epoch 99/1

Epoch 108/10000
Epoch 109/10000
Epoch 110/10000
Epoch 111/10000
Epoch 112/10000
Epoch 113/10000
Epoch 114/10000
Epoch 115/10000
Epoch 116/10000
Epoch 117/10000
Epoch 118/10000
Epoch 119/10000
Epoch 120/10000
Epoch 121/10000
Epoch 122/10000
Epoch 123/10000
Epoch 124/10000
Epoch 125/10000
Epoch 126/10000
Epoch 127/10000
Epoch 128/10000
Epoch 129/10000
Epoch 130/10000
Epoch 131/10000
Epoch 132/10000
Epoch 133/10000
Epoch 134/10000
Epoch 135/10000
Epoch 136/10000
Epoch 137/10000
Epoch 138/10000
Epoch 139/10000
Epoch 140/10000
Epoch 141/10000
Epoch 142/10000
Epoch 143/10000
Epoch 144/10000
Epoch 145/10000
Epoch 146/10000


In [69]:
# Robust-scale each imputed test matrix.
# NOTE(review): the scaler is fitted on the test matrix itself, while the training
# matrices were scaled with scalers fitted on training data; those scalers were not
# kept, so train and test may end up on different scales. Confirm this is intended.
for key, features in X_test.items():
    transformer = RobustScaler()
    transformer.fit(features)
    X_test[key] = transformer.transform(features)

In [71]:
# Select the important features for each imputed test matrix.
for key in X_test:
    # Use exactly the columns this imputer's training frame was built with.
    # The `feature_list` variable left over from the selection loop only holds the
    # LAST imputer's features, so reusing it here would feed the model columns that
    # differ from the ones it was trained on for every other imputer.
    selected_features = X_reduced_dim[key].columns.drop("y")
    X_test_reduced_dim[key] = pd.DataFrame(X_test[key], columns=colnames)[selected_features]

In [72]:
# Train the network once per imputation method and predict the matching test set.
predictions = {}

# Weights as they are when this cell starts. Every imputer is trained from this same
# starting point: the selected feature columns differ between imputers, so carrying
# weights over from one imputer to the next would mix incompatible inputs.
# NOTE(review): set_weights() restores layer weights only; the optimizer's internal
# state presumably still carries over between fits. Re-run the model-definition cell
# first if a completely fresh start is required.
initial_weights = model.get_weights()

for key in X_reduced_dim:
    model.set_weights(initial_weights)
    train_frame = X_reduced_dim[key]

    # Targets are divided by 100 to fit the sigmoid output range.
    # `.values` replaces the deprecated `.as_matrix()`.
    model.fit(x=train_frame.drop(["y"], axis=1).values,
              y=train_frame["y"].values / 100.0, epochs=150,
              verbose=1,
              validation_split=0.1,
              shuffle=True,
              steps_per_epoch=100, initial_epoch=0, validation_steps=10)

    # Scale the predictions back to the original target range.
    predictions[key] = model.predict(X_test_reduced_dim[key].values) * 100.0
    
     

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 1088 samples, validate on 121 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150


Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150
Train on 1090 samples, validate on 122 

Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150

Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150
Train on 1087 samples, validate on 121 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
E

Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150

Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150
Train on 1084 samples, validate on 121 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
E

Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 1

Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


# Save predictions

In [89]:
# Write one submission file per imputation method.
for key in predictions:
    submission = pd.DataFrame(data={
        # Take the ids from the test set itself rather than a hard-coded range(776),
        # so the file stays correct if the test set's size or id numbering changes.
        "id": test_data["id"].values,
        # Flatten the (n_samples, 1) network output into a 1-D column.
        "y": np.ravel(predictions[key]),
    })
    submission.to_csv("submissions/{}_NN.csv".format(key), index=False)

In [73]:
# `train_subset` was never defined in the notebook (it only survived in commented-out
# code), so this cell failed with a NameError on a fresh kernel. Rebuild it explicitly:
# the mean-imputed training frame restricted to the top features of the "Mean" ranking,
# plus the id and target columns.
rf_feature_list = list(meanplot["Mean"].head(n_features_to_use)["Feature"].values)
train_subset = train_data_mean[rf_feature_list + ["id", "y"]]

# `.values` replaces the deprecated `.as_matrix()`.
X_subset = train_subset.drop(["id", "y"], axis=1).values
Y_subset = train_subset["y"].values
rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
rf.fit(X_subset, Y_subset)

  """Entry point for launching an IPython kernel.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50building tree 44 of 5

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=None, verbose=3, warm_start=False)

In [86]:
# Mean-impute the test set with the TRAINING column means.
# The original cell read the file through an undefined TEST_FILE_PATH and then
# overwrote the result from an undefined name `target`; the already loaded
# `test_data` frame is used instead (fillna returns a new frame, so `test_data`
# itself is not modified by the assignment of "y" below).
test_set = test_data.fillna(train_mean_values)

# Top features of the "Mean" ranking, spelled out instead of relying on the
# `feature_list` variable leaked from the selection loop.
rf_test_features = list(meanplot["Mean"].head(n_features_to_use)["Feature"].values)
test_set_sub = test_set[rf_test_features]

# `.values` replaces the deprecated `.as_matrix()`.
y_pred = rf.predict(test_set_sub.values)
test_set["y"] = y_pred



  
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


In [88]:
# Export the id / prediction pairs of the random-forest run as a submission file.
test_set.to_csv("submissions/first.csv", columns=["id", "y"], index=False)