In [1]:
# Kaggle Data: imports and data loading

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
import sklearn.linear_model as lm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb

# Data Files

# Read the five header-less CSV files, in this order, into their own DataFrames.
(df_train_1, df_train_2, df_train_Y, df_test_1, df_test_2) = [
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        './data/kaggle.X1.train.txt',
        './data/kaggle.X2.train.txt',
        './data/kaggle.Y.train.txt',
        './data/kaggle.X1.test.txt',
        './data/kaggle.X2.test.txt',
    )
]

# Put the X1 and X2 columns side by side (axis=1) for train and for test.
df_train_X = pd.concat([df_train_1, df_train_2], axis=1)
df_test_X = pd.concat([df_test_1, df_test_2], axis=1)

# Pull the underlying NumPy arrays out of the frames.
X_train, y_train, X_test = (
    frame.values for frame in (df_train_X, df_train_Y, df_test_X)
)

# Alternative kept for reference: use only the X1 features.
# X_train = df_train_1.values
# X_test = df_test_1.values

# One shape per line: train features, train targets, test features.
print(X_train.shape, y_train.shape, X_test.shape, sep='\n')


(60000, 532)
(60000, 1)
(40000, 532)


In [None]:
# Train Model

# Hyper-parameters passed to the XGBoost regressor below.
depth = 15
estimator = 150
l_rate = 0.03
sub_sample = 0.94
col_sample = 0.85
seed_val = 4242
alpha_val = 1.0
lambda_val = 1.0
min_child_val = 15

# Split For Validation
# 10% of the training rows are held out as X_val / y_val; they are not used
# anywhere else in this cell. random_state pins the shuffle so the same rows
# are held out on every run (previously the split changed on each execution).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=seed_val
)

# Split for Train Test
# 20% of the remaining rows become the early-stopping evaluation set.
X_fit, X_eval, y_fit, y_eval = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=seed_val
)

# XGB regression model
clf = xgb.XGBRegressor(missing=np.nan,
                       max_depth=depth,
                       n_estimators=estimator,
                       learning_rate=l_rate,
                       nthread=8,
                       subsample=sub_sample,
                       colsample_bytree=col_sample,
                       seed=seed_val,
                       reg_alpha=alpha_val,
                       reg_lambda=lambda_val,
                       min_child_weight=min_child_val
                       )

# fitting
# RMSE on (X_eval, y_eval) is reported every round; training stops early when
# it has not improved for 20 consecutive rounds.
clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="rmse", eval_set=[(X_eval, y_eval)])

# predicting
# Predictions for the unlabeled test file.
y_pred = clf.predict(X_test)

print("Done!")

Will train until validation_0 error hasn't decreased in 20 rounds.
[0]	validation_0-rmse:0.833439
[1]	validation_0-rmse:0.822915
[2]	validation_0-rmse:0.812914
[3]	validation_0-rmse:0.802847
[4]	validation_0-rmse:0.793822
[5]	validation_0-rmse:0.784951
[6]	validation_0-rmse:0.776040
[7]	validation_0-rmse:0.767773
[8]	validation_0-rmse:0.759897
[9]	validation_0-rmse:0.752396
[10]	validation_0-rmse:0.745540
[11]	validation_0-rmse:0.738729
[12]	validation_0-rmse:0.732228
[13]	validation_0-rmse:0.726133
[14]	validation_0-rmse:0.720053
[15]	validation_0-rmse:0.714523
[16]	validation_0-rmse:0.709065
[17]	validation_0-rmse:0.703969
[18]	validation_0-rmse:0.698853
[19]	validation_0-rmse:0.694282
[20]	validation_0-rmse:0.689765
[21]	validation_0-rmse:0.685284
[22]	validation_0-rmse:0.681479
[23]	validation_0-rmse:0.677577
[24]	validation_0-rmse:0.673950
[25]	validation_0-rmse:0.670418
[26]	validation_0-rmse:0.666822
[27]	validation_0-rmse:0.663403
[28]	validation_0-rmse:0.660131
[29]	validation

In [10]:
# Score

from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)


def _print_scores(title, y_true, y_hat, blank_lines=1):
    """Print the five regression metrics for one data split.

    Parameters
    ----------
    title : str
        Header line printed before the metrics.
    y_true : array-like
        Ground-truth targets for the split.
    y_hat : array-like
        Model predictions for the same rows.
    blank_lines : int, optional
        Number of empty lines printed after the metrics (default 1).
    """
    print(title)
    print("Explained Variance Score: ", explained_variance_score(y_true, y_hat))
    print("Mean Absolute Error: ", mean_absolute_error(y_true, y_hat))
    print("Mean Squared Error: ", mean_squared_error(y_true, y_hat))
    print("Median Absolute Error: ", median_absolute_error(y_true, y_hat))
    print("R2 Score: ", r2_score(y_true, y_hat))
    for _ in range(blank_lines):
        print()


# Print Parameters
print("Parameters")
print("X_tr, X_val, y_tr, y_val= train_test_split(X_train, y_train, test_size=0.1")
print()
print("Depth: ", depth)
print("estimator: ", estimator)
print("l_rate: ", l_rate)
print("sub_sample: ", sub_sample)
print("col_sample: ", col_sample)
print("seed: ", seed_val)
print("Alpha: ", alpha_val)
print("Lambda: ", lambda_val)
print("Min Child: ", min_child_val)
print()
print()

# Score On Full Train Set
# X_train contains the fitted rows as well as the held-out rows, so this
# score mixes seen and unseen data.
full_train_pred = clf.predict(X_train)
_print_scores(" -- Scores -- Full Train -- ", y_train, full_train_pred, blank_lines=2)

# Score On Train Split (the rows the model was fitted on)
split_train_pred = clf.predict(X_fit)
_print_scores(" -- Scores -- Train Split -- ", y_fit, split_train_pred, blank_lines=2)

# Score On Test Split (the rows used as the early-stopping eval set)
split_test_pred = clf.predict(X_eval)
_print_scores(" -- Scores -- Test Split -- ", y_eval, split_test_pred)

# Score On Validation Data (rows never passed to fit)
val_pred = clf.predict(X_val)
_print_scores(" -- Scores -- Validation -- ", y_val, val_pred)



Parameters
X_tr, X_val, y_tr, y_val= train_test_split(X_train, y_train, test_size=0.1

Depth:  15
estimator:  150
l_rate:  0.03
sub_sample:  0.94
col_sample:  0.85
seed:  4242
Alpha:  1.0
Lambda:  1.0
Min Child:  20


 -- Scores -- Full Train -- 
Explained Variance Score:  0.801031481762
Mean Absolute Error:  0.215141688128
Mean Squared Error:  0.138554203811
Median Absolute Error:  0.11531512912
R2 Score:  0.801021244244


 -- Scores -- Train Split -- 
Explained Variance Score:  0.91575278827
Mean Absolute Error:  0.155976291801
Mean Squared Error:  0.0581680960815
Median Absolute Error:  0.0985507144112
R2 Score:  0.915742899699


 -- Scores -- Test Split -- 
Explained Variance Score:  0.512851195585
Mean Absolute Error:  0.365787057814
Mean Squared Error:  0.344478395748
Median Absolute Error:  0.203173100564
R2 Score:  0.512846975055

 -- Scores -- Validation -- 
Explained Variance Score:  0.518281667462
Mean Absolute Error:  0.369970876249
Mean Squared Error:  0.346670633978
Media

In [None]:
# Score on Test File From 9th Place Prediction


In [45]:
# Write To File
import csv  # NOTE(review): not used in this cell; kept in case other cells rely on it

# Write the test-set predictions as an "ID,Prediction" CSV.
# The context manager closes the file even if one of the writes raises.
with open('predictions_1.csv', 'w') as write_file:
    write_file.write('ID,Prediction\n')
    # IDs are 1-based row numbers, in the order the predictions appear in y_pred.
    for row_id, prediction in enumerate(y_pred, start=1):
        write_file.write('{},{} \n'.format(row_id, prediction))

print(y_pred.shape)

print('Completed!')

(40000,)
Completed!
