In [1]:
# Kaggle data: imports and loading (X1/X2 feature files, Y targets)

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
import sklearn.linear_model as lm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb

# Data Files

# Read the five input tables. Each file is parsed with header=None, i.e. the
# first line of every file is treated as data rather than as column names.
df_train_1, df_train_2, df_train_Y, df_test_1, df_test_2 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        './data/kaggle.X1.train.txt',
        './data/kaggle.X2.train.txt',
        './data/kaggle.Y.train.txt',
        './data/kaggle.X1.test.txt',
        './data/kaggle.X2.test.txt',
    )
)

# Put the X1 and X2 columns side by side (column-wise concat) for each split.
df_train_X = pd.concat([df_train_1, df_train_2], axis=1)
df_test_X = pd.concat([df_test_1, df_test_2], axis=1)

# NumPy arrays consumed by the modelling cells.
X_train, y_train, X_test = (
    df_train_X.values,
    df_train_Y.values,
    df_test_X.values,
)

# Optional experiment: train on the X1 features only.
# X_train = df_train_1.values
# X_test = df_test_1.values

# Sanity check: one shape per line, in the order train X, train y, test X.
for array in (X_train, y_train, X_test):
    print(array.shape)


(60000, 532)
(60000, 1)
(40000, 532)


In [None]:
# Train Model
#
# Fits an XGBoost regressor on 80% of the training rows, uses the remaining
# 20% as the early-stopping evaluation set, then predicts the test features.

# Hyper-parameters (kept as module-level names because the scoring cell
# prints them back).
depth = 20
estimator = 300
l_rate = 0.03
sub_sample = 0.94
col_sample = 0.85
seed_val = 4242
alpha_val = 0.7
lambda_val = 1.0
min_child_val = 20

# XGBoost regressor
clf = xgb.XGBRegressor(missing=np.nan,
                       max_depth=depth,
                       n_estimators=estimator,
                       learning_rate=l_rate,
                       nthread=-1,
                       subsample=sub_sample,
                       colsample_bytree=col_sample,
                       seed=seed_val,
                       reg_alpha=alpha_val,
                       reg_lambda=lambda_val,
                       min_child_weight=min_child_val
                       )

# Hold out 20% of the rows for early stopping. The split is seeded with the
# same seed as the model: without random_state the partition (and therefore
# the stopping round and every score computed from X_fit / X_eval) would be
# different on each run.
X_fit, X_eval, y_fit, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=seed_val
)

# fitting: stop once RMSE on the held-out rows has not improved for 20 rounds
clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="rmse", eval_set=[(X_eval, y_eval)])

# predicting
y_pred = clf.predict(X_test)

print("Done!")

In [8]:
# Score

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

def _print_scores(title, y_true, y_hat):
    """Print a section title followed by five regression metrics.

    title  -- heading line printed verbatim before the metrics
    y_true -- reference targets
    y_hat  -- model predictions for the same rows
    """
    print(title)
    for label, metric in (
        ("Explained Variance Score: ", explained_variance_score),
        ("Mean Absolute Error: ", mean_absolute_error),
        ("Mean Squared Error: ", mean_squared_error),
        ("Median Absolute Error: ", median_absolute_error),
        ("R2 Score: ", r2_score),
    ):
        print(label, metric(y_true, y_hat))


# Echo the hyper-parameters used for this run
print("Parameters")
print()
for label, value in (
    ("Depth: ", depth),
    ("estimator: ", estimator),
    ("l_rate: ", l_rate),
    ("sub_sample: ", sub_sample),
    ("col_sample: ", col_sample),
    ("seed: ", seed_val),
    ("Alpha: ", alpha_val),
    ("Lambda: ", lambda_val),
    ("Min Child: ", min_child_val),
):
    print(label, value)
print()
print()


# Whole training set (contains both the fit rows and the held-out rows)
full_train_pred = clf.predict(X_train)
_print_scores(" -- Scores -- Full Train -- ", y_train, full_train_pred)
print()
print()

# Rows the model was actually fitted on
split_train_pred = clf.predict(X_fit)
_print_scores(" -- Scores -- Train Split -- ", y_fit, split_train_pred)
print()
print()

# Held-out rows. NOTE: X_eval / y_eval is the eval_set that clf.fit used for
# early stopping, so this is a validation score rather than a score on data
# the training procedure never saw.
split_test_pred = clf.predict(X_eval)
_print_scores(" -- Scores -- Test Split -- ", y_eval, split_test_pred)
print()



Parameters

Depth:  20
estimator:  300
l_rate:  0.03
sub_sample:  0.94
col_sample:  0.85
seed:  4242
Alpha:  0.7
Lambda:  1.0
Min Child:  20


 -- Scores -- Full Train -- 
Explained Variance Score:  0.888426347944
Mean Absolute Error:  0.131621328481
Mean Squared Error:  0.0776917928183
Median Absolute Error:  0.056496325939
R2 Score:  0.888426219904


 -- Scores -- Train Split -- 
Explained Variance Score:  0.979779140596
Mean Absolute Error:  0.0746439364939
Mean Squared Error:  0.0141191740578
Median Absolute Error:  0.0471304126816
R2 Score:  0.979779131222


 -- Scores -- Test Split -- 
Explained Variance Score:  0.517919776851
Mean Absolute Error:  0.359530896431
Mean Squared Error:  0.33198226786
Median Absolute Error:  0.195869814942
R2 Score:  0.517917791229



In [None]:
# Score on Test File From 9th Place Prediction


In [45]:
# Write To File
import csv

# Write the submission file: a header line, then one "ID,Prediction" row per
# element of y_pred. The context manager guarantees the file is flushed and
# closed even if a write raises part-way through.
with open('predictions_1.csv', 'w') as write_file:
    write_file.write('ID,Prediction\n')
    # IDs are 1-based positions in y_pred.
    for row_id, prediction in enumerate(y_pred, start=1):
        # No space before the newline, so the Prediction field is a bare
        # number (the previous format left a trailing blank in every row).
        write_file.write('{},{}\n'.format(row_id, prediction))

print(y_pred.shape)

print('Completed!')

(40000,)
Completed!
