In [78]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer

# Import Data

In [79]:
xtrain = pd.read_csv("X_train.csv")
xtest = pd.read_csv("X_test.csv")
ytrain = pd.read_csv("y_train.csv")
sample = pd.read_csv("sample_submission.csv")

# Helper Functions

In [80]:
def createsub(clf, filename='submission.txt'):
    f = open(filename, 'w')
    f.write("id,actual_wait div 60000\n")
    actual_wait = clf.predict(xtest)
    for i in range(len(xtest)):
        if actual_wait[i] >= 0:
            f.write(str(i) + "," + str(actual_wait[i]) + str('\n'))
        else:
            f.write(str(i) + "," + str(0) + str('\n'))
    f.close()

# Preprocessing

In [81]:
selector = VarianceThreshold()
selector.fit(xtrain)
xtrain = selector.transform(xtrain)
xtest = selector.transform(xtest)

In [84]:
sum = 0
for i in range(len(xtrain)):
    sum += xtrain[i][12]
    
for k in range(len(xtrain)):
    xtrain[i][12] /= sum

In [83]:
imputer = Imputer(missing_values=-1, strategy='median', axis=0)
xtrain = imputer.fit_transform(xtrain)
xtest = imputer.fit_transform(xtest)



In [82]:
for i in range(len(xtrain)):
    xtrain[i][18] /= 60
    xtrain[i][8] /= 60
    
for k in range(len(xtest)):
    xtest[k][8] /= 60
    xtest[k][18] /= 60

In [85]:
sum = 0
for i in range(len(xtest)):
    sum += xtest[i][12]
    
for k in range(len(xtest)):
    xtest[i][12] /= sum

# Gradient Boosting

In [86]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.001, 'loss': 'ls', 'max_features':'sqrt'}


    
clf = ensemble.GradientBoostingRegressor(n_estimators=100, max_depth=2,
                                        min_samples_split=2, loss='ls')
clf.fit(xtrain, ytrain.values.ravel())
mse = mean_squared_error(ytrain, clf.predict(xtrain))
print("MSE: %.4f" % mse)
createsub(clf)

MSE: 1844.4873


# Make Submission

In [9]:
createsub(clf)

# Random Grid

In [4]:
def GradientBooster(param_grid, n_jobs):
    estimator = ensemble.GradientBoostingRegressor()
    cv = ShuffleSplit(xtrain.shape[0], test_size=0.2)
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=param_grid, n_jobs=n_jobs)
    classifier.fit(xtrain, ytrain)
    print("Best Estimator learned through GridSearch") 
    print(classifier.best_estimator_)
    return cv, classifier.best_estimator_

In [76]:
param_grid={'n_estimators':[90, 100, 110],
            'min_samples_split':[2],
            #'min_samples_leaf':[50, 60, 70],
            'max_depth':[2,3,4],
            #'max_features':['sqrt'],
            #'subsample':[0.8],
            'learning_rate':[0.1, 0.01],
            'loss':['ls'],}
n_jobs=3
cv,best_est=GradientBooster(param_grid,n_jobs)

NameError: name 'GradientBooster' is not defined

# Random Forest

In [7]:
rf = ensemble.RandomForestRegressor(n_estimators=500,
                                   max_depth=30,
                                   random_state=2)
rf.fit(xtrain,ytrain.values.ravel())
mse = mean_squared_error(ytrain, rf.predict(xtrain))
print("MSE: %.4f" % mse)

MSE: 159.3721


In [8]:
createsub(rf)

# Elastic Net

In [14]:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNetCV

In [10]:
regr = ElasticNet(random_state=0)
regr.fit(xtrain, ytrain)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

In [11]:
mse = mean_squared_error(ytrain, regr.predict(xtrain))
print("MSE: %.4f" % mse)

MSE: 3420.7921


In [12]:
createsub(regr)

In [16]:
regr2 = ElasticNetCV(cv=5, random_state=0)
regr2.fit(xtrain, ytrain.values.ravel())
mse = mean_squared_error(ytrain, regr.predict(xtrain))
print("MSE: %.4f" % mse)

MSE: 3420.7921
