In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

Import dataset

In [2]:
#df3 = pd.read_csv('df3_encoded.csv', low_memory=False)
df3 = pd.read_csv('df_sample.csv', low_memory=False)
#df3 = pd.read_csv('df_binary.csv', low_memory=False)
#df3 = pd.read_csv('df3_arrival.csv', low_memory=False)

In [3]:
df3 = df3.drop(['payer', 'Unnamed: 0'], axis='columns')

In [4]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 6738 entries, age to race_0
dtypes: float64(24), int64(6714)
memory usage: 514.1 MB


In [5]:
binaries = {}
for c in [*df3.columns]:
    if df3[c].min() > -127 and df3[c].max() < 128:
        if df3[c].dtype == 'int64':
            binaries[c] = np.int8

In [6]:
df3 = df3.astype(binaries)

In [7]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 6738 entries, age to race_0
dtypes: float64(24), int8(6714)
memory usage: 65.9 MB


## Split data set into Train and Test

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
y = df3.tot.astype(float)
X = df3.drop(columns=['tot'])
#print(y.dtypes)
#print('\n')
#print(X.dtypes)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)

## Decision Tree

In [44]:
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

In [45]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
predicted = model.predict(X_test)

In [46]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

701412864.39
0.7403596440999712


In [47]:
sorted([model.feature_importances_])

[array([5.88894816e-04, 3.68343781e-05, 5.47142486e-04, ...,
        0.00000000e+00, 8.32837528e-08, 0.00000000e+00])]

## AdaBoost

In [48]:
from sklearn.ensemble import AdaBoostRegressor

In [49]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)
predicted = model.predict(X_test)

In [50]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

936004539.9476076
0.6535213934415196


## Boosting

In [51]:
from sklearn.ensemble import GradientBoostingRegressor

In [58]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
model = GradientBoostingRegressor(**params)

In [None]:
model.fit(X_train, y_train)
predicted = model.predict(X_test)

In [53]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

350597429.948665
0.8702201711560102


## XGBoost

In [43]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

ModuleNotFoundError: No module named 'xgboost'

In [11]:
# fit model to training data
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.3, 'loss': 'ls'}
model = XGBRegressor(**params)

In [None]:
xgb_param = model.get_xgb_params()

cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=model.get_params()['n_estimators'], nfold=5,
                  early_stopping_rounds=50)
print(cvresult)

In [None]:
model.set_params(n_estimators=cvresult.shape[0])

In [None]:
#Fit the algorithm on the data
model.fit(X_train, y_train)

In [None]:
#Predict training set:
predicted = model.predict(X_test)
test_predprob = alg.predict_proba(X_test)[:,1]

In [None]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

In [None]:
feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)[:10]
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

#### Tuning

In [12]:
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search



In [20]:
param_test1 = {
 'max_depth': [2,4],
 'min_child_weight': [1,2]
}

In [None]:
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.3, n_estimators=100, seed=0), 
 param_grid = param_test1, cv=5)
gsearch1.fit(X_train, y_train)
print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

## XGBoost 3

In [36]:
# fit model to training data
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.1, 'loss': 'ls'}
model = XGBRegressor(**params)
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, loss='ls',
       max_delta_step=0, max_depth=4, min_child_weight=1,
       min_samples_split=2, missing=None, n_estimators=500, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [37]:
predicted = model.predict(X_test)

In [38]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

282654451.0070111
0.8953704644125516


## XGBoost 4

In [33]:
# fit model to training data
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
model = XGBRegressor(**params)
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, loss='ls',
       max_delta_step=0, max_depth=2, min_child_weight=1,
       min_samples_split=2, missing=None, n_estimators=500, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [34]:
predicted = model.predict(X_test)

In [35]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

395094964.3891483
0.8537486231343632


## XGBoost 2

In [33]:
# fit model to training data
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
model = XGBRegressor(**params)
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, loss='ls',
       max_delta_step=0, max_depth=2, min_child_weight=1,
       min_samples_split=2, missing=None, n_estimators=500, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [34]:
predicted = model.predict(X_test)

In [35]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

395094964.3891483
0.8537486231343632


## Random Forest

In [54]:
from sklearn.ensemble import RandomForestRegressor

In [55]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [56]:
predicted = model.predict(X_test)

In [57]:
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print(mse)
print(r2)

414168184.75642
0.8466883338586242
