In [62]:
from pandas import read_csv
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import matplotlib.pyplot as plt
import numpy as np
import data
import lightgbm as lgb
from sklearn.metrics import log_loss # positive value

In [70]:
# Feature matrices come from the project-local `data` module; both are
# converted to plain NumPy arrays via `.values`.
dataset = data.Dataset()
X_train, X_test = dataset.getTrain().values, dataset.getTest().values

# Labels are read from the second column (index 1) of y_train.csv.
y_train_data = read_csv('../y_train.csv')
y_train = y_train_data.values[:, 1]

# Recode the -1 label as 0.
# NOTE(review): presumably this makes the labels contiguous (0..4) to match
# the 5-class models below -- confirm against the raw label set.
y_train[y_train == -1] = 0

In [71]:
# Hold out 20% of the training rows as a validation set; the base models are
# fit on the remaining 80% and the meta-models are fit on the held-out part.
# NOTE(review): this rebinds X_train / y_train, so re-running this cell
# without first re-running the loading cell splits the already-reduced
# training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=441,
)

In [72]:
# XGBoost base learner. Only the objective and the seed are set; every other
# hyperparameter is left at the library default.
# Tuned values tried earlier but currently disabled:
#   max_depth=8, max_leaves=66, learning_rate=0.1, subsample=0.8,
#   min_child_weight=8, colsample_bytree=0.6, gamma=0.1, n_estimators=200
xgb_model = xgb.XGBClassifier(objective="multi:softmax", random_state=441)
xgb_model = xgb_model.fit(X_train, y_train)

In [73]:
# LightGBM base learner (5-class, gbdt boosting) with default hyperparameters.
#
# Best tuned result recorded, using default num_leaves and learning_rate:
#   -0.846532 using {'feature_fraction': 0.26, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 175, 'num_leaves': 31, 'subsample': 0.5}
# Tuned values tried earlier but currently disabled:
#   feature_fraction=0.26, num_leaves=27 (close to default),
#   learning_rate=0.084 (close to default), n_estimators=175, max_depth=6,
#   subsample=0.5
# Defaults, for reference:
#   feature_fraction=1.0, learning_rate=0.1, max_depth=-1 (i.e. unlimited),
#   n_estimators=100, num_leaves=31, subsample=1.0
lgbm_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=5,
    boosting_type='gbdt',
    seed=441,
    verbose=-1,
    force_col_wise=True,
)
lgbm_model = lgbm_model.fit(X_train, y_train)

In [74]:
# Class-probability predictions of both base learners on the held-out
# validation rows (XGBoost first, then LightGBM).
xgb_probs, lgb_probs = (
    base_model.predict_proba(X_val) for base_model in (xgb_model, lgbm_model)
)

In [75]:
# Meta-features: XGBoost probabilities followed by LightGBM probabilities,
# joined column-wise so each validation row stays one row.
stacked = np.hstack((xgb_probs, lgb_probs))

In [9]:
# Logistic-regression meta-model: 5-fold internal cross-validation selects
# the regularisation strength, scored by negative log loss.
lmodel = LogisticRegressionCV(
    cv=5,
    scoring="neg_log_loss",
    max_iter=1000,
    random_state=441,
)
lmodel.fit(stacked, y_val)

In [10]:
# NOTE(review): this scores the model on the same rows it was fit on, so it
# is an in-sample figure and is not directly comparable with the
# cross-validated `best_score_` values printed for the other meta-models.
lmodel_in_sample_score = lmodel.score(stacked, y_val)
print(lmodel_in_sample_score)

-0.8557052567723323


In [23]:
# Random-forest meta-model, evaluated with 5-fold stratified CV on the stacked
# validation probabilities (negative log loss).
# Recorded result: -1.117774
rfmodel = RandomForestClassifier(random_state=441)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=441)

# Single-point grid: GridSearchCV is used here only as a CV scorer.
param_grid = dict(
    n_estimators=[100],
    max_features=['sqrt'],
    criterion=['log_loss'],
)

grid_search = GridSearchCV(
    estimator=rfmodel,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=kfold,
)
grid_result = grid_search.fit(stacked, y_val)

best_summary = (grid_result.best_score_, grid_result.best_params_)
print("\nBest: %f using %s" % best_summary)


Best: -1.117774 using {'criterion': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 100}


In [27]:
# Extra randomized trees meta-model, evaluated with 5-fold stratified CV on
# the stacked validation probabilities (negative log loss).
# Recorded result from an earlier, unseeded run: -1.179473
#
# The estimator is seeded like every other model in this notebook so the
# reported score is reproducible across runs.
rand_trees_model = ExtraTreesClassifier(random_state=441)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=441)

# Single-point grid: GridSearchCV is used here only as a CV scorer.
param_grid = { 
    'n_estimators': [100],
    'max_features': ['sqrt'],
    'criterion' :['log_loss']
}
grid_search = GridSearchCV(estimator=rand_trees_model, param_grid=param_grid, scoring="neg_log_loss", cv=kfold)
grid_result = grid_search.fit(stacked, y_val)
print("\nBest: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


Best: -1.179473 using {'criterion': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 100}


In [60]:
# LightGBM meta-model, evaluated with 5-fold stratified CV on the stacked
# validation probabilities (negative log loss).
# Recorded results:
#   data_old: -0.854490
#   data:     -0.855195
# Base-learner values tried earlier but disabled on the constructor:
#   feature_fraction=0.26, num_leaves=27 (close to default),
#   learning_rate=0.084 (close to default), n_estimators=175, max_depth=6,
#   subsample=0.5
lgb_meta_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=5,
    boosting_type='gbdt',
    seed=441,
    verbose=-1,
    force_col_wise=True,
)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=441)

# Single-point grid: a deliberately small model (shallow trees, few leaves).
# NOTE(review): `subsample` is set without `subsample_freq`; per the LightGBM
# docs bagging is only active when the bagging frequency is non-zero, so this
# value is likely a no-op -- confirm before relying on it.
param_grid = dict(
    feature_fraction=[0.26],
    num_leaves=[6],
    learning_rate=[0.084],
    n_estimators=[50],
    max_depth=[2],
    subsample=[0.01],
)

grid_search = GridSearchCV(
    estimator=lgb_meta_model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=kfold,
)
grid_result = grid_search.fit(stacked, y_val)

best_summary = (grid_result.best_score_, grid_result.best_params_)
print("\nBest: %f using %s" % best_summary)


Best: -0.854490 using {'feature_fraction': 0.26, 'learning_rate': 0.084, 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 6, 'subsample': 0.01}


In [76]:
# XGBoost meta-model: hyperparameter search with 5-fold stratified CV on the
# stacked validation probabilities (negative log loss).
#
# Recorded results:
#   0.2 train/val: -0.849004 using {'colsample_bytree': 0.7, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 2, 'max_leaves': 3, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.8}
#   data_old:      -0.845424
#   data:          -0.847389 {'colsample_bytree': 0.7}
#
# Grids tried earlier (kept for reference, no longer executed):
#   1. single point: colsample_bytree=0.8, gamma=0.1, learning_rate=0.1,
#      max_depth=2, max_leaves=3, min_child_weight=4, subsample=0.8,
#      n_estimators=105
#   2. max_depth=[2,3,4], max_leaves=[2,3,4,5], learning_rate=[0.07,0.1,0.13],
#      min_child_weight=[2,3,4], subsample=[0.8,0.9,1],
#      colsample_bytree=[0.7,0.8], gamma=[0,0.1,0.2], n_estimators=[100]
#   3. max_depth=[2,3], max_leaves=[2,3], learning_rate=[0.1,0.13,0.15],
#      min_child_weight=[4], subsample=[0.8], colsample_bytree=[0.7,0.8],
#      gamma=[0,0.1,0.2], n_estimators=[95,100,105]
#   4. single point: colsample_bytree=0.7, gamma=0, learning_rate=0.13,
#      max_depth=2, max_leaves=2, min_child_weight=4, subsample=0.8,
#      n_estimators=100
#
# Constructor values tried earlier but disabled:
#   max_depth=8, max_leaves=66, learning_rate=0.1, subsample=0.8,
#   min_child_weight=8, colsample_bytree=0.6, gamma=0.1
stack_model = xgb.XGBClassifier(objective="multi:softmax", random_state=441)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=441)

# Current grid: a single point at the best configuration found so far.
param_grid = dict(
    colsample_bytree=[0.7],
    gamma=[0.1],
    learning_rate=[0.1],
    max_depth=[2],
    max_leaves=[3],
    min_child_weight=[4],
    subsample=[0.8],
    n_estimators=[105],
)

grid_search = GridSearchCV(
    estimator=stack_model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,
    cv=kfold,
)
grid_result = grid_search.fit(stacked, y_val)

best_summary = (grid_result.best_score_, grid_result.best_params_)
print("\nBest: %f using %s" % best_summary)

# To list every candidate instead of only the best one, iterate over
# zip(grid_result.cv_results_['mean_test_score'], grid_result.cv_results_['params'])
# and print "%f with: %r" for each (mean, param) pair.




Best: -0.847389 using {'colsample_bytree': 0.7, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 2, 'max_leaves': 3, 'min_child_weight': 4, 'n_estimators': 105, 'subsample': 0.8}


In [77]:
# Final XGBoost meta-model: refit on all stacked validation rows with the
# best hyperparameters reported by the grid search cell.
stacked_model = xgb.XGBClassifier(
    objective="multi:softmax",
    random_state=441,
    n_estimators=105,
    learning_rate=0.1,
    max_depth=2,
    max_leaves=3,
    min_child_weight=4,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.7,
)
stacked_model = stacked_model.fit(stacked, y_val)

In [78]:
# Base-learner probabilities on the test set, joined column-wise in the same
# order (XGBoost, then LightGBM) used to build the meta-model training matrix.
xgb_test, lgb_test = (
    base_model.predict_proba(X_test) for base_model in (xgb_model, lgbm_model)
)
stacked_test = np.hstack((xgb_test, lgb_test))

In [79]:
# Final class probabilities for the test set, produced by the XGBoost
# meta-model from the stacked base-learner probabilities.
prob = stacked_model.predict_proba(stacked_test)

In [82]:
# Build the submission table: a 0-based row id followed by the predicted
# class probabilities. The id column is derived from the number of prediction
# rows rather than a hard-coded test-set size, so the cell keeps working when
# the test data changes.
row_ids = np.arange(prob.shape[0])
test = np.c_[row_ids, prob]

# One id column plus five probability columns, in header order.
header = "id,no answer,very important,quite important,not important,not at all important"
fmt = '%d', '%1.9f', '%1.9f', '%1.9f', '%1.9f', '%1.9f'

# comments="" stops savetxt from prefixing the header line with '# '.
np.savetxt('stack_default_new_data.csv', test, delimiter=',', header=header, comments="", fmt=fmt)