In [None]:
import pandas as pd
from pandas import Series,DataFrame

import numpy as np
import operator

import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns
sns.set(style='whitegrid', context='notebook')

import xgboost as xgb

from sklearn import metrics
from sklearn.cross_validation import train_test_split, StratifiedKFold, cross_val_score

from time import time

In [None]:
%%javascript
// Raise the output-area auto-scroll threshold to 9999 so that long cell outputs
// are shown in full rather than inside a scroll box.
// NOTE(review): presumably only effective in the classic Notebook front end — confirm.
IPython.OutputArea.auto_scroll_threshold = 9999;

In [None]:
# Read the training and test sets from the local data/ folder
train_df, test_df = (pd.read_csv(csv_path)
                     for csv_path in ("data/train.csv", "data/test.csv"))

In [None]:
# QuoteNumber is removed from both frames (in place): it is not used in the
# analysis or as a predictor
for frame in (test_df, train_df):
    frame.drop(['QuoteNumber'], axis=1, inplace=True)

In [None]:
# Print pandas' summary of the training frame
train_df.info()

In [None]:
# Print pandas' summary of the test frame
test_df.info()

In [None]:
# Peek at the first three training rows
train_df.head(3)

In [None]:
# Convert the quote date to Year, Month, and Weekday
def _add_date_parts(frame):
    """Replace `Original_Quote_Date` with numeric Year, Month and Weekday columns.

    The frame is modified in place. Weekday comes from pandas' ``dt.dayofweek``
    (Monday is 0). If `Original_Quote_Date` is no longer present the frame is
    left untouched, so this cell can be re-run safely.
    """
    if 'Original_Quote_Date' not in frame.columns:
        return
    quote_date = pd.to_datetime(frame['Original_Quote_Date'])
    # Use the vectorized .dt accessors instead of slicing the string form of
    # every timestamp
    frame['Year'] = quote_date.dt.year
    frame['Month'] = quote_date.dt.month
    frame['Weekday'] = quote_date.dt.dayofweek
    frame.drop(['Original_Quote_Date'], axis=1, inplace=True)


# Apply exactly the same transformation to the training and test frames
for frame in (train_df, test_df):
    _add_date_parts(frame)

In [None]:
# (rows, columns) of the training frame at this point in the notebook
train_df.shape

In [None]:
# Correlation heatmaps: the target against successive groups of 14 columns.
features = train_df.columns.tolist()
chunk_size = 14
# Start at position 1 — presumably column 0 is the target itself; confirm
# against the column order of train_df.
for chunk_start in range(1, len(features), chunk_size):
    sub_features = ['QuoteConversion_Flag'] + features[chunk_start:chunk_start + chunk_size]
    # The string (object) columns have not been label-encoded yet at this point.
    # Keep only numeric/bool columns so that .corr() does not raise on
    # pandas >= 2.0, where non-numeric columns are no longer dropped silently.
    numeric_block = train_df[sub_features].select_dtypes(include=['number', 'bool'])
    plt.figure()
    sns.heatmap(numeric_block.corr())

In [None]:
# (rows, columns) of the test frame at this point in the notebook
test_df.shape

In [None]:
# Columns with dtype 'object' hold non-numerical values. Each such column is
# mapped to integer codes, using one encoder fitted on the union of the values
# seen in the training and test sets so that both share the same codes.

from sklearn import preprocessing

object_columns = [col for col in train_df.columns if train_df[col].dtype == 'object']
for col in object_columns:
    print(col)
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    encoder = preprocessing.LabelEncoder()
    encoder.fit(np.unique(train_values + test_values))
    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)

In [None]:
# Split the training frame into target and predictors, and give the test frame
# the same columns in the same order as the predictors
target_name = 'QuoteConversion_Flag'
y_train = train_df[target_name]
X_train = train_df.drop(target_name, axis=1)
X_test = test_df[X_train.columns.tolist()].copy()

In [None]:
# Show both column indexes so the train/test alignment can be checked by eye
for frame in (X_train, X_test):
    print(frame.columns)

In [None]:
def ceate_feature_map(features, fmap_path='xgb.fmap'):
    """Write an XGBoost feature-map file with one line per feature.

    Each line holds three tab-separated fields: the zero-based position of the
    feature in `features`, the feature name, and the literal type marker ``q``.

    Parameters
    ----------
    features : iterable
        Feature names, in the order the model sees the columns.
    fmap_path : str, optional
        Destination file; defaults to ``'xgb.fmap'`` in the working directory.
        An existing file is overwritten.
    """
    # The context manager closes (and flushes) the file even if a write fails.
    with open(fmap_path, 'w') as outfile:
        for index, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(index, feat))

In [None]:
# Write the feature map for the predictor columns, in X_train column order
features = X_train.columns.tolist()
ceate_feature_map(features)

In [None]:
# Calculate feature importance of the attributes with a short XGBoost run.
# With the native xgb.train API the number of boosting rounds is the third
# argument (num_rounds); an "n_estimators" entry in the params dict belongs to
# the scikit-learn wrapper and has no effect here, so it is not set.
xgb_params = {"objective": "binary:logistic", "eta": 0.025, "max_depth": 10, "silent": 1, "eval_metric": "auc"}
num_rounds = 10

dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
gbdt = xgb.train(xgb_params, dtrain, num_rounds)

# get_fscore returns a {feature name: score} mapping.
# NOTE(review): features the booster never split on are presumably absent from
# this mapping, so they will not appear in df — confirm against the xgboost docs.
fscore_by_feature = gbdt.get_fscore(fmap='xgb.fmap')
# Ascending order: the least important features come first
importance = sorted(fscore_by_feature.items(), key=operator.itemgetter(1))

# Normalize the scores so they sum to 1
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

In [None]:
# Line plot of the normalized scores against the row index of df.
# DataFrame.plot() opens its own figure, so no plt.figure() call is made first
# (it would only leave an empty figure in the output).
ax = df.plot()
ax.set_xlabel('feature rank (row index of df)')
ax.set_ylabel('relative importance');

In [None]:
# Horizontal bar chart of the relative importance of every scored feature
importance_ax = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(15, 30))
importance_ax.set_title('XGBoost Feature Importance')
importance_ax.set_xlabel('relative importance')
# To keep a copy on disk: plt.gcf().savefig('feature_importance_xgb.png')

In [None]:
# Low-importance candidates: two sets of feature names, taken from the first
# 21 and the first 41 rows of df
low_importance_features = [df['feature'].iloc[:cutoff].tolist() for cutoff in (21, 41)]

In [None]:
# 5-fold cross-validation of one fixed XGBoost configuration, repeated after
# dropping each candidate set of low-importance features.
# The estimator does not depend on the feature set, so it is built once;
# cross_val_score is handed the unfitted estimator on every pass.
xgb_clf = xgb.XGBClassifier(n_estimators=25,
                            objective="binary:logistic",
                            nthread=-1,
                            max_depth=10,
                            learning_rate=0.025,
                            silent=True,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            missing=np.nan)

for low_importance_feature_set in low_importance_features:
    # Drop unwanted features (only the training frame is needed here)
    X_train_reduced = X_train.drop(low_importance_feature_set, axis=1)

    scores = cross_val_score(xgb_clf,
                             X_train_reduced,  # training data
                             y_train,  # training labels
                             # An integer cv is used because no `skf` splitter
                             # is defined anywhere in this notebook; for a
                             # classifier scikit-learn turns it into
                             # stratified K-fold.
                             cv=5,
                             scoring='roc_auc',
                             n_jobs=-1  # -1 = use all cores
                             )
    # The metric is ROC AUC (see `scoring`), reported here as a percentage
    print('XGBoostClassifier Cross Validation ROC AUC With %s Reduced Features: %.2f%%' % (len(low_importance_feature_set), np.array(scores).mean() * 100))

In [None]:
# Base estimator for GridSearchCV with XGBoost: only the fixed settings are
# given here
base_settings = dict(objective="binary:logistic",
                     nthread=-1,
                     silent=True,
                     missing=np.nan)
xgb_clf = xgb.XGBClassifier(**base_settings)

In [None]:
# Exhaustive grid search over the XGBoost hyper-parameters on the reduced
# feature set.
try:
    from sklearn.model_selection import GridSearchCV  # scikit-learn >= 0.18
except ImportError:
    from sklearn.grid_search import GridSearchCV  # older scikit-learn releases

# Drop the first candidate set (the 21-feature list). Index the list of sets,
# not the loop variable `low_importance_feature_set` left over from the
# cross-validation cell: indexing that one yields a single feature name.
X_train_reduced = X_train.drop(low_importance_features[0], axis=1)
X_test_reduced = X_test.drop(low_importance_features[0], axis=1)
param_grid = {'max_depth': [2,4,6,8,10],
              'n_estimators': [50,100,200,500,1000],
              'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'subsample': [0.9, 1.0],
              'colsample_bytree': [0.8, 1.0]}

t0 = time()
gs = GridSearchCV(xgb_clf,
                  param_grid,
                  scoring='roc_auc',
                  cv=5,
                  n_jobs=-1,  # all available cores instead of a machine-specific count
                  verbose=1)

gs.fit(X_train_reduced, y_train)
print('Grid search finished in %.1f s' % (time() - t0))
print(gs.best_score_)
print(gs.best_params_)
print(gs.best_estimator_)
print(gs.best_estimator_)

In [None]:
# GridSearchCV refits the best parameter combination on the data passed to
# gs.fit by default (refit=True), so best_estimator_ is already trained and
# does not need a second fit.
clf = gs.best_estimator_
# Column 1 of predict_proba is the probability of the second class
# (label 1 when the labels are 0/1)
y_pred_proba = clf.predict_proba(X_test_reduced)[:, 1]

In [None]:
# Create submission: fill the sample file's QuoteConversion_Flag column with
# the predicted probabilities.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as data/test.csv — confirm.
sample = pd.read_csv('data/sample_submission.csv')
# Bracket assignment always writes a column; attribute assignment would only
# set a plain Python attribute if the column were missing.
sample['QuoteConversion_Flag'] = y_pred_proba

In [None]:
# Write the submission to the working directory without the row index
sample.to_csv('reduced_xgb_benchmark.csv', index=False)