In [None]:
import pandas as pd
from pandas import Series,DataFrame

import numpy as np
import operator

import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns
sns.set(style='whitegrid', context='notebook')

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

from sklearn import metrics
from sklearn.cross_validation import train_test_split, StratifiedKFold, cross_val_score

In [None]:
%%javascript
// Set the output area's auto-scroll threshold to 9999
// (presumably so long cell outputs are shown in full rather than in a scroll box — confirm).
IPython.OutputArea.auto_scroll_threshold = 9999;

In [None]:
# Read the training and test CSV files from the local data/ directory.
train_df, test_df = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

In [None]:
# The 'Id' column is not used for analysis or prediction, so remove it
# (in place) from both frames.
for frame in (train_df, test_df):
    frame.drop(['Id'], axis=1, inplace=True)

In [None]:
# Summary of the training frame (pandas DataFrame.info()).
train_df.info()

In [None]:
# Summary of the test frame (pandas DataFrame.info()).
test_df.info()

In [None]:
# Peek at the first three training rows.
train_df.head(3)

In [None]:
# Print each training column that contains nulls together with its null count.
# (train_df.isnull().sum() on its own lists the count for every column.)
train_null_counts = train_df.isnull().sum()
for col in train_null_counts.index:
    n_missing = train_null_counts[col]
    if n_missing != 0:
        print(col, n_missing)

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Print each test column that contains nulls together with its null count.
# (test_df.isnull().sum() on its own lists the count for every column.)
test_null_counts = test_df.isnull().sum()
for col in test_null_counts.index:
    n_missing = test_null_counts[col]
    if n_missing != 0:
        print(col, n_missing)

In [None]:
# fill NaN values. We don't need to do this, xgboost does it automatically
#train_df.fillna(-1, inplace=True)
#test_df.fillna(-1, inplace=True)

In [None]:
#train_df[pd.isnull(train_df['PersonalField7'])].head(3)

In [None]:
# scatter plot matrix
#sns.pairplot(yelp, kind='reg')

# limit scatter plot matrix and add regression lines
#sns.pairplot(yelp, x_vars=['cool', 'useful', 'funny'], y_vars='stars', size=6, aspect=0.7, kind='reg')

In [None]:
# How many customers purchased insurance plan
#sns.countplot(x="QuoteConversion_Flag", data=train_df)

In [None]:
#f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))

# how many customers bought or did not buy policy
#sns.countplot(x='QuoteConversion_Flag', data=train_df, hue='Year', ax=ax1)

# which year has highest number of policies bought
#sns.countplot(x=train_df['Year'].loc[train_df['QuoteConversion_Flag'] == 1], order=[2013,2014,2015], ax=ax2)

In [None]:
# which month has highest number of policies bought
#sns.countplot(x=train_df['Month'].loc[train_df['QuoteConversion_Flag'] == 1], order=[1,2,3,4,5,6,7,8,9,10,11,12])

In [None]:
# Columns with dtype 'object' hold non-numerical values. Each such column is
# replaced by integer codes; the encoder is fitted on the train and test values
# together so that a given value gets the same code in both frames.

from sklearn.preprocessing import LabelEncoder

object_columns = [f for f in train_df.columns if train_df[f].dtype == 'object']
for f in object_columns:
    train_values = list(train_df[f].values)
    test_values = list(test_df[f].values)
    lbl_encoder = LabelEncoder()
    lbl_encoder.fit(np.unique(train_values + test_values))
    train_df[f] = lbl_encoder.transform(train_values)
    test_df[f] = lbl_encoder.transform(test_values)

In [None]:
# Encode the 'Response' target as integer labels, then build the model inputs.
lbl_encoder = LabelEncoder()
response_values = list(train_df['Response'].values)
lbl_encoder.fit(np.unique(response_values))
train_df['Response'] = lbl_encoder.transform(response_values)

y_train = train_df['Response']
X_train = train_df.drop('Response', axis=1)
# Work on a copy of test_df, with columns in the same order as X_train.
X_test = test_df.copy()[X_train.columns.tolist()]

In [None]:
# Show the column index of the training matrix, then of the test matrix.
for frame in (X_train, X_test):
    print(frame.columns)

In [None]:
from sklearn import tree
from sklearn.cross_validation import train_test_split

# Feature matrix and labels for the decision-tree experiments.
# Rows containing nulls are kept (no dropna is applied).
training_data = train_df.drop('Response', axis=1)
traning_labels = train_df['Response']
print(training_data.shape)

# Test features, restricted to the training columns and in the same order.
testing_data = test_df[list(training_data.columns)]
print(testing_data.shape)

In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 5-fold cross-validation around a shallow
# decision tree, removing one feature per step.
# NOTE(review): nulls are not imputed earlier in this notebook; confirm the
# installed scikit-learn estimator accepts NaN inputs.
dtree = tree.DecisionTreeClassifier(criterion="gini", max_depth=5, min_samples_leaf=20)
selector = RFECV(dtree, step=1, cv=5).fit(training_data, traning_labels)

# Boolean mask over the columns of training_data: True for selected features.
selector.support_

In [None]:
# Hold out 25% of the training rows and fit a shallow decision tree on the rest
# to rank features by importance.
#
# The split is taken from train_df directly rather than from the current
# training_data / traning_labels, so re-running this cell does not split an
# already-split set again and keep shrinking it. Both the split and the tree
# are seeded so the ranking is reproducible across runs.
training_data, validation_data, traning_labels, validation_labels = train_test_split(
    train_df.drop('Response', axis=1),
    train_df['Response'],
    test_size=0.25,
    random_state=42)

dtree = tree.DecisionTreeClassifier(criterion="gini", max_depth=5, min_samples_leaf=20,
                                    random_state=42)
dtree = dtree.fit(training_data, traning_labels)

# One row per feature, most important first.
dtree_importances = pd.DataFrame(dtree.feature_importances_,
                                 columns=["Imp"],
                                 index=training_data.columns)
print(dtree_importances.sort_values(by=['Imp'], ascending=False))

In [None]:
def ceate_feature_map(features, path='xgb.fmap'):
    """Write a feature-map file listing ``features``.

    One tab-separated line is written per feature, in iteration order:
    the zero-based index, the feature name, and the literal type code ``q``.

    Parameters
    ----------
    features : iterable
        Feature names; each one is rendered with ``str.format``.
    path : str, optional
        File to write. Defaults to ``'xgb.fmap'`` in the current working
        directory. An existing file is overwritten.

    Returns
    -------
    None
    """
    # The context manager closes the file even if iterating or writing raises.
    with open(path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

In [None]:
# Write the feature map for the training columns, in column order.
features = X_train.columns.tolist()
ceate_feature_map(features)

In [None]:
# Train a short booster on all features and rank the features by F-score.
# NOTE(review): "n_estimators" is passed in the params dict while the number of
# boosting rounds given to xgb.train is num_rounds — confirm which one is intended.
xgb_params = {
    "n_estimators": 25,
    "objective": "multi:softmax",
    "num_class": 8,
    "eta": 0.025,
    "max_depth": 10,
    "silent": 1,
    "eval_metric": "auc",
}
num_rounds = 10

dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
gbdt = xgb.train(xgb_params, dtrain, num_rounds)

# F-scores keyed by the feature names from xgb.fmap, sorted by ascending score.
raw_fscores = gbdt.get_fscore(fmap='xgb.fmap')
importance = sorted(raw_fscores.items(), key=lambda pair: pair[1])

# Normalize the scores so they sum to 1.
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
fscore_total = df['fscore'].sum()
df['fscore'] = df['fscore'] / fscore_total

In [None]:
# Default DataFrame plot of df (the normalized feature scores).
# NOTE(review): df.plot() may create its own figure, which would leave the one
# opened by plt.figure() empty — confirm and drop the extra call if so.
plt.figure()
df.plot()

In [None]:
# Horizontal bar chart of the normalized F-scores, one bar per feature.
ax = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(15, 30))
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('relative importance')

In [None]:
# Candidate sets of low-importance features: the first 24, 41 and 61 rows of df,
# which is ordered by ascending fscore.
low_importance_features = [
    df[0:cutoff]['feature'].tolist()
    for cutoff in (24, 41, 61)
]

In [None]:
# Compare 5-fold cross-validated accuracy after dropping each candidate set of
# low-importance features.
skf = StratifiedKFold(y_train, n_folds=5)
for low_importance_feature_set in low_importance_features:
    # Drop unwanted features
    X_train_reduced = X_train.drop(low_importance_feature_set, axis=1)
    X_test_reduced = X_test.drop(low_importance_feature_set, axis=1)

    xgb_clf = xgb.XGBClassifier(n_estimators=25,
                                objective="multi:softmax",
                                max_depth=10,
                                learning_rate=0.025,
                                silent=True,
                                subsample=0.8,
                                colsample_bytree=0.8,
                                missing=np.nan)

    # Fit and score on the reduced matrix. Using the full X_train here would
    # make every feature set evaluate the same columns and defeat the comparison.
    scores = []
    for train_index, test_index in skf:
        xgb_clf.fit(X_train_reduced.iloc[train_index], y_train.iloc[train_index], eval_metric="auc")
        y_pred = xgb_clf.predict(X_train_reduced.iloc[test_index])
        scores.append(metrics.accuracy_score(y_train.iloc[test_index], y_pred))

    train_acc = np.array(scores).mean() * 100
    print('XGBoostClassifier Cross Validation Accuracy With %s Reduced Features: %.2f%%' % (len(low_importance_feature_set), train_acc))

In [None]:
# Base XGBoost estimator used for the grid search.
xgb_clf_params = dict(
    objective="binary:logistic",
    nthread=-1,
    learning_rate=0.025,
    silent=True,
    subsample=0.8,
    colsample_bytree=0.8,
    missing=np.nan,
)
xgb_clf = xgb.XGBClassifier(**xgb_clf_params)

In [None]:
from sklearn.grid_search import GridSearchCV

# Search on the training matrix with the first (24-feature) low-importance set
# removed. The list of candidate sets is indexed here; the loop variable
# low_importance_feature_set holds whichever set was processed last, so [0] on
# it would select a single feature name instead of a set.
X_train_reduced = X_train.drop(low_importance_features[0], axis=1)
param_grid = {'max_depth': [6, 8],
              'n_estimators': [200, 500]}

# NOTE(review): the target is treated as multi-class elsewhere in this notebook
# (num_class=8), and the 'roc_auc' scorer only accepts binary targets, so the
# search is scored by accuracy, the same metric as the cross-validation
# comparison. Confirm this is the metric wanted.
gs = GridSearchCV(xgb_clf,
                  param_grid,
                  scoring='accuracy',
                  cv=5,
                  n_jobs=1,
                  verbose=1)

gs.fit(X_train_reduced, y_train)
# Print explicitly: a bare expression in the middle of a cell is not displayed.
print(gs.best_score_, gs.best_params_)
print(gs.best_estimator_)

In [None]:
#clf = gs.best_estimator_
#clf.fit(X_train, y_train)
#y_pred_proba = clf.predict_proba(X_test)[:,1]

In [None]:
# Create submission
#sample = pd.read_csv('data/sample_submission.csv')
#sample.QuoteConversion_Flag = y_pred_proba
#sample.head(10)

In [None]:
#sample.to_csv('xgb_benchmark.csv', index=False)