# Xgboost QuickStart

Reimplemented from this [blog post](http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/?utm_source=feedburner&utm_medium=email&utm_campaign=Feed%3A+AnalyticsVidhya+%28Analytics+Vidhya%29).

In [45]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import RandomizedSearchCV

In [2]:
%matplotlib inline
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

## Data Preprocessing

Load the training and testing data and combine them together for preprocessing.

In [3]:
train = pd.read_csv('Dataset/Train.csv')
test  = pd.read_csv('Dataset/Test.csv')
train['source'] = 'train' # add another column to differentiate training and testing 
test['source']  = 'test'
data = pd.concat( [train, test], ignore_index = True )
data.shape

(124737, 27)

Look at categories for *object* variables:

In [6]:
# Check for missing values
# data.isnull().sum()

# looping through all of them
# var = data.columns[ data.dtypes == 'object' ]
# for v in var:
#    print '\nFrequency count for variable %s' % v
#    print data[v].value_counts()
train['Disbursed'].value_counts() / train.shape[0]

0    0.985371
1    0.014629
Name: Disbursed, dtype: float64

Handling individual variables.

1. `City`, `EmployerName`, `Salary_Account`, `LoggedIn` variables dropped because of too many distinct levels.
2. `Lead_Creation_Date` dropped because it doesn't appear to affect much intuitively.
2. `DOB` date of birth is dropped after converted to `Age`.
3. "*_Missing" named column created ( the star represents the original column name) and takes the value of 1 if the original column is missing and 0 if not. And the original column is then removed. `EMI_Loan_Submitted`, `Interest_Rate`, `Loan_Amount_Submitted`, `Loan_Tenure_Submitted`, `Processing_Fee`.
5. `Existing_EMI`, `Loan_Amount_Applied`, `Loan_Tenure_Applied` imputed with 0 (median) since very few observations are missing.
6. `Source`. The top 2 most frequent level kept as it is and all others combined into one single category "others".
7. One-Hot-Encode the categorical variables.

In [7]:
def data_preprocessing(data):
    """
    Pass in data ( train and test data concated together ),
    performs multiple preprocessing steps and returns the cleaned
    training and testing data 
    """
    # 1. drop because of too many unique levels
    # print len( data['City'].unique() )
    too_many_levels = [ 'City', 'Employer_Name', 'Salary_Account', 'LoggedIn' ]
    data.drop( too_many_levels, axis = 1, inplace = True )

    # 2. drop this variable because doesn't appear to affect much intuitively
    remove = [ 'ID', 'Lead_Creation_Date' ]
    data.drop( remove, axis = 1, inplace = True )

    # 3. Determine Age from DOB ( date of birth )
    # extract the last two element of the string (year)
    data['Age'] = data['DOB'].apply( lambda x: 115 - int(x[-2:]) )
    data.drop( 'DOB', axis = 1, inplace = True )

    # 4. record whether the value is missing or not ; drop original vaiables
    missing_or_not = [ 'EMI_Loan_Submitted', 'Loan_Amount_Submitted', 
                       'Loan_Tenure_Submitted', 'Interest_Rate', 'Processing_Fee' ]
    for col in missing_or_not:
        column_name = col + '_Missing'
        data[column_name] = data[col].apply( lambda x: 1 if pd.isnull(x) else 0 )
    data.drop( missing_or_not, axis = 1, inplace = True )

    # 5. impute by median, because just 111 observations are missing
    impute_by_median = [ 'Existing_EMI', 'Loan_Amount_Applied', 'Loan_Tenure_Applied' ]
    for col in impute_by_median:
        data[col].fillna( data[col].median(), inplace = True )

    # 6. categorize all the other levels as 'others' except for 'S122' and 'S133'
    data['Source'] = data['Source'].apply( lambda x: 'others' if x not in ['S122','S133'] else x )
    # data['Source'].value_counts()

    # 7. one hot encode, exclude the last column 'source', which just keeps track of the train / test
    object_cols = data.columns[ data.dtypes == 'object'][:-1]
    data = pd.get_dummies( data, columns = object_cols  )

    train = data[ data['source'] == 'train' ].copy()
    test  = data[ data['source'] == 'test' ].copy()
    train.drop( 'source', axis = 1, inplace = True )
    test.drop( [ 'source', 'Disbursed' ], axis = 1, inplace = True )
    return train, test

In [8]:
train, test = data_preprocessing(data)
# set the input and output columns
target = 'Disbursed'
predictors = [ x for x in train.columns if x not in [target] ]

## XGBClassifier

xgboost parameter [API](http://xgboost.readthedocs.org/en/latest/parameter.html#general-parameters).

Initial try with some parameters.

Define the classifiers and the parameters' dictionary that does the random searching.

In [39]:
xgb1 = XGBClassifier(
    learning_rate = 0.005,
    n_estimators = 10, # number of trees to fit 
    gamma = 0, # the minimum loss reduction to split the tree, the larger, the more conservative
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = 'binary:logistic',
    nthread = -1,
    seed = 27
)

params_dict = {
    'max_depth': [ 4, 7, 10 ],
    'min_child_weight': [ 10, 15, 23 ] # the minimum number of node of a leaf node
}

rs = RandomizedSearchCV(
    estimator = xgb1,
    param_distributions = params_dict,
    scoring = 'roc_auc',
    cv = 10,
    n_iter = 5 
)

In [40]:
rs.fit( train[predictors].values, train[target].values )

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.005, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8),
          fit_params={}, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'max_depth': [4, 7, 10], 'min_child_weight': [10, 15, 23]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring='roc_auc', verbose=0)

In [49]:
rs.best_params_

{'max_depth': 10, 'min_child_weight': 23}

In [47]:
pred = rs.predict(train[predictors])
prob = rs.predict_proba( train[predictors])[ :, 1 ]
        
#Print model report:
print "\nModel Report"
print "Accuracy : %.4f" % accuracy_score( train[target].values, pred )
print "AUC Score (Train): %f" % roc_auc_score( train[target].values, prob )


Model Report
Accuracy : 0.9854
AUC Score (Train): 0.815490


In [54]:
rs.best_estimator_.fit( train[predictors].values, train[target].values )

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.005, max_delta_step=0, max_depth=10,
       min_child_weight=23, missing=None, n_estimators=10, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8)