In [1]:
# Load libraries

In [2]:
# Python libraries

import pandas as pd

import numpy as np

from datetime import datetime

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, learning_curve, train_test_split

from sklearn.metrics import precision_score, roc_auc_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score

#import xgboost as xgb


# import warnings
# warnings.filterwarnings('ignore')

# import plotly.offline as py

# py.init_notebook_mode(connected=True)

# import plotly.graph_objs as go

# import plotly.tools as tls

# import plotly.figure_factory as ff



In [5]:
# Read the data

In [4]:
data = pd.read_csv('Attrition_Final.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1417 entries, 0 to 1416
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                1417 non-null   int64  
 1   Attrition          1417 non-null   object 
 2   StackRate          1417 non-null   int64  
 3   Department         1417 non-null   int64  
 4   JobRole            1417 non-null   int64  
 5   DistanceFromHome   1417 non-null   float64
 6   Education          1417 non-null   int64  
 7   Gender             1417 non-null   int64  
 8   JobLevel           1417 non-null   int64  
 9   Technical          1417 non-null   int64  
 10  TotalWorkingYears  1417 non-null   int64  
 11  SpaChange          1417 non-null   int64  
 12  PocChange          1417 non-null   int64  
 13  ShiftChange        1417 non-null   int64  
 14  CareerProgression  1417 non-null   int64  
 15  EmployeeNumber     1417 non-null   int64  
dtypes: float64(1), int64(14)

In [5]:
# Missing values

In [6]:
null_feat = pd.DataFrame(len(data['Attrition']) - data.isnull().sum(), columns = ['Count'])

trace = go.Bar(x = null_feat.index, y = null_feat['Count'] ,opacity = 0.8, marker=dict(color = 'DeepSkyBlue',
        line=dict(color='#000000',width=1.2)))

layout = dict(title =  "Missing Values")
                    
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

In [7]:
# Reassign target

In [8]:
data.Attrition.replace(to_replace = dict(Yes = 1, No = 0), inplace = True)

In [9]:
# Exploratory Data Analysis (EDA)

In [10]:
# Head and describe

In [11]:
data.head()

Unnamed: 0,Age,Attrition,StackRate,Department,JobRole,DistanceFromHome,Education,Gender,JobLevel,Technical,TotalWorkingYears,SpaChange,PocChange,ShiftChange,CareerProgression,EmployeeNumber
0,27,0,51,1,1,11.0,1,0,0,1,3,1,1,1,0,17000012
1,23,0,68,1,1,4.0,1,1,0,1,1,1,1,1,0,17000036
2,22,0,68,1,1,25.0,1,0,0,0,0,1,1,1,0,17000045
3,25,0,81,1,1,17.0,1,0,0,1,1,1,1,1,0,17000054
4,23,0,55,1,1,17.0,1,1,0,1,1,1,1,1,0,17000057


In [12]:
data.describe()

Unnamed: 0,Age,Attrition,StackRate,Department,JobRole,DistanceFromHome,Education,Gender,JobLevel,Technical,TotalWorkingYears,SpaChange,PocChange,ShiftChange,CareerProgression,EmployeeNumber
count,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0,1417.0
mean,24.200423,0.271701,68.127029,3.008469,4.525759,17.770783,1.046577,0.525759,0.231475,0.983063,1.169372,0.775582,0.697953,0.927311,0.091743,11442530.0
std,2.257611,0.444993,17.998374,2.679686,4.850397,6.464297,0.454828,0.499512,0.684821,0.129082,0.437758,0.417346,0.459307,0.259717,0.288765,9766481.0
min,18.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,211694.0
25%,23.0,0.0,56.0,1.0,1.0,17.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,797492.0
50%,24.0,0.0,67.0,1.0,3.0,17.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,17007820.0
75%,25.0,1.0,82.0,6.0,7.0,20.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,20018510.0
max,31.0,1.0,105.0,12.0,18.0,60.0,8.0,1.0,4.0,1.0,3.0,1.0,1.0,1.0,1.0,40020280.0


In [13]:
# Target distribution (%)

In [14]:
attrition = data[(data['Attrition'] != 0)]
no_attrition = data[(data['Attrition'] == 0)]


#------------PERCENTAGE-------------------
trace = go.Pie(labels = ['Not_Attrited', 'Attrited'], values = data['Attrition'].value_counts(), 
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['lightskyblue','gold'], 
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Distribution of attrition variable')
           
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

In [16]:
#  Features distribution and barplot (hue = Attrition)

In [15]:
def plot_distribution(var_select, bin_size) : 
    
# Calculate the correlation coefficient between the new variable and the target

    corr = data['Attrition'].corr(data[var_select])
    
    corr = np.round(corr,3)              # Round an array to the given number of decimals
    tmp1 = attrition[var_select]
    tmp2 = no_attrition[var_select]
    hist_data = [tmp1, tmp2]
    
    group_labels = ['Attrited', 'Not_Attrited']
    colors = ['#FFD700', '#7EC0EE']

    fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, curve_type='kde', bin_size = bin_size)
    
    fig['layout'].update(title = var_select+' '+'(corr ='+ str(corr)+')')

    py.iplot(fig)

In [16]:
def barplot(var_select, x_no_numeric) :
    tmp1 = data[(data['Attrition'] != 0)]
    tmp2 = data[(data['Attrition'] == 0)]
    tmp3 = pd.DataFrame(pd.crosstab(data[var_select],data['Attrition']), ) # crosstab that counts the number 
                                                                           # of occurences of each combination
    tmp3['Attr%'] = tmp3[1] / (tmp3[1] + tmp3[0]) * 100
    if x_no_numeric == True  : 
        tmp3 = tmp3.sort_values(1, ascending = False)

    color=['lightskyblue','gold' ]
    trace1 = go.Bar(
        x=tmp1[var_select].value_counts().keys().tolist(),
        y=tmp1[var_select].value_counts().values.tolist(),
        name='Attrited',opacity = 0.8, marker=dict(
        color='gold',
        line=dict(color='#000000',width=1)))

    
    trace2 = go.Bar(
        x=tmp2[var_select].value_counts().keys().tolist(),
        y=tmp2[var_select].value_counts().values.tolist(),
        name='Not_Attrited', opacity = 0.8, marker=dict(
        color='lightskyblue',
        line=dict(color='#000000',width=1)))
    
    trace3 =  go.Scatter(   
        x=tmp3.index,
        y=tmp3['Attr%'],
        yaxis = 'y2',
        name='% Attrition', opacity = 0.6, marker=dict(
        color='black',
        line=dict(color='#000000',width=0.5
        )))

    layout = dict(title =  str(var_select),
              xaxis=dict(), 
              yaxis=dict(title= 'Count'), 
              yaxis2=dict(range= [-0, 75], 
                          overlaying= 'y', 
                          anchor= 'x', 
                          side= 'right',
                          zeroline=False,
                          showgrid= False, 
                          title= '% Attrition'
                         ))

    fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
    py.iplot(fig)


In [17]:
#plot_distribution('StackRate', False)
barplot('StackRate', False)


##plot_distribution('CareerProgression', False)
barplot('CareerProgression', False)


##plot_distribution('Technical', False)
barplot('Technical', False)

In [20]:
def plot_pie(var_select) :
    
    colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue', 'lightgrey', 'orange', 'white', 'lightpink']
    trace1 = go.Pie(values  = attrition[var_select].value_counts().values.tolist(),
                    labels  = attrition[var_select].value_counts().keys().tolist(),
                    textfont=dict(size=15), opacity = 0.8,
                    hoverinfo = "label+percent+name",
                    domain  = dict(x = [0,.48]),
                    name    = "attrition employes",
                    marker  = dict(colors = colors, line = dict(width = 1.5)))
    trace2 = go.Pie(values  = no_attrition[var_select].value_counts().values.tolist(),
                    labels  = no_attrition[var_select].value_counts().keys().tolist(),
                    textfont=dict(size=15), opacity = 0.8,
                    hoverinfo = "label+percent+name",
                    marker  = dict(colors = colors, line = dict(width = 1.5)),
                    domain  = dict(x = [.52,1]),
                    name    = "Non attrition employes" )

    layout = go.Layout(dict(title = var_select + " distribution in employes attrition ",
                            annotations = [dict(text = "Attrited",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .22, y = -0.1),
                                            dict(text = "Not_Attrited",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .8,y = -.1)]))
                                          

    fig  = go.Figure(data = [trace1,trace2],layout = layout)
    py.iplot(fig)

In [21]:
plot_pie("Gender")
barplot('Gender',True)

plot_pie('Age')
barplot('Age', True)

plot_pie('TotalWorkingYears')
barplot('TotalWorkingYears', True)

plot_pie('Department')
barplot('Department',True)

plot_pie('JobLevel')
barplot('JobLevel',True)

plot_pie('JobRole')
barplot('JobRole',True)

plot_pie('TotalWorkingYears')
barplot('TotalWorkingYears',True)

plot_pie('Education')
barplot('Education', True)

plot_pie('SpaChange')
barplot('SpaChange',True)

plot_pie('PocChange')
barplot('PocChange',True)

plot_pie('ShiftChange')
barplot('ShiftChange',True)

In [22]:
#data.SpaChange.replace(to_replace = dict(Yes = 1, No = 0), inplace = True)



#data["SpaChange"]=data["SpaChange"].replace({"Yes":1})
#data["SpaChange"]=data["SpaChange"].replace({"No":0})

In [23]:
#data.PocChange.replace(to_replace = dict(Yes = 1, No = 0), inplace = True)

In [24]:
#data.ShiftChange.replace(to_replace = dict(Yes = 1, No = 0), inplace = True)

In [25]:
# Feature Engineering and selection

# Feature engineering is the process of using domain knowledge to extract new variables 
# from raw data that make machine learning algorithms work.

# Feature Selection

# For any given dataset, many possible features can be chosen. 
# A crucial point to consider is which features to use. 
# There are an infinite number of transformations possible. 
# Even if we restrict ourselves to the space of common transformations for a given type of dataset, 
# we are still often left with thousands of possible features.

In [26]:
def Technical(data) :
    if data[' Technical' ] == 1 and data[' CareerProgression' ] == 0 :
        return 1
    else :
        return 0
    data[' Technical' ] = data. apply(lambda data:Technical(data) ,axis = 1)
    
def Young(data) : 
    if  data['Age'] > 24 and data['CareerProgression'] == 0 :
        return 1
    else : 
        return 0
data['Young'] = data.apply(lambda data:Young(data) ,axis = 1)

In [27]:
barplot('Technical', False)
barplot('Young', False)
# # barplot('ShortDisMale', False)
# # barplot('LongDisMale', False)
# barplot('LongDisJobLevel', False)
# barplot('LongDisCareerProgression', False)
# barplot('YoungNeverPromoted', False)

In [28]:
# Features encoding and scaling :-

# When you’re working with a learning model, it is important to scale the features to a range which is centered around zero. 
# This is done so that the variance of the features are in the same range.

# The aim here is to to achieve Gaussian with zero mean and unit variance. 
# There are many ways of doing this, two most popular are standardisation and normalisation.

# No matter which method you choose, the SciKit Learn library provides a class to easily scale our data. 
# We can use the StandardScaler class from the library for this. 
# Now that we know why we need to scale our features


In [29]:
# #customer id col
# Id_col     = ['EmployeeNumber']
# #Target columns
# target_col = ["Attrition"]
# #categorical columns
# cat_cols   = data.nunique()[data.nunique() < 10].keys().tolist()
# cat_cols   = [x for x in cat_cols if x not in target_col]
# #numerical columns
# num_cols   = [x for x in data.columns if x not in cat_cols + target_col + Id_col]
# #Binary columns with 2 values
# bin_cols   = data.nunique()[data.nunique() == 2].keys().tolist()
# #Columns more than 2 values
# multi_cols = [i for i in cat_cols if i not in bin_cols]

# #Label encoding Binary columns
# le = LabelEncoder()
# for i in bin_cols :
#     data[i] = le.fit_transform(data[i])
    
# #Duplicating columns for multi value columns
# data = pd.get_dummies(data = data,columns = multi_cols )

# #Scaling Numerical columns
# std = StandardScaler()
# scaled = std.fit_transform(data[num_cols])
# scaled = pd.DataFrame(scaled,columns=num_cols)

# #dropping original values merging scaled values for numerical columns
# df_data_og = data.copy()
# data = data.drop(columns = num_cols,axis = 1)
# data = data.merge(scaled,left_index=True,right_index=True,how = "left")
# data = data.drop(['EmployeeNumber'],axis = 1)

In [30]:
# Correlation Matrix

# A correlation matrix is a table showing correlation coefficients between sets of variables. 
# Each random variable (Xi) in the table is correlated with each of the other values in the table (Xj). 
# This allows you to see which pairs have the highest correlation.

In [31]:
#correlation
correlation = data.corr()
#tick labels
matrix_cols = correlation.columns.tolist()
#convert to array
corr_array  = np.array(correlation)

#Plotting
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale='Viridis',
                   colorbar   = dict() ,
                  )
layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                        autosize = False,
                        #height  = 1400,
                        #width   = 1600,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                     ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9)),
                       )
                  )
fig = go.Figure(data = [trace],layout = layout)
py.iplot(fig)

In [32]:
# Remove collinear features

In [33]:
# Threshold for removing correlated variables
threshold = 0.8

# Absolute value correlation matrix
corr_matrix = data.corr().abs()
corr_matrix.head()

# select Upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()

# Select columns with correlations above threshold

to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d columns to remove :' % (len(to_drop)))

data = data.drop(columns = to_drop)

to_drop

There are 0 columns to remove :


[]

In [34]:
# Define functions

In [35]:
# Define model performance plot

In [36]:
def model_performance_plot(model) : 
    
    # Confusion matrix
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    trace1 = go.Heatmap(z = conf_matrix  ,x = ["0 (pred)","1 (pred)"],
                        y = ["0 (true)","1 (true)"],xgap = 2, ygap = 2, 
                        colorscale = 'Viridis', showscale  = False)

    # Show metrics
    
    tp = conf_matrix[1,1]
    fn = conf_matrix[1,0]
    fp = conf_matrix[0,1]
    tn = conf_matrix[0,0]
    Accuracy  =  ((tp+tn)/(tp+tn+fp+fn))
    Precision =  (tp/(tp+fp))
    Recall    =  (tp/(tp+fn))
    F1_score  =  (2*(((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))))

    show_metrics = pd.DataFrame(data=[[Accuracy , Precision, Recall, F1_score]])
    show_metrics = show_metrics.T

    colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue']
    trace2 = go.Bar(x = (show_metrics[0].values), 
                   y = ['Accuracy', 'Precision', 'Recall', 'F1_score'], text = np.round_(show_metrics[0].values,4),
                    textposition = 'auto',
                   orientation = 'h', opacity = 0.8,marker=dict(
            color=colors,
            line=dict(color='#000000',width=1.5)))
    
    # Plot roc curve
    
    model_roc_auc = round(roc_auc_score(y_test, y_score) , 3)
    fpr, tpr, t = roc_curve(y_test, y_score)
    trace3 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : ",
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2), fill='tozeroy')
    trace4 = go.Scatter(x = [0,1],y = [0,1],
                        line = dict(color = ('black'),width = 1.5,
                        dash = 'dot'))
    
    # Precision-recall curve
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_score)
    trace5 = go.Scatter(x = recall, y = precision,
                        name = "Precision" + str(precision),
                        line = dict(color = ('lightcoral'),width = 2), fill='tozeroy')
    
    # Subplots
    
    fig = tls.make_subplots(rows=2, cols=2, print_grid=False, 
                        subplot_titles=('Confusion Matrix',
                                        'Metrics',
                                        'ROC curve'+" "+ '('+ str(model_roc_auc)+')',
                                        'Precision - Recall curve'))
    
    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)
    fig.append_trace(trace3,2,1)
    fig.append_trace(trace4,2,1)
    fig.append_trace(trace5,2,2)
    
    fig['layout'].update(showlegend = False, title = '<b>Model performance</b><br>'+str(model),
                        autosize = False, height = 900,width = 830,
                        plot_bgcolor = 'rgba(240,240,240, 0.95)',
                        paper_bgcolor = 'rgba(240,240,240, 0.95)',
                        margin = dict(b = 195))
    fig["layout"]["xaxis2"].update((dict(range=[0, 1])))
    fig["layout"]["xaxis3"].update(dict(title = "false positive rate"))
    fig["layout"]["yaxis3"].update(dict(title = "true positive rate"))
    fig["layout"]["xaxis4"].update(dict(title = "recall"), range = [0,1.05])
    fig["layout"]["yaxis4"].update(dict(title = "precision"), range = [0,1.05])
    fig.layout.titlefont.size = 14
    
    py.iplot(fig)

In [37]:
# Define feature importance plot

In [38]:
def features_imp(model, cf) : 

    coefficients  = pd.DataFrame(model.feature_importances_)
    column_data     = pd.DataFrame(list(data))
    coef_sumry    = (pd.merge(coefficients,column_data,left_index= True,
                              right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry    = coef_sumry.sort_values(by = "coefficients",ascending = False)
    coef_sumry = coef_sumry[coef_sumry["coefficients"] !=0]
    trace = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients",
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Viridis",
                                  line = dict(width = .6,color = "black")))
    layout = dict(title =  'Feature Importances xgb_cfl')
                    
    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)

In [39]:
# Define cumulative gains curve

In [40]:
#cumulative gain curve
def cum_gains_curve(model):
    pos = pd.get_dummies(y_test).as_matrix()
    pos = pos[:,1] 
    npos = np.sum(pos)
    index = np.argsort(y_score) 
    index = index[::-1] 
    sort_pos = pos[index]
    #cumulative sum
    cpos = np.cumsum(sort_pos) 
    #recall
    recall = cpos/npos 
    #size obs test
    n = y_test.shape[0] 
    size = np.arange(start=1,stop=369,step=1) 
    #proportion
    size = size / n 
    #plots
    model = 'xgb_cfl'
    trace1 = go.Scatter(x = size,y = recall,
                        name = "Lift curve",
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2))
    trace2 = go.Scatter(x = size,y = size,
                        name = "Baseline",
                        showlegend=False,
                        line = dict(color = ('black'),width = 1.5,
                        dash = 'dot'))

    layout = dict(title = 'Cumulative gains curve'+' '+str(model),
                  yaxis = dict(title = 'Percentage positive targeted',zeroline = False),
                  xaxis = dict(title = 'Percentage contacted', zeroline = False)
                 )

    fig  = go.Figure(data = [trace1,trace2], layout = layout)
    py.iplot(fig)

In [41]:
# Define cross validation metrics

In [42]:
# Cross val metric
def cross_val_metrics(model) :
    scores = ['accuracy', 'precision', 'recall']
    for sc in scores:
        scores = cross_val_score(model, X, y, cv = 5, scoring = sc)
        print('[%s] : %0.5f (+/- %0.5f)'%(sc, scores.mean(), scores.std()))

In [43]:
# Prepare dataset

In [44]:
#  Define (X, y)

In [45]:
# Def X and Y
y = np.array(data.Attrition.tolist())
data = data.drop('Attrition', 1)
X = np.array(data.as_matrix())

AttributeError: 'DataFrame' object has no attribute 'as_matrix'

In [None]:
# Train test split

In [None]:
# Train_test split
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = random_state)

In [None]:
#  XGBoost - RandomizedSearchCV to optimize hyperparameters: XGBoost(Extreme Gradient Boosting)

# Fitting xgboost to training set

In [None]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
        
xgb_cfl = xgb.XGBClassifier(n_jobs = -1)


# A parameter grid for XGBoost
params = {
        'n_estimators' : [100, 200, 500, 750],
        'learning_rate' : [0.01, 0.02, 0.05, 0.1, 0.25],
        'min_child_weight': [1, 5, 7, 10],
        'gamma': [0.1, 0.5, 1, 1.5, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 10, 12]
        }

folds = 5
param_comb = 800

random_search = RandomizedSearchCV(xgb_cfl, param_distributions=params, n_iter=param_comb, scoring='accuracy', n_jobs=-1, cv=5, verbose=3, random_state=42)

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
#----------------------------# random_search.fit(X, y)
timer(start_time) # timing ends here for "start_time" variable

In [None]:
# XGBoost - Modeling with best hyperparameters = 89.11

In [None]:
# XGBoost - Modeling and performance plot

In [None]:
# xgb 
xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                           colsample_bytree=0.8, gamma=1.5, learning_rate=0.05,
                           max_delta_step=0, max_depth=3, min_child_weight=7, missing=None,
                           n_estimators=200, n_jobs=-1, nthread=None,
                           objective='binary:logistic', random_state=0, reg_alpha=0,
                           reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
                           subsample=0.6)

xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
y_score = xgb_clf.predict_proba(X_test)[:,1]

model_performance_plot('xgb_clf')

In [None]:
# XGBoost - Feature importance :- 

#     Generally, importance provides a score that indicates how useful 
#     or valuable each feature was in the construction of the boosted decision trees within the model.
#     The more an attribute is used to make key decisions with decision trees, 
#     the higher its relative importance.
    
    

In [None]:
features_imp(xgb_clf, 'features')

In [None]:
#feature importance plot TOP 40
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
def plot_feature_importance(model):
    tmp = pd.DataFrame({'Feature': list(data), 'Feature importance': model.feature_importances_})
    tmp = tmp.sort_values(by='Feature importance',ascending=False).head(15)
    plt.figure(figsize = (10,12))
    plt.title('Top 15 - Features importance - XGBoost',fontsize=14)
    s = sns.barplot(y='Feature',x='Feature importance',data=tmp, orient='h')
    s.set_xticklabels(s.get_xticklabels(),rotation=90)
    plt.show()

In [None]:
plot_feature_importance(xgb_clf)

In [None]:
# XGBoost - Cumulative gain curve

In [None]:
cum_gains_curve(xgb_clf)

In [None]:
# XGBoost - Cross validation (5 folds)

In [None]:
# Cross val score
cross_val_metrics(xgb_clf)