In [None]:
###################
# LIBRARIES
###################

import joblib
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

In [None]:
# NOTE(review): this forces the Qt5Agg GUI backend, while later plotting cells
# switch to `%matplotlib inline`; confirm a Qt binding is installed or that this
# line is still wanted.
matplotlib.use("Qt5Agg")
# Show every column and every row when a frame is printed, using a 170-character wide console.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 170)
pd.set_option('display.max_rows', None)
# Render floats with exactly three decimals.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, RandomizedSearchCV, validation_curve, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, plot_confusion_matrix


In [None]:
###################
# FUNCTIONS
###################


def standart_scaler(col_name):
    """Standardise a column: subtract its mean and divide by its standard deviation.

    Parameters
    ----------
    col_name : object exposing ``mean()`` and ``std()`` and supporting arithmetic
        (e.g. a pandas Series). Despite the name this is the data, not a label.

    Returns
    -------
    The centred and scaled values, same container type as the input.
    """
    centred = col_name - col_name.mean()
    spread = col_name.std()
    return centred / spread

def check_df(dataframe, head=5):
    """Print a quick overview of a DataFrame.

    Sections are printed in this order: shape, dtypes, first ``head`` rows,
    last ``head`` rows, null counts per column and selected quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    head : int, default 5
        Number of rows shown in the head and tail sections.
    """
    # Each section is computed lazily, right before it is printed, so a failure
    # in a later section does not suppress the earlier output.
    sections = (
        ("##################### Shape #####################", lambda: dataframe.shape),
        ("##################### Types #####################", lambda: dataframe.dtypes),
        ("##################### Head #####################", lambda: dataframe.head(head)),
        ("##################### Tail #####################", lambda: dataframe.tail(head)),
        ("##################### NA #####################", lambda: dataframe.isnull().sum()),
        ("##################### Quantiles #####################",
         lambda: dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T),
    )
    for banner, compute in sections:
        print(banner)
        print(compute())

def cat_summary(dataframe, col_name, plot=False):
    """Print the level counts and percentage share of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column. A boolean column is cast to int **in place**,
        so the caller's frame is modified.
    col_name : str
        Column to summarise.
    plot : bool, default False
        When True, also draw a seaborn count plot and block until it is closed.
    """
    # Boolean columns are converted to 0/1 on the caller's frame before counting.
    if dataframe[col_name].dtypes == "bool":
        dataframe[col_name] = dataframe[col_name].astype(int)

    level_counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: level_counts,
                            "Ratio": 100 * level_counts / len(dataframe)})
    print(summary)
    print("##########################################")

    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show(block=True)

def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of a numeric column at a fixed set of quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    numerical_col : str
        Column to describe.
    plot : bool, default False
        When True, also draw a histogram and block until the window is closed.
    """
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    description = dataframe[numerical_col].describe(quantiles)
    print(description.T)

    if not plot:
        return
    dataframe[numerical_col].hist()
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

def grab_col_names(dataframe, cat_th=10,  car_th=20):
    """
    Veri setindeki kategorik, numerik ve kategorik fakat kardinal değişkenlerin isimlerini verir.

    Parameters
    ----------
    dataframe: dataframe
        değişken isimleri alınmak istenen dataframe'dir.
    cat_th: int, float
        numerik fakat kategorik olan değişkenler için sınıf eşik değeri
    car_th: int, float
        kategorik fakat kardinal değişkenler için sınıf eşik değeri

    Returns
    -------
    cat_cols: list
        Kategorik değişken listesi
    num_cols: list
        Numerik değişken listesi
    cat_but_car: list
        Kategorik görünümlü kardinal değişken listesi

    Notes
    ------
    cat_cols + num_cols + cat_but_car = toplam değişken sayısı
    num_but_cat cat_cols'un içerisinde.

    """
    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["category", "object", "bool"]]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < 10 and dataframe[col].dtypes in ["int", "float"]]

    cat_but_car = [col for col in dataframe.columns if
                   dataframe[col].nunique() > 20 and str(dataframe[col].dtypes) in ["category", "object"]]

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes in ["int", "float"]]
    num_cols = [col for col in num_cols if col not in cat_cols]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

def target_summary_with_cat(dataframe, target, categorical_col):
    """Print, per level of a categorical column, the target mean, row count and percentage share.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    target : str
        Column whose mean is reported per level.
    categorical_col : str
        Column whose levels define the groups.
    """
    level_counts = dataframe[categorical_col].value_counts()
    target_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": target_means,
                            "Count": level_counts,
                            "Ratio": 100 * level_counts / len(dataframe)})
    print(summary, end="\n\n\n")

def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of a numeric column for each value of the target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    target : str
        Column used as the grouping key.
    numerical_col : str
        Column whose per-group mean is printed.
    """
    grouped = dataframe.groupby(target)
    means_by_target = grouped.agg({numerical_col: "mean"})
    print(means_by_target, end="\n\n\n")

def high_correlated_cols(dataframe, plot=False, corr_th=0.90):
    """List columns whose absolute correlation with an earlier column exceeds a threshold.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. Only numeric and boolean columns take part; other columns
        are ignored.
    plot : bool, default False
        When True, draw a heatmap of the (signed) correlation matrix.
    corr_th : float, default 0.90
        Absolute-correlation threshold above which a column is reported.

    Returns
    -------
    list
        Column names that are candidates for dropping.
    """
    # Restrict to numeric/bool columns explicitly: recent pandas versions raise
    # when corr() meets a string column instead of silently skipping it.
    numeric_frame = dataframe.select_dtypes(include=["number", "bool"])
    corr = numeric_frame.corr()
    cor_matrix = corr.abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is compared only with the columns that precede it.
    upper_triangle_matrix = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    drop_list = [col for col in upper_triangle_matrix.columns
                 if (upper_triangle_matrix[col] > corr_th).any()]
    if plot:
        sns.set(rc={'figure.figsize': (15, 15)})
        sns.heatmap(corr, cmap="RdBu")
        plt.show()
    return drop_list

def outlier_thresholds(dataframe, variable, q1=0.01, q3=0.99):
    """Compute lower/upper outlier limits from two quantiles of a column.

    The limits are ``lower_quantile - 1.5 * range`` and
    ``upper_quantile + 1.5 * range`` where ``range`` is the distance between
    the two quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    variable : str
        Column to analyse.
    q1 : float, default 0.01
        Lower quantile (use 0.25 for the classic IQR rule).
    q3 : float, default 0.99
        Upper quantile (use 0.75 for the classic IQR rule).

    Returns
    -------
    (low_limit, up_limit) : tuple of float
    """
    lower_quantile = dataframe[variable].quantile(q1)
    upper_quantile = dataframe[variable].quantile(q3)
    quantile_range = upper_quantile - lower_quantile
    up_limit = upper_quantile + 1.5 * quantile_range
    low_limit = lower_quantile - 1.5 * quantile_range
    return low_limit, up_limit

def check_outlier(dataframe, col_name):
    """Report whether a column holds any value outside its outlier limits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column to check; limits come from ``outlier_thresholds``.

    Returns
    -------
    bool
        True when at least one row lies above the upper or below the lower limit.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    outlier_mask = (dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)
    # Test the mask itself. Checking the truthiness of the *values* in the
    # matching rows would miss outlier rows whose cells are all 0/False/empty.
    return bool(outlier_mask.any())

def grab_outliers(dataframe, col_name, index=False):
    """Print the rows whose value in ``col_name`` is outside the outlier limits.

    When more than ten rows qualify only the first five are printed; otherwise
    all of them are.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column checked against the limits from ``outlier_thresholds``.
    index : bool, default False
        When True, return the index of the outlier rows.

    Returns
    -------
    pandas.Index or None
        The outlier index if ``index`` is True, otherwise None.
    """
    low, up = outlier_thresholds(dataframe, col_name)
    is_outlier = (dataframe[col_name] < low) | (dataframe[col_name] > up)
    outlier_rows = dataframe[is_outlier]

    preview = outlier_rows.head() if outlier_rows.shape[0] > 10 else outlier_rows
    print(preview)

    if index:
        return outlier_rows.index

def replace_with_thresholds(dataframe, variable):
    """Cap a column at its outlier limits, in place.

    Values below the lower limit are replaced by the lower limit rounded to
    zero decimals; values above the upper limit by the rounded upper limit.
    The caller's frame is modified and nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    variable : str
        Column to cap; limits come from ``outlier_thresholds``.
    """
    lower, upper = outlier_thresholds(dataframe, variable)

    too_low = dataframe[variable] < lower
    dataframe.loc[too_low, variable] = round(lower, 0)

    # Evaluated after the lower cap has been written, as in the sequential form.
    too_high = dataframe[variable] > upper
    dataframe.loc[too_high, variable] = round(upper, 0)

def remove_outlier(dataframe, col_name):
    """Return the rows whose value in ``col_name`` is not outside the outlier limits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column checked against the limits from ``outlier_thresholds``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the input is not modified.
    """
    lower, upper = outlier_thresholds(dataframe, col_name)
    is_outlier = (dataframe[col_name] < lower) | (dataframe[col_name] > upper)
    # Negating the outlier mask (rather than testing "within limits") keeps
    # rows whose value is missing, because NaN compares False on both sides.
    return dataframe[~is_outlier]

def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values for every column that has any.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    na_name : bool, default False
        When True, return the names of the columns containing missing values.

    Returns
    -------
    list or None
        Column names with missing values (in frame order) if ``na_name`` is
        True, otherwise None.
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

def missing_vs_target(dataframe, target, na_columns):
    """Compare the target between rows where a column is missing and rows where it is present.

    For each column in ``na_columns`` a 0/1 ``<col>_NA_FLAG`` indicator is built
    on a copy of the frame; then, for every column whose name contains
    ``"_NA_"``, the target mean and count per flag value are printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Not modified.
    target : str
        Target column.
    na_columns : iterable of str
        Columns to flag.
    """
    flagged = dataframe.copy()
    for col in na_columns:
        flagged[col + '_NA_FLAG'] = np.where(flagged[col].isnull(), 1, 0)

    # Any column containing "_NA_" is reported, not only the flags created above.
    flag_columns = flagged.columns[flagged.columns.str.contains("_NA_")]
    for flag in flag_columns:
        by_flag = flagged.groupby(flag)[target]
        report = pd.DataFrame({"TARGET_MEAN": by_flag.mean(),
                               "Count": by_flag.count()})
        print(report, end="\n\n\n")

def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """One-hot encode the given columns with ``pandas.get_dummies``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Not modified; a new frame is returned.
    categorical_cols : list of str
        Columns to encode.
    drop_first : bool, default True
        Drop the first level of each encoded column.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

def rare_analyser(dataframe, target, cat_cols):
    """Print, for each categorical column, its number of levels and a per-level table.

    The table holds the level count, its share of all rows and the target mean.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    target : str
        Target column.
    cat_cols : iterable of str
        Categorical columns to analyse.
    """
    n_rows = len(dataframe)
    for col in cat_cols:
        level_counts = dataframe[col].value_counts()
        print(col, ":", len(level_counts))
        table = pd.DataFrame({"COUNT": level_counts,
                              "RATIO": level_counts / n_rows,
                              "TARGET_MEAN": dataframe.groupby(col)[target].mean()})
        print(table, end="\n\n\n")

def rare_encoder(dataframe, rare_perc):
    """Replace infrequent levels of object columns with the label ``'Rare'``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Not modified; the work is done on a copy.
    rare_perc : float
        Levels whose share of rows is strictly below this value are merged.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with rare levels relabelled.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)

    def _level_share(col):
        # Share of rows held by each level of the column
        return encoded[col].value_counts() / n_rows

    # Only object-dtype columns with at least one rare level are touched.
    rare_columns = [col for col in encoded.columns
                    if encoded[col].dtypes == 'O' and (_level_share(col) < rare_perc).any()]

    for var in rare_columns:
        share = _level_share(var)
        rare_labels = share[share < rare_perc].index
        encoded[var] = np.where(encoded[var].isin(rare_labels), 'Rare', encoded[var])

    return encoded

def check_skew(df_skew, column):
    """Plot the distribution of a column and print its skewness and skew-test result.

    Parameters
    ----------
    df_skew : pandas.DataFrame
    column : str
        Column to analyse.

    Returns
    -------
    None
    """
    # `stats` is not imported anywhere at file level, so bring it in here;
    # without this the function fails with NameError on first use.
    from scipy import stats

    skew = stats.skew(df_skew[column])
    skewtest = stats.skewtest(df_skew[column])
    plt.title('Distribution of ' + column)
    sns.distplot(df_skew[column], color="g")
    print("{}'s: Skew: {}, : {}".format(column, skew, skewtest))
    return


In [None]:
## Variable Information ##

# customerID : Customer ID
# Gender : Gender
# SeniorCitizen : Whether the customer is a senior citizen (1, 0)
# Partner : Whether the customer has a partner (Yes, No), i.e. whether they are married
# Dependents : Whether the customer has dependents (Yes, No) (child, mother, father, grandmother)
# tenure : Number of months the customer has stayed with the company
# PhoneService : Whether the customer has phone service (Yes, No)
# MultipleLines : Whether the customer has multiple lines (Yes, No, No phone service)
# InternetService : The customer's internet service provider (DSL, Fiber optic, No)
# OnlineSecurity : Whether the customer has online security (Yes, No, No internet service)
# OnlineBackup : Whether the customer has online backup (Yes, No, No internet service)
# DeviceProtection : Whether the customer has device protection (Yes, No, No internet service)
# TechSupport : Whether the customer receives tech support (Yes, No, No internet service)
# StreamingTV : Whether the customer has streaming TV (Yes, No, No internet service); indicates whether the customer uses the internet service to stream TV programmes from a third-party provider
# StreamingMovies : Whether the customer has streaming movies (Yes, No, No internet service); indicates whether the customer uses the internet service to stream movies from a third-party provider
# Contract : The customer's contract term (Month-to-month, One year, Two year)
# PaperlessBilling : Whether the customer has paperless billing (Yes, No)
# PaymentMethod : The customer's payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
# MonthlyCharges : Amount charged to the customer monthly
# TotalCharges : Total amount charged to the customer
# Churn : Whether the customer churned (Yes or No) - customers who left within the last month or quarter

In [None]:
# Reading data
# The CSV is resolved relative to the kernel's working directory.

telco = pd.read_csv("Telco-Customer-Churn.csv")

# Work on a copy so the raw frame stays available as `telco`.
df = telco.copy()

In [None]:
# Overview: shape, dtypes, head/tail, null counts, quantiles.
check_df(df)

In [None]:
## TotalCharges should be numeric
# errors="coerce" turns values that cannot be parsed into NaN instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [None]:
## There are only 11 NA values, so it is OK to drop them.
# NOTE: in-place; re-running the cells above is required to get the dropped rows back.
df.dropna(inplace=True)

In [None]:
# Encode the target: "Yes" -> 1, anything else -> 0.
df["Churn"] = df["Churn"].apply(lambda x: 1 if x == "Yes" else 0)

In [None]:
## Dealing with the imbalanced dataset:
# Bootstrap (sample with replacement) the churned training rows until both
# classes have the same number of rows in the training set.
## To avoid data leakage, only the training split is resampled; the test split is left untouched.

train, test = train_test_split(df, test_size=0.2, stratify=df.Churn, random_state=26)
# Detach the test split from `df` so the feature columns added to it later
# are written to an independent frame rather than to a slice.
test = test.copy()

train_churned = train.loc[train["Churn"] == 1, :]

# Derive the number of extra rows from the actual class counts of this split
# instead of hard-coding them, so the cell stays correct if the data,
# test_size or random_state change.
n_extra = max(int((train["Churn"] == 0).sum()) - len(train_churned), 0)

sampled_within = train_churned.sample(n=n_extra, replace=True, random_state=26)

# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
train_new = pd.concat([train, sampled_within], ignore_index=True)

In [None]:
## Defining a function to create new column

def pay_auto(x):
    """Return 1 when the payment method is one of the two automatic ones, else 0.

    Parameters
    ----------
    x : str
        Payment method label.

    Returns
    -------
    int
        1 for "Bank transfer (automatic)" or "Credit card (automatic)", 0 otherwise.
    """
    automatic_methods = ("Bank transfer (automatic)", "Credit card (automatic)")
    return 1 if x in automatic_methods else 0

In [None]:
## Is payment auto?
# 1 for the two "(automatic)" payment methods, 0 otherwise (see pay_auto).
train_new["IsPayAuto"] = train_new.loc[:, "PaymentMethod"].apply(lambda x: pay_auto(x))

In [None]:
## Number of services
# Counts, per row, how many of the listed columns equal 'Yes'.
# NOTE(review): per the variable notes InternetService takes DSL / Fiber optic / No,
# so it can never equal 'Yes' and does not contribute to this count - confirm
# whether `!= 'No'` was intended. Any change must be mirrored in the test-set cell.
train_new["TotalServices"] = (train_new[['PhoneService', 'InternetService', 'OnlineSecurity',
                                       'OnlineBackup', 'DeviceProtection', 'TechSupport',
                                       'StreamingTV', 'StreamingMovies']]== 'Yes').sum(axis=1)

In [None]:
## Any protection?
# The flag is 1 when at least one of the three services is not "Yes"; it is 0
# only when the customer has all three.
# NOTE(review): the name suggests "no protection at all", which would need `and` - confirm.
train_new["NEW_noProt"] = train_new.apply(lambda x: 1 if (x["OnlineBackup"] != "Yes") or (x["DeviceProtection"] != "Yes") or (x["TechSupport"] != "Yes") else 0, axis=1)

In [None]:
## Dropping the redundant column
# PhoneService must be dropped only after TotalServices has been computed from it.

train_new.drop("PhoneService", axis=1, inplace=True)

In [None]:
# Only cat_but_car_ is used below; the thresholds are the function defaults.
cat_, num_, cat_but_car_ = grab_col_names(train_new)

In [None]:
# Object-dtype columns that are not high-cardinality.
cat_cols = [col for col in train_new if train_new[col].dtypes == "object" and col not in cat_but_car_]

In [None]:
# Columns encoded without drop_first so their "No internet service" level can be removed by hand below.
manual_cols = ["StreamingMovies", "StreamingTV", "TechSupport", "DeviceProtection", "OnlineBackup", "OnlineSecurity", "InternetService"]

In [None]:
# First pass: remaining categoricals with drop_first=True; second pass: manual_cols keeping every level.
train_new = one_hot_encoder(train_new, [col for col in train_new if str(col) not in manual_cols and str(col) in cat_cols], True)
train_new = one_hot_encoder(train_new, manual_cols, False)

In [None]:
# Remove every "<col>_No internet service" dummy and the "InternetService_No" dummy.
train_new.drop([col for col in train_new if str(col).endswith("No internet service")], axis=1, inplace=True)
train_new.drop("InternetService_No", axis=1, inplace=True)

In [None]:
# Same feature engineering as applied to train_new, repeated on the test split.
test["IsPayAuto"] = test.loc[:, "PaymentMethod"].apply(lambda x: pay_auto(x))
# NOTE(review): InternetService takes DSL / Fiber optic / No per the variable notes,
# so it never equals 'Yes' here - keep this in sync with the training-set cell.
test["TotalServices"] = (test[['PhoneService', 'InternetService', 'OnlineSecurity',
                                       'OnlineBackup', 'DeviceProtection', 'TechSupport',
                                       'StreamingTV', 'StreamingMovies']]== 'Yes').sum(axis=1)
# 1 when at least one of the three services is not "Yes" (same rule as on train_new).
test["NEW_noProt"] = test.apply(lambda x: 1 if (x["OnlineBackup"] != "Yes") or (x["DeviceProtection"] != "Yes") or (x["TechSupport"] != "Yes") else 0, axis=1)
test.drop("PhoneService", axis=1, inplace=True)

In [None]:
# Overwrites cat_/num_/cat_but_car_/cat_cols computed earlier for the training frame.
cat_, num_, cat_but_car_ = grab_col_names(test)
cat_cols = [col for col in test if test[col].dtypes == "object" and col not in cat_but_car_]

In [None]:
# NOTE(review): the test split is encoded independently of train_new, so the dummy
# columns match only if both splits contain the same category levels - verify.
test = one_hot_encoder(test, [col for col in test if str(col) not in manual_cols and str(col) in cat_cols], True)
test = one_hot_encoder(test, manual_cols, False)

test.drop([col for col in test if str(col).endswith("No internet service")], axis=1, inplace=True)
test.drop("InternetService_No", axis=1, inplace=True)

In [None]:
# Features / target for training: drop the target and the identifier column.
X = train_new.drop(["Churn", "customerID"], axis=1)
y = train_new["Churn"]
# The two splits were one-hot encoded independently, so force the test matrix
# onto the training columns (same set, same order). A dummy absent from the
# test split is filled with 0; a dummy unseen in training is discarded.
X_test = test.drop(["Churn", "customerID"], axis=1).reindex(columns=X.columns, fill_value=0)
y_test = test["Churn"]

In [None]:
## Logistic Regression
# Baseline model fitted on the full (oversampled) training matrix.

lr_model = LogisticRegression(random_state=26, max_iter=1000, class_weight="balanced").fit(X, y)

In [None]:
# cross_validate refits clones of the estimator on each fold; the fitted lr_model itself is unchanged.
# NOTE(review): X contains bootstrapped duplicates of churned rows, so the same
# customer can land in both the training and validation part of a fold -
# these scores are presumably optimistic; confirm against the test metrics.
cv_results = cross_validate(lr_model,
                           X, y,
                           cv=5,
                           scoring=["f1", "accuracy", "precision", "recall", "roc_auc"])

In [None]:
# Mean of each metric over the 5 folds, printed in the order: F1, accuracy, precision, recall, ROC AUC.
print(cv_results["test_f1"].mean())
print(cv_results['test_accuracy'].mean())
print(cv_results['test_precision'].mean())
print(cv_results['test_recall'].mean())
print(cv_results["test_roc_auc"].mean())

In [None]:
# Hard class predictions on the held-out test split.
y_pred = lr_model.predict(X_test)

In [None]:
# Test-set metrics for lr_model.
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
# With the arguments reversed, recall and precision are swapped.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 3)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 3)}")
print(f"F1: {round(f1_score(y_test, y_pred), 3)}")
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than with the already-thresholded labels.
print(f"Auc: {round(roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1]), 3)}")

In [None]:
# Switch to the inline backend so the figure renders in the notebook.
%matplotlib inline
fig = plt.figure(figsize=(10, 10))
g = fig.add_subplot(1,1,1)
# Confusion matrix of lr_model on the test split, drawn on the axes created above.
# NOTE(review): plot_confusion_matrix is only available in older scikit-learn releases - confirm the pinned version.
plot_confusion_matrix(lr_model, X_test, y_test, ax=g)

In [None]:
# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the current y_pred (ground truth passed first).
report = classification_report(y_test, y_pred)
print(report)

In [None]:
## To take a different approach towards the imbalance of the data with respect to churn
## we may change the class_weight accordingly and search for the value that maximises the F1 score

lr = LogisticRegression(max_iter=1000, random_state=26)

# Candidate weights for class 0; class 1 always receives the complement (1 - x)
weights = np.linspace(0.0,0.99,200)

# One grid entry per candidate weight pair
param_grid = {'class_weight': [{0:x, 1:1.0-x} for x in weights]}

# Fit the grid search on the training data (StratifiedKFold default: 5 folds), scored by F1
gridsearch = GridSearchCV(estimator=lr,
                          param_grid=param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring='f1',
                          verbose=2).fit(X, y)

# Plot the mean CV score against the weight given to class 1.
# cv_results_ rows follow the order of param_grid, so they line up with `weights`.
plt.figure(figsize=(12,8))
weigh_data = pd.DataFrame({ 'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1- weights)})
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x=weigh_data['weight'], y=weigh_data['score'])
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i/10,1) for i in range(0,11,1)])
plt.title('Scoring for different class weights', fontsize=24)

In [None]:
## Or, we can change class_weights manually and give more importance to churn
## depending on the business approach

# Train a model that weights the churn class (1) three times as much as class 0
lr = LogisticRegression(max_iter=1000, random_state=26, class_weight={0: 0.25, 1: 0.75})
lr.fit(X, y)

# Predicting on the test data
# NOTE(review): pred_test is not referenced again in the visible cells.
pred_test = lr.predict(X_test)

%matplotlib inline

fig = plt.figure(figsize=(10, 10))
g = fig.add_subplot(1,1,1)
# Confusion matrix of the manually weighted model on the test split.
plot_confusion_matrix(lr, X_test, y_test, ax=g)

In [None]:
# Number of columns in the encoded training frame (still includes Churn and customerID).
len(train_new.columns)

In [None]:
## LGBM
# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from lightgbm import LGBMClassifier

# Baseline LightGBM with default hyperparameters and balanced class weights.
lgbm = LGBMClassifier(random_state=26, class_weight="balanced").fit(X,y)

In [None]:
# Overwrites the cv_results of the logistic regression.
cv_results = cross_validate(lgbm, X, y, cv=5, scoring=["f1", "recall", "precision", "accuracy", "roc_auc"])

In [None]:
# Fold means, printed in the order: F1, accuracy, precision, recall, ROC AUC.
print(cv_results["test_f1"].mean())
print(cv_results['test_accuracy'].mean())
print(cv_results['test_precision'].mean())
print(cv_results['test_recall'].mean())
print(cv_results["test_roc_auc"].mean())

In [None]:
# Overwrites y_pred with the baseline LightGBM test predictions.
y_pred = lgbm.predict(X_test)

In [None]:
# Test-set metrics for the baseline LightGBM model (lgbm).
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
# With the arguments reversed, recall and precision are swapped.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 3)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 3)}")
print(f"F1: {round(f1_score(y_test, y_pred), 3)}")
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than with the already-thresholded labels.
print(f"Auc: {round(roc_auc_score(y_test, lgbm.predict_proba(X_test)[:, 1]), 3)}")

In [None]:
%matplotlib inline
fig = plt.figure(figsize=(10, 10))
g = fig.add_subplot(1,1,1)
# Confusion matrix of the baseline LightGBM model on the test split.
plot_confusion_matrix(lgbm, X_test, y_test, ax=g)

In [None]:
# Repeated import; classification_report is already imported in an earlier cell.
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Hyperparameter grid: 3 x 4 x 4 = 48 combinations.
lgbm_params = {"learning_rate": [0.01, 0.1, 0.2],
               "n_estimators": [2500, 3000, 3500, 4000],
               "colsample_bytree": [0.3, 0.5, 0.7, 1]}

# Smaller n_estimators values tried earlier: 300, 500, 800, 1000, 1500, 2000

In [None]:
# NOTE(review): no `scoring` is given, so the search uses the estimator's default
# score (accuracy for classifiers) although the rest of the notebook focuses on F1 - confirm.
# NOTE(review): 48 candidates x 5 folds with thousands of trees each; expect a long run.
lgbm_best_grid = GridSearchCV(lgbm,
                              lgbm_params,
                              cv=5,
                              n_jobs=-1,
                              verbose=1).fit(X, y)

In [None]:
lgbm_best_grid.best_params_

In [None]:
# A fresh estimator is built with the best parameters, so the baseline `lgbm` stays untouched.
lgbm_final = LGBMClassifier(**lgbm_best_grid.best_params_, random_state=26, class_weight="balanced").fit(X, y)

In [None]:
# Overwrites y_pred with the tuned LightGBM test predictions.
y_pred = lgbm_final.predict(X_test)

In [None]:
# Test-set metrics for the tuned LightGBM model (lgbm_final).
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
# With the arguments reversed, recall and precision are swapped.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 3)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 3)}")
print(f"F1: {round(f1_score(y_test, y_pred), 3)}")
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than with the already-thresholded labels.
print(f"Auc: {round(roc_auc_score(y_test, lgbm_final.predict_proba(X_test)[:, 1]), 3)}")

In [None]:
%matplotlib inline
fig = plt.figure(figsize=(10, 10))
g = fig.add_subplot(1,1,1)
# Confusion matrix of the tuned LightGBM model on the test split.
plot_confusion_matrix(lgbm_final, X_test, y_test, ax=g)

In [None]:
# Repeated import; classification_report is already imported in an earlier cell.
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

In [None]:
#CatBoost
# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from catboost import CatBoostClassifier

# Baseline CatBoost with default hyperparameters and balanced class weights.
cb_model = CatBoostClassifier(random_state=26, verbose=False, auto_class_weights="Balanced").fit(X, y)

In [None]:
# Overwrites the cv_results of the previous model.
cv_results = cross_validate(cb_model, X, y, cv=5, scoring=["f1", "recall", "precision", "accuracy", "roc_auc"])

In [None]:
# Fold means, printed in the order: F1, accuracy, precision, recall, ROC AUC.
print(cv_results["test_f1"].mean())
print(cv_results['test_accuracy'].mean())
print(cv_results['test_precision'].mean())
print(cv_results['test_recall'].mean())
print(cv_results["test_roc_auc"].mean())

In [None]:
# Overwrites y_pred with the baseline CatBoost test predictions.
y_pred = cb_model.predict(X_test)

In [None]:
# Test-set metrics for the baseline CatBoost model (cb_model).
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
# With the arguments reversed, recall and precision are swapped.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 3)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 3)}")
print(f"F1: {round(f1_score(y_test, y_pred), 3)}")
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than with the already-thresholded labels.
print(f"Auc: {round(roc_auc_score(y_test, cb_model.predict_proba(X_test)[:, 1]), 3)}")

In [None]:
# Hyperparameter grid: 3 x 3 x 5 = 45 combinations.
cb_params = {"iterations": [500, 1000, 1500],
             "learning_rate": [0.01, 0.1, 0.2],
             "depth": [3, 6, 8, 10, 15]}

In [None]:
# NOTE(review): no `scoring` is given, so the search uses the estimator's default
# score rather than F1 - confirm this is intended.
# NOTE(review): no n_jobs here, unlike the LightGBM search; deep trees (depth 10-15) are presumably slow.
cb_best_grid = GridSearchCV(cb_model,
                            cb_params,
                            cv=3,
                            verbose=1).fit(X, y)

In [None]:
# Build a NEW estimator with the best parameters, as is done for lgbm_final.
# set_params() mutates and returns the same object, so calling it on cb_model
# would make cb_final and cb_model one model and the baseline would be lost,
# leaving the two confusion matrices plotted below identical.
cb_final = CatBoostClassifier(**cb_best_grid.best_params_, random_state=26, verbose=False, auto_class_weights="Balanced").fit(X, y)

In [None]:
# Overwrites y_pred with the tuned CatBoost test predictions.
y_pred = cb_final.predict(X_test)

In [None]:
# Test-set metrics for the tuned CatBoost model (cb_final).
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
# With the arguments reversed, recall and precision are swapped.
print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 3)}")
print(f"Recall: {round(recall_score(y_test, y_pred), 3)}")
print(f"Precision: {round(precision_score(y_test, y_pred), 3)}")
print(f"F1: {round(f1_score(y_test, y_pred), 3)}")
# ROC AUC is a ranking metric: score it with the positive-class probability
# rather than with the already-thresholded labels.
print(f"Auc: {round(roc_auc_score(y_test, cb_final.predict_proba(X_test)[:, 1]), 3)}")

In [None]:
%matplotlib inline
fig = plt.figure(figsize=(10, 10))
g = fig.add_subplot(1,1,1)
# Confusion matrix for cb_model on the test split.
# NOTE(review): whether cb_model still represents the untuned baseline at this
# point depends on how cb_final was built above - confirm they are distinct objects.
plot_confusion_matrix(cb_model, X_test, y_test, ax=g)

In [None]:
%matplotlib inline
fig = plt.figure(figsize=(10, 10))
g = fig.add_subplot(1,1,1)
# Confusion matrix for the tuned model cb_final on the test split.
plot_confusion_matrix(cb_final, X_test, y_test, ax=g)

In [None]:
# Repeated import; classification_report is already imported in an earlier cell.
from sklearn.metrics import classification_report
# y_pred currently holds the cb_final predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    features : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.
    num : int or None, default None
        Number of top features to show; None shows all of them. (The default
        no longer depends on a global ``X`` existing when the function is defined.)
    save : bool, default False
        When True, write the figure to ``importances.png``.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    if num is None:
        num = len(feature_imp)
    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value",
                                                                     ascending=False)[0:num])
    plt.title('Features')
    plt.tight_layout()
    # Save before showing: once show() returns the figure may already be
    # closed, and savefig would then write an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()

In [None]:
# Top 10 feature importances of the tuned CatBoost model; X_test only supplies the column names. Nothing is saved.
plot_importance(cb_final, X_test, 10, False)