In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [3]:
# Explicit per-column dtypes handed to pd.read_csv (via `dtype=`) when the
# feature CSVs are loaded, so pandas does not have to infer them.
#
# The *_id / *_ID columns use the builtin `object` dtype. The former
# `np.object` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24
# (accessing it raises AttributeError); it was only ever an alias for the
# builtin `object`, so the resulting column dtype is unchanged.
dataTypes = {
    "disbursed_amount": np.float64,
    "asset_cost": np.float64,
    "ltv": np.float64,
    "branch_id": object,
    "supplier_id": object,
    "manufacturer_id": object,
    "Current_pincode_ID": object,
    "State_ID": object,
    "Employee_code_ID": object,
    "Aadhar_flag": np.uint8,
    "PAN_flag": np.uint8,
    "VoterID_flag": np.uint8,
    "Driving_flag": np.uint8,
    "PRI.NO.OF.ACCTS": np.int64,
    "PRI.ACTIVE.ACCTS": np.int64,
    "PRI.OVERDUE.ACCTS": np.int64,
    # NOTE(review): this key ends with a trailing space. It only takes effect
    # if the CSV header carries the same trailing space - confirm against the
    # actual column name in data/X_train.csv.
    "PRI.CURRENT.BALANCE ": np.float64,
    "PRI.SANCTIONED.AMOUNT": np.float64,
    "PRI.DISBURSED.AMOUNT": np.float64,
    "PRIMARY.INSTAL.AMT": np.float64,
    "NEW.ACCTS.IN.LAST.SIX.MONTHS": np.int64,
    "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS": np.int64,
    "AVERAGE.ACCT.AGE": np.int64,
    "CREDIT.HISTORY.LENGTH": np.int64,
    "NO.OF_INQUIRIES": np.int64,
    "Sanctioned": np.float64,
    "AgeAtDisbursal": np.float64,
    "TOTAL.ACTIVE.ACCTS": np.int64,
    "TOTAL.CURRENT.BALANCE": np.float64,
    "TOTAL.DISBURSED.AMOUNT": np.int64,
    "TOTAL.NO.OF.ACCTS": np.int64,
    "TOTAL.OVERDUE.ACCTS": np.int64,
    "TOTAL.CLEAN.ACCTS": np.int64,
    "NO.OF.ACC.BF.SIX.MONTH": np.int64,
    "OVERDUE.ACC.BF.SIX.MONTHS": np.int64,
    "TOTAL.DEACTIVE.ACCTS": np.int64,
    "TOTAL.INSTL.AMOUNT": np.float64,
    "TOTAL.CLEARED.ACCTS": np.int64,
    "Not_Scored": np.int32,
    "No_History": np.int32,
    "Very_Low_Risk": np.int32,
    "Low_Risk": np.int32,
    "Medium_Risk": np.int32,
    "Very_High_Risk": np.int32,
    "Employment.Type_Other": np.uint8,
    "Employment.Type_Salaried": np.uint8,
    "Employment.Type_Self employed": np.uint8,
    "PERFORM_CNS.SCORE_0": np.uint8,
    "PERFORM_CNS.SCORE_3": np.uint8,
    "PERFORM_CNS.SCORE_4": np.uint8,
    "PERFORM_CNS.SCORE_5": np.uint8,
    "PERFORM_CNS.SCORE_6": np.uint8,
    "PERFORM_CNS.SCORE_7": np.uint8,
    "PERFORM_CNS.SCORE_8": np.uint8
}

# Load the pre-split data set from CSV.
# The two feature files are parsed with the explicit column dtypes declared in
# `dataTypes`; the two label files are read with pandas' inferred dtypes.
X_train, X_test = (
    pd.read_csv("data/X_train.csv", dtype=dataTypes),
    pd.read_csv("data/X_test.csv", dtype=dataTypes),
)
y_train, y_test = (
    pd.read_csv("data/y_train.csv"),
    pd.read_csv("data/y_test.csv"),
)

In [60]:
from sklearn.naive_bayes import GaussianNB

# Flatten the label frames to 1-D arrays once; the same arrays are reused for
# fitting, the confusion matrix and cross-validation below.
train_labels = y_train.values.ravel()
test_labels = y_test.values.ravel()

# Build the Gaussian Naive Bayes classifier and fit it on the training split
# (fit() returns the estimator itself, so the two steps are chained).
# NOTE(review): 10e-90 is 1e-89, i.e. practically no variance smoothing -
# confirm this value is intentional rather than a typo for a larger one.
gnb = GaussianNB(priors=None, var_smoothing=10e-90).fit(X_train, train_labels)

# Hold-out evaluation: predict the test split and print the confusion matrix
# (true labels first, predictions second).
y_pred = gnb.predict(X_test)
cm = confusion_matrix(test_labels, y_pred)
print(cm)

# 10-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_val_score(gnb, X_train, train_labels, scoring='roc_auc', cv=10)
print("Mean ROC AUC (10-fold): ", np.mean(scores))

[[50438  4293]
 [13280  1936]]
Mean ROC AUC (10-fold):  0.6018811670481743


In [61]:
# Display the array of per-fold ROC AUC values returned by cross_val_score.
scores

array([0.60204421, 0.60490206, 0.59090541, 0.60965686, 0.60463466,
       0.59498614, 0.60196694, 0.60747542, 0.59629732, 0.60594265])