In [1]:
import gc
import os

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import resample

In [8]:
# Explicit dtype for every CSV column, passed to pd.read_csv so that
# identifier-like columns stay as Python objects (strings) and counters/flags
# use the narrowest unsigned integer type.
#
# The builtin `object` is used instead of the `np.object` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
dataTypes = {
    # NOTE(review): uint16 tops out at 65535 -- confirm UniqueID values fit.
    "UniqueID": np.uint16,
    "disbursed_amount": np.float64,
    "asset_cost": np.float64,
    "ltv": np.float64,
    "branch_id": object,
    "supplier_id": object,
    "manufacturer_id": object,
    "Current_pincode_ID": object,
    "Date.of.Birth": object,
    "Employment.Type": object,
    "DisbursalDate": object,
    "State_ID": object,
    "Employee_code_ID": object,
    "MobileNo_Avl_Flag": np.uint8,
    "Aadhar_flag": np.uint8,
    "PAN_flag": np.uint8,
    "VoterID_flag": np.uint8,
    "Driving_flag": np.uint8,
    "Passport_flag": np.uint8,
    "PERFORM_CNS.SCORE": np.uint16,
    "PERFORM_CNS.SCORE.DESCRIPTION": object,
    "PRI.NO.OF.ACCTS": np.uint8,
    "PRI.ACTIVE.ACCTS": np.uint8,
    "PRI.OVERDUE.ACCTS": np.uint8,
    "PRI.CURRENT.BALANCE": np.float64,
    "PRI.SANCTIONED.AMOUNT": np.float64,
    "PRI.DISBURSED.AMOUNT": np.float64,
    "SEC.NO.OF.ACCTS": np.uint8,
    "SEC.ACTIVE.ACCTS": np.uint8,
    "SEC.OVERDUE.ACCTS": np.uint8,
    "SEC.CURRENT.BALANCE": np.float64,
    "SEC.SANCTIONED.AMOUNT": np.float64,
    "SEC.DISBURSED.AMOUNT": np.float64,
    "PRIMARY.INSTAL.AMT": np.float64,
    "SEC.INSTAL.AMT": np.float64,
    "NEW.ACCTS.IN.LAST.SIX.MONTHS": np.uint8,
    "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS": np.uint8,
    "AVERAGE.ACCT.AGE": object,
    "CREDIT.HISTORY.LENGTH": object,
    "NO.OF_INQUIRIES": np.uint8,
    "loan_default": np.uint8
}

# Load the training set from the local "data/" folder, applying the
# per-column dtypes declared in `dataTypes`.
data = pd.read_csv(
    "data/train.csv",
    dtype=dataTypes,
)

In [9]:
def parseDuration(x):
    """Convert a two-token duration string into a total number of months.

    The string is split on whitespace; the first token is read as years and
    the second as months. The last three characters of each token are
    discarded (presumably unit suffixes such as ``"yrs"`` / ``"mon"``, e.g.
    ``"1yrs 11mon"``) and the remainder is parsed as an integer.

    Parameters
    ----------
    x : str
        Duration text with at least two whitespace-separated tokens.

    Returns
    -------
    int
        ``years * 12 + months``.

    Raises
    ------
    IndexError
        If fewer than two tokens are present.
    ValueError
        If a token, minus its last three characters, is not an integer.
    """
    tokens = x.split()
    years_token = tokens[0]
    months_token = tokens[1]
    # Trim the three trailing unit characters from each token.
    years_text = years_token[:len(years_token) - 3]
    months_text = months_token[:len(months_token) - 3]
    return 12 * int(years_text) + int(months_text)

def parseDate(v, years_in_2000s=('00', '18')):
    """Expand the trailing two-digit year of a dash-separated date string.

    The text after the last ``-`` is treated as the year. If it is listed in
    ``years_in_2000s`` it is prefixed with ``'20'``; otherwise with ``'19'``.
    Everything before the last ``-`` is returned unchanged.

    Parameters
    ----------
    v : str
        Date text such as ``"01-01-84"``.
    years_in_2000s : collection of str, optional
        Two-digit years that belong to the 2000s. Defaults to
        ``('00', '18')``, the values previously hard-coded here.

    Returns
    -------
    str
        The date with a four-digit year, e.g. ``"01-01-1984"``.
    """
    # One rpartition replaces the repeated v.split("-") calls: `prefix` is
    # everything before the last dash, `year` everything after it.
    prefix, _, year = v.rpartition('-')
    century = '20' if year in years_in_2000s else '19'
    return prefix + '-' + century + year

# Expand the two-digit years, then parse both date columns as day-month-year.
# Birth dates that fail to parse become NaT (errors='coerce'); disbursal dates
# are parsed strictly.
data['Date.of.Birth'] = pd.to_datetime(data['Date.of.Birth'].apply(parseDate), format='%d-%m-%Y', errors='coerce')
data['DisbursalDate'] = pd.to_datetime(data['DisbursalDate'].apply(parseDate), format='%d-%m-%Y')

# Age at disbursal in whole years. Casting a timedelta column to '<m8[Y]' is
# rejected by pandas 2.x (only s/ms/us/ns resolutions are supported), so the
# difference is divided by a 365.2425-day mean year and floored instead.
# Rows with a NaT birth date yield NaN.
data['AgeAtDisbursal'] = np.floor(
    (data['DisbursalDate'] - data['Date.of.Birth']) / pd.Timedelta(days=365.2425)
)

# Assign the filled column back instead of fillna(inplace=True) on a column
# selection, which is chained assignment and does not update the frame under
# pandas copy-on-write.
data['Employment.Type'] = data['Employment.Type'].fillna('Other')

# Duration strings -> number of months.
data['AVERAGE.ACCT.AGE'] = data['AVERAGE.ACCT.AGE'].apply(parseDuration)
data['CREDIT.HISTORY.LENGTH'] = data['CREDIT.HISTORY.LENGTH'].apply(parseDuration)

# Integer category codes for the two text columns.
data['PERFORM_CNS.SCORE.DESCRIPTION'] = pd.Categorical(data['PERFORM_CNS.SCORE.DESCRIPTION'])
data['CNS_DESC_CODES'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].cat.codes
data['Employment.Type'] = pd.Categorical(data['Employment.Type'])
data['EMP_TYPE'] = data['Employment.Type'].cat.codes

# Columns removed before modelling: the row identifier, the two date columns
# (already used to derive AgeAtDisbursal) and the two text columns that now
# have integer-coded counterparts (CNS_DESC_CODES, EMP_TYPE).
columnsToDrop = {
    'Employment.Type',
    'PERFORM_CNS.SCORE.DESCRIPTION',
    "DisbursalDate",
    "Date.of.Birth",
    "UniqueID",
}

data = data.drop(labels=columnsToDrop, axis="columns")

# One-hot encode the categorical columns. Each key is a column to encode and
# each value is the prefix given to its dummy columns; the first level of every
# column is dropped. With drop_first, `loan_default` becomes the single
# indicator column `LD_1`.
# NOTE(review): several of these look like high-cardinality identifiers
# (supplier, pincode, employee code) and PERFORM_CNS.SCORE looks numeric, so
# the encoded frame may become very wide -- a likely contributor to the
# MemoryError seen when fitting the tree. Confirm and consider trimming.
dummyPrefixes = {
    'branch_id': 'BI',
    'supplier_id': 'SI',
    'manufacturer_id': 'MI',
    'Current_pincode_ID': 'CPID',
    'EMP_TYPE': 'ET',
    'State_ID': 'SID',
    'Employee_code_ID': 'ECID',
    'Aadhar_flag': 'AF',
    'PAN_flag': 'PF',
    'VoterID_flag': 'VF',
    'Driving_flag': 'DF',
    'Passport_flag': 'PAF',
    'PERFORM_CNS.SCORE': 'PCS',
    'CNS_DESC_CODES': 'PCSD',
    'loan_default': 'LD',
}
data = pd.get_dummies(
    data,
    columns=list(dummyPrefixes),
    prefix=dummyPrefixes,
    drop_first=True,
)
print("Done")

Done


In [10]:
# Separate the target indicator (LD_1) from the features. The axis is given by
# keyword (`columns=`) because pandas 2.0 no longer accepts it positionally.
X = data.drop(columns='LD_1')
y = data['LD_1']

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

# Release the full frame and the unsplit X/y now that the splits exist, then
# ask the garbage collector to reclaim the memory.
del X
del y
del data
gc.collect()

66

In [7]:
# Fit a decision tree and score the held-out split. A fixed random_state (the
# same seed as the train/test split) makes the fitted tree reproducible.
# NOTE(review): the recorded run of this cell ended in MemoryError; this edit
# does not address that -- the feature matrix probably needs to be narrower.
clf = tree.DecisionTreeClassifier(random_state=41)
clf.fit(X_train, y_train)
predict_prob = clf.predict_proba(X_test)

MemoryError: 

In [None]:
# ROC analysis on the test split: take the second column of the predicted
# probabilities as the score, get FPR/TPR at every threshold, then the area
# under that curve.
positive_scores = predict_prob[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, positive_scores)
roc_auc = metrics.auc(fpr, tpr)
print("ROC AUC Score: ", roc_auc)

In [None]:
# ROC curve drawn with the object-oriented Matplotlib API: the model's curve
# in blue (AUC shown in the legend) and the diagonal reference as a red dashed
# line.
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Make the Graphviz executables discoverable. The directory is appended to
# PATH only when it exists on this machine and is not already listed, so the
# cell is safe to re-run and harmless where Graphviz lives elsewhere.
GRAPHVIZ_BIN = 'C:/Applications/graphviz/bin/'
path_entries = os.environ.get("PATH", "").split(os.pathsep)
if os.path.isdir(GRAPHVIZ_BIN) and GRAPHVIZ_BIN not in path_entries:
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + GRAPHVIZ_BIN

# Export the fitted tree to DOT text (out_file=None returns a string), labelled
# with the training columns and the two class names.
dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=list(X_train.columns.values),
                                    class_names=['0','1'],
                                    filled=True, rounded=True,
                                    special_characters=True)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Tabulate the fitted tree's feature importances, one row per training column,
# ordered from most to least important.
importance_table = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X_train.columns,
)
feature_importances = importance_table.sort_values(by='importance', ascending=False)

In [None]:
# Show the importance table as this cell's output.
feature_importances