In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn_model.export import Model

In [3]:
# Load the loan dataset from a relative path and display the resulting frame
df = pd.read_csv("assets/loan.csv")
df

Unnamed: 0,Gender,Married,Dependents,Education,Self Employed,Applicant Income,Loan Amount,Loan Term,Property Area,Approved
0,Male,Yes,1,Graduate,No,6091.0,128000,360,Rural,N
1,Male,Yes,0,Graduate,Yes,3000.0,66000,360,Urban,Y
2,Male,Yes,0,Not Graduate,No,4941.0,120000,360,Urban,Y
3,Male,No,0,Graduate,No,6000.0,141000,360,Urban,Y
4,Male,Yes,2,Graduate,Yes,9613.0,267000,360,Urban,Y
...,...,...,...,...,...,...,...,...,...,...
571,Female,No,0,Graduate,No,2900.0,71000,360,Rural,Y
572,Male,Yes,3,Graduate,No,4106.0,40000,180,Rural,Y
573,Male,Yes,1,Graduate,No,8312.0,253000,360,Urban,Y
574,Male,Yes,2,Graduate,No,7583.0,187000,360,Urban,Y


In [4]:
# Inspect the column dtypes as loaded, before any conversion
df.dtypes

Gender               object
Married              object
Dependents            int64
Education            object
Self Employed        object
Applicant Income    float64
Loan Amount           int64
Loan Term             int64
Property Area        object
Approved             object
dtype: object

In [5]:
# Cast every text (object) column to pandas' categorical dtype, then
# show the dtypes again to confirm the conversion
for col in ['Gender', 'Married', 'Education', 'Self Employed', 'Property Area', 'Approved']:
    df[col] = df[col].astype('category')
df.dtypes

Gender              category
Married             category
Dependents             int64
Education           category
Self Employed       category
Applicant Income     float64
Loan Amount            int64
Loan Term              int64
Property Area       category
Approved            category
dtype: object

In [6]:
# Print the category labels of each categorical column; the position of a
# label in its list is the integer code it receives when encoded later
cat_columns = df.select_dtypes(['category']).columns
for col in cat_columns:
    print(df[col].cat.categories.tolist())

['Female', 'Male']
['No', 'Yes']
['Graduate', 'Not Graduate']
['No', 'Yes']
['Rural', 'Semiurban', 'Urban']
['N', 'Y']


In [7]:
# dfX and dfY are created to keep the field details (names, dtypes and
# category labels) of the original dataset intact
dfY = df[["Approved"]]
dfX = df.drop(columns="Approved")

In [8]:
# Replace each categorical column with its integer category codes,
# working on a copy so that `df` keeps its original labels
df_xy = df.copy()
for col in cat_columns:
    df_xy[col] = df_xy[col].cat.codes
df_xy.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self Employed,Applicant Income,Loan Amount,Loan Term,Property Area,Approved
0,1,1,1,0,0,6091.0,128000,360,0,0
1,1,1,0,0,1,3000.0,66000,360,2,1
2,1,1,0,1,0,4941.0,120000,360,2,1
3,1,0,0,0,0,6000.0,141000,360,2,1
4,1,1,2,0,1,9613.0,267000,360,2,1


In [9]:
# Feature matrix: every encoded column except the target
X = df_xy.drop(columns="Approved").values
# Target vector: the encoded "Approved" column
y = df_xy["Approved"].values

In [10]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=3
)

In [11]:
# Model: a shallow decision tree (depth capped at 4 to keep the exported rules short).
# random_state is fixed because scikit-learn permutes the candidate features at
# every split even with the default "best" splitter, so ties between equally good
# splits could otherwise be broken differently on each run. The seed matches the
# one used for the train/test split.
dtree = DecisionTreeClassifier(max_depth = 4, random_state = 3)

In [12]:
# Fit the tree on the training split, then predict the held-out rows
dtree.fit(X_train, y_train.ravel())
y_pred = dtree.predict(X_test)

# Accuracy = fraction of predictions that exactly match the true label.
# This replaces `1 - sum((y_test - y_pred)**2) / n`, which is only an accuracy for
# 0/1 labels and accumulates small-integer NumPy scalars with the builtin `sum`
# (the labels are category codes; under NumPy 2 promotion rules that running
# total can wrap around on larger test sets).
(y_test.ravel() == y_pred).mean()

0.7011494252873562

In [13]:
# For every leaf of the fitted tree, compute the mean of the 0/1-encoded target
# over the training rows that land in it (i.e. the share of class 1 in that leaf)
leaf_frame = pd.DataFrame({"leaf": dtree.apply(X_train),
                           "actualY": y_train})
prob_A = leaf_frame.groupby(['leaf']).mean().to_dict()["actualY"]

# Assign a final class to each leaf: 0 when the share is below the cutoff, else 1
cutoff = 0.5
leaf_class = {}
for leaf, prob in prob_A.items():
    leaf_class[leaf] = 0 if prob < cutoff else 1

In [14]:
# Assemble the exportable model wrapper from the pieces built above
mdl = Model()
# Field metadata is taken from the un-encoded frames, so category labels are kept
# (the exported JSON lists them under "input")
mdl.add_fields(dfX, dfY)
# Mapping of leaf id -> final class derived from the cutoff
mdl.add_leaf_class(leaf_class)
# The fitted scikit-learn decision tree
# NOTE(review): whether the call order matters to Model is not visible here — confirm
mdl.add_model(dtree)

In [15]:
# Called without arguments, exportJSON() returns the JSON text; print it for inspection
print(mdl.exportJSON())

{
    "input": {
        "Gender": {
            "type": "category",
            "values": [
                "Female",
                "Male"
            ]
        },
        "Married": {
            "type": "category",
            "values": [
                "No",
                "Yes"
            ]
        },
        "Dependents": {
            "type": "int"
        },
        "Education": {
            "type": "category",
            "values": [
                "Graduate",
                "Not Graduate"
            ]
        },
        "Self Employed": {
            "type": "category",
            "values": [
                "No",
                "Yes"
            ]
        },
        "Applicant Income": {
            "type": "float"
        },
        "Loan Amount": {
            "type": "int"
        },
        "Loan Term": {
            "type": "int"
        },
        "Property Area": {
            "type": "category",
            "values": [
                "Rural",
            

In [15]:
# Export the model to 'tree.json' (read back later via JMLM.fromFile)
# NOTE(review): presumably the path is relative to the working directory — confirm
mdl.exportJSON('tree.json')

### Extracting Decision Tree Rules

In [16]:
from sklearn_model.utils import JMLM

#### 1. From Model

In [17]:
# Build the rule extractor directly from the in-memory Model object
jmlm = JMLM.fromModel(mdl)
# Returns a list of {'class', 'rule'} dicts; conditions use the encoded numeric values
jmlm.extractRules()

[{'class': 0, 'rule': '(Applicant Income <= 2381.5) and (Education <= 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income <= 2381.5) and (Education > 0.5) and (Gender <= 0.5)'},
 {'class': 0,
  'rule': '(Applicant Income <= 2381.5) and (Education > 0.5) and (Gender > 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount <= 61000.0)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount > 61000.0)'},
 {'class': 0,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents <= 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents > 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income > 20199.5) and (Loan Amount <= 585000.0) and (Loan Amount <= 195500.0)'},
 {'class': 0,


#### 2. From JMLM String

In [18]:
# Same extraction, but starting from the exported JSON text instead of the Model object
jmlm_string = mdl.exportJSON()
jmlm = JMLM.fromString(jmlm_string)
jmlm.extractRules()

[{'class': 0, 'rule': '(Applicant Income <= 2381.5) and (Education <= 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income <= 2381.5) and (Education > 0.5) and (Gender <= 0.5)'},
 {'class': 0,
  'rule': '(Applicant Income <= 2381.5) and (Education > 0.5) and (Gender > 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount <= 61000.0)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount > 61000.0)'},
 {'class': 0,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents <= 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents > 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income > 20199.5) and (Loan Amount <= 585000.0) and (Loan Amount <= 195500.0)'},
 {'class': 0,


#### 3. From JMLM File

In [22]:
# Same extraction, but loading the model from the 'tree.json' file exported earlier
jmlm = JMLM.fromFile("tree.json")
jmlm.extractRules()

[{'class': 0, 'rule': '(Applicant Income <= 2381.5) and (Education <= 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income <= 2381.5) and (Education > 0.5) and (Gender <= 0.5)'},
 {'class': 0,
  'rule': '(Applicant Income <= 2381.5) and (Education > 0.5) and (Gender > 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount <= 61000.0)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount > 61000.0)'},
 {'class': 0,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents <= 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents > 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income > 20199.5) and (Loan Amount <= 585000.0) and (Loan Amount <= 195500.0)'},
 {'class': 0,


### Rules with Numerical to Categorical Transformation for Categorical Variables

In [23]:
# With numericalToCategorical=True, thresholds on encoded categorical fields are
# rendered as label comparisons (e.g. "Education <= 0.5" becomes "Education == 'Graduate'")
jmlm.extractRules(numericalToCategorical = True)

[{'class': 0,
  'rule': "(Applicant Income <= 2381.5) and (Education == 'Graduate')"},
 {'class': 1,
  'rule': "(Applicant Income <= 2381.5) and (Education == 'Not Graduate') and (Gender == 'Female')"},
 {'class': 0,
  'rule': "(Applicant Income <= 2381.5) and (Education == 'Not Graduate') and (Gender == 'Male')"},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount <= 61000.0)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term <= 420.0) and (Loan Amount > 61000.0)'},
 {'class': 0,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents <= 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income <= 20199.5) and (Loan Term > 420.0) and (Dependents > 0.5)'},
 {'class': 1,
  'rule': '(Applicant Income > 2381.5) and (Applicant Income > 20199.5) and (Loan Amount <= 585000.0) and (