In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import sklearn_json as skljson


In [2]:
# Load the two raw tables: applicant attributes and monthly credit status history.
appRecord = pd.read_csv("application_record.csv")
creditRecord = pd.read_csv("credit_record.csv")

In [3]:
# Add a STATUSINT column at position 1, initialised as a copy of STATUS.
creditRecord.insert(loc=1, column="STATUSINT", value=creditRecord["STATUS"])

In [4]:
"""
Original Data
0: 1-29 days past due 
1: 30-59 days past due 
2: 60-89 days overdue 
3: 90-119 days overdue 
4: 120-149 days overdue 
5: Overdue or bad debts, write-offs for more than 150 days 
C: paid off that month 
X: No loan for the month
-------------------------
Numerical Encoding
15: 1-29 days past due 
45: 30-59 days past due 
75: 60-89 days overdue 
105: 90-119 days overdue 
135: 120-149 days overdue 
150: Overdue or bad debts, write-offs for more than 150 days 
0: paid off that month 
0: No loan for the month
"""
# Raw STATUS code -> numeric encoding (midpoint of the overdue bucket, in days).
_STATUS_TO_DAYS = {
    "X": 0,    # no loan for the month
    "C": 0,    # paid off that month
    "0": 15,   # 1-29 days past due
    "1": 45,   # 30-59 days past due
    "2": 75,   # 60-89 days overdue
    "3": 105,  # 90-119 days overdue
    "4": 135,  # 120-149 days overdue
    "5": 150,  # more than 150 days overdue / written off
}


def statusToInt(status):
    """Convert a raw STATUS code into its numeric days-overdue encoding.

    Parameters
    ----------
    status : str
        One of "X", "C", "0", "1", "2", "3", "4", "5".

    Returns
    -------
    int or None
        The encoded value for a known code. For any other input, "n/a" is
        printed and None is returned.
    """
    if status in _STATUS_TO_DAYS:
        return _STATUS_TO_DAYS[status]
    print("n/a")
    return None

In [5]:
# Overwrite the placeholder STATUSINT values with the numeric encoding returned by statusToInt.
creditRecord["STATUSINT"]=creditRecord["STATUS"].map(statusToInt)

In [6]:
# Add a TOTALMONTH column at position 1, initialised as a copy of MONTHS_BALANCE.
creditRecord.insert(loc=1, column="TOTALMONTH", value=creditRecord["MONTHS_BALANCE"])

In [7]:
# TOTALMONTH is MONTHS_BALANCE with its sign flipped.
creditRecord["TOTALMONTH"] = -creditRecord["MONTHS_BALANCE"]

In [8]:
# Per-ID maximum of every column; only TOTALMONTH is kept from this frame in a later cell.
maximum = creditRecord.groupby("ID").max()

In [9]:
# Per-ID sum of the numeric columns. numeric_only=True keeps the string STATUS
# column out of the aggregation; without it, recent pandas versions concatenate
# the status strings per group and carry that column forward.
sums = creditRecord.groupby("ID").sum(numeric_only=True)

In [10]:
# Keep only the TOTALMONTH column. Selecting it (instead of deleting the other
# columns one by one) gives the same frame and lets the cell be re-run without
# raising a KeyError on already-deleted columns.
maximum = maximum[["TOTALMONTH"]]
maximum.head()

Unnamed: 0_level_0,TOTALMONTH
ID,Unnamed: 1_level_1
5001711,3
5001712,18
5001713,21
5001714,14
5001715,59


In [11]:
# Keep only the summed STATUSINT column. Selecting it (instead of deleting the
# other columns) is safe to re-run and also discards any non-numeric column
# that the aggregation may have carried along.
sums = sums[["STATUSINT"]]
sums.head()

Unnamed: 0_level_0,STATUSINT
ID,Unnamed: 1_level_1
5001711,45
5001712,150
5001713,0
5001714,0
5001715,0


In [12]:
"""
TOTALMONTH : The duration of the record in the system.
STATUSINT : The total days the person is behind the payment. 
"""
# Join the two per-ID aggregates, then attach them to the application records.
# Both joins are inner joins on ID.
customData = maximum.merge(sums, on="ID", how="inner")
data = appRecord.merge(customData, on="ID", how="inner")
data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,TOTALMONTH,STATUSINT
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,15,60
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,14,60
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,29,105
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,4,30
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,26,0


In [13]:
# Everything between the first column (ID) and the last column (STATUSINT) is a
# feature for a crude model.
# `:` selects all rows directly (the previous `:data.size` bound was rows*columns
# and only worked because slices clip). `.copy()` makes X an independent frame so
# that later column assignments do not write into a slice of `data`.
X = data.iloc[:, 1:19].copy()
X.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,TOTALMONTH
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,15
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,14
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,29
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,4
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,26


In [14]:
# Target: column 19 (STATUSINT), kept as a one-column DataFrame.
y = data.iloc[:, 19:20]
y.head()

Unnamed: 0,STATUSINT
0,60
1,60
2,105
3,30
4,0


In [15]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Integer-encode each string-valued feature column, refitting the same encoder per column.
label_encoder = LabelEncoder()
for column_name in (
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
):
    X[column_name] = label_encoder.fit_transform(X[column_name])
"""
Some people don't have occupation information. This will throw a TypeError for the label_encoder. Let's fix this. 
"""
def jobEncodingPrep(status):
    """Return "No Job Info" for a missing value, otherwise return the input unchanged."""
    return "No Job Info" if pd.isna(status) else status
# Replace missing occupations with the explicit "No Job Info" label before encoding.
X["OCCUPATION_TYPE"]=X["OCCUPATION_TYPE"].map(jobEncodingPrep)

# Integer label encoding is a crude choice here: occupation is a nominal category
# and should be one-hot encoded instead.
X['OCCUPATION_TYPE'] = label_encoder.fit_transform(X['OCCUPATION_TYPE']);

In [16]:
# Build a model: a random forest fitted on the full feature matrix.
clf = RandomForestClassifier(random_state=0)
clf.fit(X, np.ravel(y))
# NOTE: this score is computed on the same rows the model was fitted on,
# so it measures training accuracy, not generalisation.
print(clf.score(X,np.ravel(y)))

0.9291219793181008


In [19]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [20]:
# 10-fold cross-validation splitter; all other KFold options are left at their defaults.
kf = KFold(n_splits = 10)
# Displays the number of splits as a sanity check.
kf.get_n_splits(X,y)

10

In [None]:
# Per-fold metrics collected by the cross-validation loop below.
mse = []
train_scores = []
test_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fixed seed so the per-fold results are reproducible across runs.
    clfs = RandomForestClassifier(random_state=0)
    clfs.fit(X_train, np.ravel(y_train))
    y_pred = clfs.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred); the targets are flattened to
    # 1-D because y_train / y_test are one-column DataFrames.
    mse.append(mean_squared_error(np.ravel(y_test), y_pred))
    train_scores.append(clfs.score(X_train, np.ravel(y_train)))
    test_scores.append(clfs.score(X_test, np.ravel(y_test)))
    

In [1]:
# Summarise the cross-validation metrics collected in mse / train_scores / test_scores.
print("MSEs: ", mse)
print("Average MSE: ", np.mean(mse))
print("Train scores: ", train_scores)
# Was `sconp.mean(...)`, an undefined name that raised NameError.
print("Average train score: ", np.mean(train_scores))
print(test_scores)
print(np.mean(test_scores))

NameError: name 'mse' is not defined

In [18]:
# Training-set score of the classifier left over from the last cross-validation fold.
# Requires the cross-validation loop cell to have run first (it defines clfs, X_train, y_train).
clfs.score(X_train,y_train)

NameError: name 'clfs' is not defined

In [None]:
# The following code for converting a decision tree to JSON is by Gary Sieling, from https://www.garysieling.com/blog/convert-scikit-learn-decision-trees-json/
# It doesn't work very well :(

def treeToJson(decision_tree, feature_names=None):
  """Render a fitted decision tree as a JSON-formatted string.

  Args:
    decision_tree: either a fitted estimator exposing ``tree_`` and
      ``criterion`` (for example one element of a forest's ``estimators_``),
      or the low-level tree structure itself (the object normally found in
      ``estimator.tree_``).
    feature_names: optional sequence indexed by feature number. When omitted,
      the numeric feature index is used in the rule text.

  Returns:
    A string containing one nested JSON object. Internal nodes carry
    ``id``, ``rule``, the impurity under the criterion's name, ``samples``,
    ``left`` and ``right``. Leaves carry ``id``, ``criterion``, ``impurity``,
    ``samples`` and ``value``. Scalar fields are emitted as JSON strings.
  """
  import json

  # scikit-learn stores -1 in children_left / children_right to mean "no child".
  # Using the literal avoids reaching into private sklearn module paths, which
  # have moved between releases.
  TREE_LEAF = -1

  def node_to_str(tree, node_id, criterion):
    # Anything that is not a plain string (e.g. a criterion object) gets a generic label.
    if not isinstance(criterion, str):
      criterion = "impurity"

    value = tree.value[node_id]
    if tree.n_outputs == 1:
      value = value[0, :]

    # tolist() + json.dumps yields a valid JSON array for single-output trees
    # (flat list) and multi-output trees (nested list) alike.
    jsonValue = json.dumps(value.tolist())

    if tree.children_left[node_id] == TREE_LEAF:
      return '"id": "%s", "criterion": "%s", "impurity": "%s", "samples": "%s", "value": %s' \
             % (node_id,
                criterion,
                tree.impurity[node_id],
                tree.n_node_samples[node_id],
                jsonValue)

    if feature_names is not None:
      feature = feature_names[tree.feature[node_id]]
    else:
      feature = tree.feature[node_id]
    # Without feature names the feature is an integer index; normalise to text
    # so the "=" membership test below cannot raise TypeError.
    feature = str(feature)

    if "=" in feature:
      # Feature names of the form "name=value" are rendered as an equality rule.
      ruleType = "="
      ruleValue = "false"
    else:
      ruleType = "<="
      ruleValue = "%.4f" % tree.threshold[node_id]

    return '"id": "%s", "rule": "%s %s %s", "%s": "%s", "samples": "%s"' \
           % (node_id,
              feature,
              ruleType,
              ruleValue,
              criterion,
              tree.impurity[node_id],
              tree.n_node_samples[node_id])

  def recurse(tree, node_id, criterion, depth=0):
    tabs = "  " * depth

    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]

    js = "\n" + \
         tabs + "{\n" + \
         tabs + "  " + node_to_str(tree, node_id, criterion)

    if left_child != TREE_LEAF:
      js = js + ",\n" + \
           tabs + '  "left": ' + \
           recurse(tree, left_child, criterion=criterion, depth=depth + 1) + ",\n" + \
           tabs + '  "right": ' + \
           recurse(tree, right_child, criterion=criterion, depth=depth + 1)

    js = js + tabs + "\n" + \
         tabs + "}"

    return js

  # A fitted estimator wraps the low-level structure in `tree_`; anything
  # without that attribute is treated as the low-level structure itself.
  if hasattr(decision_tree, "tree_"):
    return recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)
  return recurse(decision_tree, 0, criterion="impurity")

In [17]:
def forest_to_json(random_forest,output_path):
    """Write every tree of a fitted forest to a text file using treeToJson.

    Parameters
    ----------
    random_forest : fitted ensemble exposing ``estimators_``
    output_path : path of the file to create or overwrite

    Notes
    -----
    The per-tree strings are written back to back with no separator, so the
    file holds a sequence of JSON objects rather than a single JSON document.
    """
    # `with` closes the file even if serialising one of the trees raises.
    with open(output_path, "w") as f:
        for tree in random_forest.estimators_:
            f.write(treeToJson(tree))

def forestToJson(random_forest,output_path):
    """Serialise `random_forest` to `output_path` by delegating to skljson.to_json."""
    skljson.to_json(random_forest, output_path)


In [19]:
# Export the forest fitted on the full data set (clf) to the file "json_trees".
forestToJson(clf,"json_trees")