In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the prison recidivism dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell will fail
# on any other machine — consider a path relative to a configurable data dir.
df = pd.read_csv('/home/asma/Documents/Intelligent Systems/Home Works/prison_dataset.csv')

In [4]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Race - Ethnicity,Age At Release,Convicting Offense Classification,Convicting Offense Type,Convicting Offense Subtype,Main Supervising District,Release Type,Part of Target Population,Recidivism - Return to Prison numeric
0,2010,2013,White,<45,D Felony,Violent,Other,3JD,Parole,Yes,1
1,2010,2013,White,>45,D Felony,Other,Other,3JD,Parole,Yes,1
2,2010,2013,White,<45,D Felony,Other,Other,5JD,Parole,Yes,1
3,2010,2013,White,>45,Other Felony,Drug,Trafficking,3JD,Parole,Yes,1
4,2010,2013,Black,<45,D Felony,Drug,Trafficking,3JD,Parole,Yes,1


In [5]:
# Print the distinct values of every column to see each feature's domain
# (iterating a DataFrame yields its column labels).
for col in df:
    print (col, df[col].unique())

Fiscal Year Released [2010 2013 2015]
Recidivism Reporting Year [2013 2016 2018]
Race - Ethnicity ['White' 'Black']
Age At Release ['<45' '>45']
Convicting Offense Classification ['D Felony' 'Other Felony']
Convicting Offense Type ['Violent' 'Other' 'Drug']
Convicting Offense Subtype ['Other' 'Trafficking']
Main Supervising District ['3JD' '5JD']
Release Type ['Parole' 'Discharged End of Sentence']
Part of Target Population ['Yes' 'No']
Recidivism - Return to Prison numeric [1 0]


## Splitting Data into Test and Train Sets

In [6]:
def train_test(data, ratio, random_state=None):
    """Shuffle ``data`` and split it into a test part and a train part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. The input frame itself is not modified.
    ratio : float
        Fraction of rows placed in the test set; the first
        ``int(n * ratio)`` shuffled rows become the test set.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle (and therefore
        the split) can be reproduced. ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    (test, train) : tuple of pandas.DataFrame
        Note the order: test first, then train. Both are independent copies,
        so callers may add or overwrite columns on them safely.
    """
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n = shuffled.shape[0]
    cut = int(n * ratio)
    # .copy() detaches the results from `shuffled`; returning bare iloc slices
    # makes later column assignments (e.g. test['predict'] = 3) write to a
    # slice of another frame, which pandas warns about and may not persist.
    test = shuffled.iloc[:cut, :].copy()
    train = shuffled.iloc[cut:n, :].copy()
    return test, train

## ID3 Algorithm

In [7]:
def entropy(s):
    """Return the Shannon entropy (base 2) of the 'Target' column of ``s``.

    ``s`` is a DataFrame that must contain a 'Target' column; class
    probabilities are the per-class row counts divided by the row count.
    """
    class_counts = s.groupby('Target').size()
    probabilities = class_counts / s.shape[0]
    weighted_logs = probabilities * np.log2(probabilities)
    return -weighted_logs.sum()

In [8]:
def InformationGain(s, attribute):
    """Return the information gain obtained by splitting ``s`` on ``attribute``.

    ``attribute`` is whatever ``DataFrame.groupby`` accepts as a key;
    ``find_root`` passes the column itself (a Series). The gain is the
    entropy of ``s`` minus the size-weighted sum of each group's entropy.
    """
    partitions = s.groupby(attribute)
    weights = partitions.size() / s.shape[0]
    group_entropy = partitions.apply(lambda part: entropy(part))
    remainder = (weights * group_entropy).sum()
    return entropy(s) - remainder

In [9]:
def find_root(s):
    """Return the label of the non-'Target' column with the highest information gain."""
    features = s.drop(columns=['Target'])
    # DataFrame.apply hands each feature column to the lambda as a Series.
    gains = features.apply(lambda column: InformationGain(s, column))
    return gains.idxmax()

In [10]:
def ID3(s, depth, tree=None):
    """Build a depth-limited ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    s : pandas.DataFrame
        Training rows; must contain a 'Target' column plus at least one
        attribute column.
    depth : int
        Maximum number of attribute tests on any root-to-leaf path. A branch
        becomes a leaf when ``depth == 1``.
    tree : dict or None, optional
        Dictionary to populate. When ``None`` (the usual case) a new one is
        created.

    Returns
    -------
    dict
        ``{attribute: {value: subtree_or_label, ...}}`` where a leaf label is
        the most frequent 'Target' value among the rows reaching that branch.
    """
    root = find_root(s)

    if tree is None:
        tree = {}
    # setdefault (instead of assigning only when tree is None) so that a
    # caller-supplied dict that lacks this attribute does not raise KeyError.
    tree.setdefault(root, {})

    for child in s[root].unique():
        split = s[s[root] == child]
        # An attribute already tested on this path carries no further
        # information, so it is removed before recursing. This also makes the
        # "no attributes left" stop condition below reachable.
        remaining = split.drop(columns=[root])

        is_leaf = (
            depth == 1
            or remaining.shape[1] == 1                 # only 'Target' is left
            or len(split['Target'].unique()) == 1      # branch is pure
        )

        if is_leaf:
            # Majority class among the rows that follow this branch.
            tree[root][child] = split.groupby('Target').size().idxmax()
        else:
            tree[root][child] = ID3(remaining, depth - 1)

    return tree

In [11]:
# The ID3 helpers above look up the label column by the fixed name 'Target'.
df.rename(columns = {"Recidivism - Return to Prison numeric": "Target"}, inplace= True)
# train_test returns (test, train) in that order; 20% of rows go to the test set.
# NOTE(review): the shuffle is unseeded, so the split and every number below
# change from run to run.
test_data, train_data = train_test(df, 0.2)
# Fit a tree with at most 3 attribute tests per path and display it.
decision_tree = ID3(train_data, 3)
decision_tree

{'Fiscal Year Released': {2015: {'Release Type': {'Parole': {'Age At Release': {'>45': 0,
      '<45': 0}},
    'Discharged End of Sentence': {'Main Supervising District': {'5JD': 0,
      '3JD': 0}}}},
  2013: {'Age At Release': {'<45': {'Main Supervising District': {'3JD': 1,
      '5JD': 1}},
    '>45': {'Convicting Offense Type': {'Other': 1,
      'Drug': 1,
      'Violent': 1}}}},
  2010: 1}}

In [12]:
def ID3_predict(data, tree, attribute, predict=None):
    """Walk ``tree`` and collect the label assigned to each group of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to classify; must contain every attribute column used by ``tree``.
    tree : dict
        Nested dictionary of the form ``{attribute: {value: subtree_or_label}}``.
    attribute : hashable
        The key of ``tree`` to start from (the attribute tested at this node).
    predict : list or None, optional
        Accumulator to append to. When omitted, a new list is created for
        this call.

    Returns
    -------
    list of (pandas.Index, label)
        One entry per leaf: the index of the rows of ``data`` that reach the
        leaf, and the leaf's label. Rows whose attribute value has no branch
        in the tree appear in no entry.
    """
    # A `list()` default is created once at definition time and shared by every
    # call that omits the argument, so results from earlier calls would leak
    # into later ones. Create the accumulator per call instead.
    if predict is None:
        predict = []

    for value, subtree in tree[attribute].items():
        subset = data[data[attribute] == value]
        if isinstance(subtree, dict):
            # An inner node has exactly one key: the attribute it tests.
            ID3_predict(subset, subtree, next(iter(subtree)), predict)
        else:
            predict.append((subset.index, subtree))

    return predict

In [13]:
# Explicit, fresh accumulator for the (row index, label) pairs.
pred =[]
# 3 is a sentinel meaning "no prediction"; rows never reached by a leaf keep it.
test_data['predict'] = 3
# Name of the attribute tested at the top of the tree, needed to start the walk.
root = find_root(train_data)
pred = ID3_predict(test_data, decision_tree, root , pred)
# Write each leaf's label onto the rows that reached that leaf.
for idx, label in pred:
    test_data.loc[idx, 'predict'] = label

In [14]:
def confusion_matrix(data, predict, target):
    """Build a 2x2 confusion matrix from two columns of ``data``.

    ``predict`` and ``target`` are column names. Rows of the result are the
    predicted class and columns the actual class, laid out as
    ``[[TP, FP], [FN, TN]]``. Any prediction that differs from the target
    (including a value other than 0 or 1) counts as an error for the target's
    class.
    """
    correct = data[predict] == data[target]
    wrong = data[predict] != data[target]
    actual_positive = data[target] == 1
    actual_negative = data[target] == 0

    def count(mask):
        # Number of rows selected by the boolean mask.
        return data[mask].shape[0]

    cells = np.array([
        [count(correct & actual_positive), count(wrong & actual_negative)],
        [count(wrong & actual_positive), count(correct & actual_negative)],
    ])
    labels = ['Positive', 'Negetive']
    return pd.DataFrame(cells, index=labels, columns=labels)

### Confusion Matrix and Accuracy for ID3 

In [15]:
# Confusion matrix of the ID3 predictions against the true labels on the test set.
m = confusion_matrix(test_data, 'predict', 'Target')
m

Unnamed: 0,Positive,Negetive
Positive,1078,219
Negetive,630,1157


In [17]:
def accuracy (matrix):
    """Return the fraction of correct predictions in a confusion matrix.

    ``matrix`` is the DataFrame produced by ``confusion_matrix`` (labels
    'Positive' / 'Negetive'); the result is the diagonal sum divided by the
    sum of all cells.
    """
    true_positive = matrix['Positive']['Positive']
    true_negative = matrix['Negetive']['Negetive']
    total = matrix.sum().sum()
    return (true_positive + true_negative) / total

In [18]:
# Accuracy of the single ID3 tree on the test set.
accuracy(m)

0.7247081712062257

## Random Forest

In [1]:
def Random_Forest(data, test_data, N):
    """Classify ``test_data`` by majority vote over ``N`` depth-3 ID3 trees.

    Each tree is trained on a random 35% of the feature columns and a random
    20% of the rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows, including the 'Target' column.
    test_data : pandas.DataFrame
        Rows to classify. Modified in place: the columns 'Votes',
        'resample_predict' and 'Forest_predict' are added or overwritten.
    N : int
        Number of trees.

    Returns
    -------
    pandas.DataFrame
        The same ``test_data`` object; 'Forest_predict' is 1 where more than
        half of the trees voted 1, otherwise 0.
    """
    test_data['Votes'] = 0

    for _ in range(N):
        # Random subset of the feature columns, then re-attach the label.
        resample = data.drop('Target', axis=1).sample(frac=0.35, axis=1)
        resample['Target'] = data['Target']
        # Random subset of the rows.
        resample = resample.sample(frac=0.2)

        decision_tree = ID3(resample, 3)
        # The tree's only top-level key is the attribute tested at its root.
        root = next(iter(decision_tree))
        # Pass a fresh list so predictions from earlier trees can never leak
        # into this tree's result.
        predict = ID3_predict(test_data, decision_tree, root, [])

        # 3 is a sentinel for rows this tree could not classify (their
        # attribute value was absent from the tree's training sample).
        test_data['resample_predict'] = 3
        for idx, label in predict:
            test_data.loc[idx, 'resample_predict'] = label

        # Count only genuine votes for class 1; adding the raw column would
        # let the sentinel 3 count as three positive votes.
        test_data['Votes'] += (test_data['resample_predict'] == 1).astype(int)

    # Single whole-column assignment: avoids chained `.loc` on a column (which
    # may write to a temporary) and does not require the column to pre-exist.
    test_data['Forest_predict'] = (test_data['Votes'] > (N / 2)).astype(int)

    return test_data

In [None]:
# Random_Forest assigns into test_data['Forest_predict'] with .loc, so the
# column has to exist before the call; 3 is only a placeholder value.
test_data['Forest_predict'] = 3
# Vote over 11 trees (an odd count, so a 0/1 vote cannot tie).
test = Random_Forest(train_data, test_data, 11)

### Confusion Matrix and Accuracy for RandomForest

In [21]:
# Confusion matrix of the forest's majority-vote predictions on the test set.
m = confusion_matrix(test, 'Forest_predict', 'Target')
m

Unnamed: 0,Positive,Negetive
Positive,1058,211
Negetive,643,1172


In [22]:
# Accuracy of the hand-written random forest on the test set.
accuracy(m)

0.7230869001297017

## Implementing Random Forest with sklearn

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

# First 10 columns are the features; the last column is the label.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:10], df.iloc[:,-1], test_size=0.2)
# Work on independent copies so the column assignments below do not write
# into slices of `df`.
X_train, X_test = X_train.copy(), X_test.copy()

columns = X_train.columns.tolist()
for col in columns:
    # `np.object` was removed from NumPy; test for "not numeric" instead so
    # string-valued columns are encoded regardless of their exact dtype.
    if not pd.api.types.is_numeric_dtype(X_train[col]):
        # One encoder per column, fitted on every value in the dataset, so a
        # category maps to the same integer in train and test. Fitting a
        # separate encoder on each split can assign different codes to the
        # same category.
        encoder = LabelEncoder().fit(df[col])
        X_train[col] = encoder.transform(X_train[col])
        X_test[col] = encoder.transform(X_test[col])

model = RandomForestClassifier(max_depth=3, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE: this is sklearn's confusion_matrix (imported in this cell), which
# takes (y_true, y_pred) and shadows the helper of the same name defined
# earlier in the notebook.
confusion_matrix(y_test, y_pred)

array([[1127,  238],
       [ 572, 1148]])

In [24]:
from sklearn.metrics import accuracy_score
# Accuracy of the scikit-learn random forest on the held-out split.
accuracy_score(y_test, y_pred)

0.7374392220421394