# Random Forest vs. Decision Tree
### University of Denver 

## Isabel Osgood

### Uploading the Data

In [8]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, zero_one_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV



In [9]:
df = pd.read_csv("Affairs.csv")
print(df.columns)
df = df.drop([df.columns[0]], axis =1)

Index(['Unnamed: 0', 'affairs', 'gender', 'age', 'yearsmarried', 'children',
       'religiousness', 'education', 'occupation', 'rating'],
      dtype='object')


### Data Preprocessing

In [10]:
df['gender'] = df['gender'].map({'male': 1, 'female': 0})
df['children'] = df['children'].map({'no': 0, 'yes': 1})
df.head()

Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
0,0,1,37.0,10.0,0,3,18,7,4
1,0,0,27.0,4.0,0,4,14,6,4
2,0,0,32.0,15.0,1,1,12,1,4
3,0,1,57.0,15.0,1,5,18,6,5
4,0,1,22.0,0.75,0,2,17,6,3


In [11]:
df['affairs'] = df['affairs'].apply(lambda x: 1 if x > 0 else 0)
#print(df.head(), df.tail())
df['gender'] = df['gender'].astype('category') #categorical data
df['children'] = df['children'].astype('category') #categorical data
df['education'], uniques = df['education'].factorize(sort=True) #ordinal datatype
df['occupation'] = df['occupation'].astype('category') #categorical data
df['religiousness'], uniques = pd.factorize(df['religiousness'], sort=True)
df['rating'], uniques = pd.factorize(df['rating'], sort=True)
df.head()

Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
0,0,1,37.0,10.0,0,2,5,7,3
1,0,0,27.0,4.0,0,3,2,6,3
2,0,0,32.0,15.0,1,0,1,1,3
3,0,1,57.0,15.0,1,4,5,6,4
4,0,1,22.0,0.75,0,1,4,6,2


### Data Splitting

In [12]:
y = df['affairs']
X = df.drop(['affairs'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)


### Model Building 

In [13]:
#build both decision tree and random forest for comparision 
from sklearn.ensemble import RandomForestClassifier
dt = DecisionTreeClassifier()
dt = dt.fit(X_train, y_train)

rf = RandomForestClassifier()
rf = rf.fit(X_train, y_train)

rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [14]:
#decision tree 
dt_param_grid = {"max_depth": range(1, dt.tree_.max_depth), 
             "max_features": ['sqrt', 'log2', None], 
             "splitter": ['best', 'random']}

dt_grid = GridSearchCV(estimator=dt, param_grid=dt_param_grid, scoring=None, cv=10)
dt_grid = dt_grid.fit(X_train, y_train)

dt_grid.best_params_

{'max_depth': 4, 'max_features': None, 'splitter': 'best'}

In [15]:
#random forest
rf_param_grid = {"bootstrap": [True, False], 
             "max_depth": range(1, dt.tree_.max_depth), 
             "max_features": [0.1, 0.2, 0.4, 0.6, 0.8, None], 
             "warm_start": [True, False]}

rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param_grid, scoring=None, cv=10)
rf_grid = rf_grid.fit(X_train, y_train)

rf_grid.best_params_

{'bootstrap': True, 'max_depth': 6, 'max_features': 0.6, 'warm_start': False}

### Model evaluation

In [16]:
#test model accuracy 

print("Decision Tree Train Accuracy:")
print(" ", dt_grid.score(X_train, y_train))
print("Decision Tree Test Accuracy:")
print(" ", dt_grid.score(X_test, y_test))

print("Random Forest Train Accuracy:")
print(" ", rf_grid.score(X_train, y_train))
print("Random Forest Test Accuracy:")
print(" ",rf_grid.score(X_test, y_test))

Decision Tree Train Accuracy:
  0.7928571428571428
Decision Tree Test Accuracy:
  0.7624309392265194
Random Forest Train Accuracy:
  0.85
Random Forest Test Accuracy:
  0.7458563535911602


### Conclusion

For this dataset random forest created a more accurate model for predicting if a person will have an affair.