In [1]:
import numpy as np
import random
# One shared seed for both the stdlib `random` module and NumPy's legacy
# global generator, so both are initialised from the same value.
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Plotting support
from matplotlib import pyplot as plt
from plotnine import *
# Standard libraries
import pandas as pd
import sklearn as sk
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from sklearn.metrics import mean_squared_error

## Regress

In [3]:
# Load the dataset.
data = pd.read_csv('elections_clean.csv')

# Feature columns and the raw target column.
# pd.get_dummies (below) one-hot encodes only object/string/category columns;
# any numeric column in this selection passes through unchanged.
categorical_data = data[['State','Education','Religion','EthnicMale','EthnicFemale','PerCapitaInc']]
target_data = data[['PovertyAllAgesPct2014']]

# Binarize the poverty rate around its mean: 1 if strictly above the mean,
# otherwise 0.
li_poverty = data['PovertyAllAgesPct2014']
assert li_poverty.notna().all(), "PovertyAllAgesPct2014 contains missing values"
p_mean = np.mean(li_poverty)
# Vectorized on purpose: every row receives exactly one label. The previous
# pair of independent `if` tests appended nothing for a value equal to the
# mean, which left the label list shorter than the feature frame.
target_list = (li_poverty > p_mean).astype(int).tolist()

temp = {'PovertyAllAgesPct2014' : target_list}
target = pd.DataFrame(data=temp)
assert len(target) == len(categorical_data)

# One-hot encode the non-numeric feature columns.
df = pd.get_dummies(categorical_data)

# Hold out 30% of the rows for validation.
categorical_train, categorical_validation, target_train, target_validation = train_test_split(df, target, test_size=0.3, random_state=42)

# Entropy-based decision tree with no depth limit.
tree = DecisionTreeClassifier(random_state=1, criterion="entropy")
tree = tree.fit(categorical_train,target_train)

# Predictions on both splits.
predict_train = tree.predict(categorical_train)
predict_test = tree.predict(categorical_validation)

# Depth the fitted tree actually reached.
max_depth = tree.get_depth()

# Labels and predictions are both 0/1, so the mean squared error computed
# here is the fraction of misclassified rows.
train_errors = mean_squared_error(target_train, predict_train)
validation_errors = mean_squared_error(target_validation, predict_test)

print('max depth of the tree is: ' ,max_depth)
print('')

print("train error: ", train_errors)

print('')
print("validation error: ",validation_errors)

max depth of the tree is:  35

train error:  0.0

validation error:  0.2065677966101695


## Cross-validate

In [4]:
# Feature columns plus the raw poverty rate that is binarized below.
d = data[['State','Education','Religion','EthnicMale','EthnicFemale','PerCapitaInc', 'PovertyAllAgesPct2014']]
label_vec = d['PovertyAllAgesPct2014']
assert label_vec.notna().all(), "PovertyAllAgesPct2014 contains missing values"

# Binarize around the mean: 1 if strictly above the mean, otherwise 0.
# Uses this cell's own label_vec / sp_mean instead of variables left over from
# the previous cell, and labels every row (a value equal to the mean used to
# be skipped, which made the label list shorter than the frame).
sp_mean = np.mean(label_vec)
t_list = (label_vec > sp_mean).astype(int).tolist()

# Overwrite the raw poverty column with the 0/1 label.
d = d.assign(PovertyAllAgesPct2014 = t_list)

# Shuffle the rows, then cut them into 5 near-equal parts. The split is done
# on row positions because np.array_split applied directly to a DataFrame is
# deprecated in recent pandas; the resulting part sizes are the same.
temp = d.sample(frac=1)
split_d = [temp.iloc[rows] for rows in np.array_split(np.arange(len(temp)), 5)]

rank = []

# NOTE(review): each part is split 70/30 on its own and a tree is fit on that
# part's 70% only, so this is not k-fold cross-validation (no tree is trained
# on the other parts). Only the first four parts are evaluated; the fifth is
# never used -- confirm whether it is meant to be a hold-out set.
for i, part in enumerate(split_d[:4]):
  train = part[['State','Education','Religion','EthnicMale','EthnicFemale','PerCapitaInc']]
  labels = part['PovertyAllAgesPct2014']
  # One-hot encode the non-numeric feature columns of this part.
  t = pd.get_dummies(train)
  # Hold out 30% of this part for validation.
  categorical_train, categorical_validation, target_train, target_validation = train_test_split(t, labels, test_size=0.3, random_state=42)
  tree = DecisionTreeClassifier(random_state=1, criterion="entropy")
  tree.fit(categorical_train,target_train)
  # Depth the fitted tree actually reached.
  max_depth = tree.get_depth()
  # Score once and reuse the value for both the report and the ranking.
  accuracy = tree.score(categorical_validation, target_validation)
  print('max depth of the regression tree', i+1, 'is: ' ,max_depth)
  print(' accuracy of regression tree', i+1, 'is: ', accuracy)
  rank.append(accuracy)
  print('')

# Pick the part whose tree had the highest validation accuracy
# (first one wins on ties, since list.index returns the first match).
max_value = max(rank)
index = rank.index(max_value)

print('The best regress tree is: ', index+1)


max depth of the regression tree 1 is:  20
 accuracy of regression tree 1 is:  0.7407407407407407

max depth of the regression tree 2 is:  26
 accuracy of regression tree 2 is:  0.7619047619047619

max depth of the regression tree 3 is:  19
 accuracy of regression tree 3 is:  0.7566137566137566

max depth of the regression tree 4 is:  18
 accuracy of regression tree 4 is:  0.8253968253968254

The best regress tree is:  4
