In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
from sklearn import tree
import matplotlib.pyplot as plt
from helper import plot_boundary
from prettytable import PrettyTable
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Notebook-wide display settings: cap printed frame width at 100 characters,
# show up to 20 columns, and make every matplotlib figure 12x8 inches.
pd.options.display.width = 100
pd.options.display.max_columns = 20
plt.rcParams.update({"figure.figsize": (12, 8)})

In [None]:
# Load "election_train.csv" and "election_test.csv" (in that order) as
# Pandas dataframes
elect_train, elect_test = (
    pd.read_csv(csv_name)
    for csv_name in ("election_train.csv", "election_test.csv")
)

# Preview the first rows of the train data
elect_train.head()

	state	fipscode	county	population	hispanic	minority	female	unemployed	income	nodegree	bachelor	inactivity	obesity	density	cancer	trump	clinton	votergap	won
0	Alabama	1001	Autauga County	50756	2.842	22.733	51.475	5.2	54366	13.8	21.9	28.6	34.1	91.8	186.5	73.436	23.957	49.479	1
1	Alabama	1003	Baldwin County	179878	4.550	12.934	51.261	5.5	49626	11.0	28.6	22.3	27.4	114.6	229.4	77.351	19.565	57.786	1
2	Alabama	1007	Bibb County	21587	2.409	23.930	46.110	6.6	39546	22.1	10.2	33.9	40.3	36.8	230.3	76.966	21.422	55.544	1
3	Alabama	1009	Blount County	58345	8.954	4.229	50.592	5.4	45567	21.9	12.3	28.0	34.6	88.9	205.3	89.852	8.470	81.382	1
4	Alabama	1011	Bullock County	10985	7.526	72.831	45.241	7.8	26580	34.5	14.1	31.7	43.0	17.5	211.2	24.229	75.090	-50.862	0

In [None]:
### edTest(test_data) ###
# Predictors for both splits: the columns minority and bachelor
X_train, X_test = elect_train[['minority', 'bachelor']], elect_test[['minority', 'bachelor']]

# Response for both splits: the column "won"
y_train, y_test = elect_train['won'], elect_test['won']

In [None]:
### edTest(test_models) ###

# Decision Tree classifier with a depth of 2.
# random_state is pinned because scikit-learn randomly permutes the features
# at every split; without a fixed seed, splits with identical criterion
# improvement may be chosen differently from one run to the next.
dt1 = DecisionTreeClassifier(max_depth=2, random_state=42)

# Fit the depth-2 classifier on the train data
dt1.fit(X_train, y_train)

# Decision Tree classifier with a depth of 10, seeded the same way
dt2 = DecisionTreeClassifier(max_depth=10, random_state=42)

# Fit the depth-10 classifier on the train data
dt2.fit(X_train, y_train)

In [None]:
# Call the function plot_boundary from the helper file to get
# the decision boundaries of both the classifiers.
# NOTE(review): a later cell rebinds dt1 and dt2 to models fitted on eight
# predictors; run this cell before that one. Presumably plot_boundary expects
# the two-predictor (minority, bachelor) models - verify against the helper.
plot_boundary(elect_train, dt1, dt2)

In [None]:
# Set of predictor columns
pred_cols = ['minority', 'density','hispanic','obesity','female','income','bachelor','inactivity']

# Use the columns above as the features to
# get the predictor sets from the train and test data.
# Note: this rebinds X_train / X_test, replacing the predictors that were
# previously bound to these names.
X_train = elect_train[pred_cols]
X_test = elect_test[pred_cols]

# Initialize Decision Tree classifiers with depths of 2, 10 and 15.
# random_state is pinned because scikit-learn randomly permutes the features
# at every split; without a fixed seed, tied splits may be resolved differently
# between runs and the reported accuracies would not be reproducible.
dt1 = DecisionTreeClassifier(max_depth=2, random_state=42)
dt2 = DecisionTreeClassifier(max_depth=10, random_state=42)
dt3 = DecisionTreeClassifier(max_depth=15, random_state=42)

# Fit all the classifiers on the train data
dt1.fit(X_train, y_train)
dt2.fit(X_train, y_train)
dt3.fit(X_train, y_train)

In [None]:
### edTest(test_accuracy) ###

# Depth-2 classifier: accuracy on the train data, then on the test data
dt1_train_acc, dt1_test_acc = dt1.score(X_train, y_train), dt1.score(X_test, y_test)

# Depth-10 classifier: accuracy on the train data, then on the test data
dt2_train_acc, dt2_test_acc = dt2.score(X_train, y_train), dt2.score(X_test, y_test)

# Depth-15 classifier: accuracy on the train data, then on the test data
dt3_train_acc, dt3_test_acc = dt3.score(X_train, y_train), dt3.score(X_test, y_test)

In [None]:
# Helper code to show the scores of each classifier as a table:
# one row per tree depth, accuracies rounded to four decimal places
pt = PrettyTable()
pt.field_names = ['Max Depth', 'Number of Features', 'Train Accuracy', 'Test Accuracy']
n_features = len(pred_cols)
for depth, train_acc, test_acc in [
    (2, dt1_train_acc, dt1_test_acc),
    (10, dt2_train_acc, dt2_test_acc),
    (15, dt3_train_acc, dt3_test_acc),
]:
    pt.add_row([depth, n_features, round(train_acc, 4), round(test_acc, 4)])
print(pt)

+-----------+--------------------+----------------+---------------+
| Max Depth | Number of Features | Train Accuracy | Test Accuracy |
+-----------+--------------------+----------------+---------------+
|     2     |         8          |     0.8924     |     0.8862    |
|     10    |         8          |     0.9866     |     0.9126    |
|     15    |         8          |     0.9996     |     0.9004    |
+-----------+--------------------+----------------+---------------+