In [None]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: Decision Tree                     ###
### Date: 05/24/2018                           ###
##################################################

# https://medium.com/@haydar_ai/learning-data-science-day-21-decision-tree-on-iris-dataset-267f3219a7fa
    
######################################################################
########                     Import Modules                   ########
######################################################################
import py_effo as py_effo

### pandas
# Pandas is for structured data operations and manipulations, extensively used for data preparation
import pandas as pd

### numpy
# NumPy stands for Numerical Python, a library that contains basic linear algebra functions, Fourier transforms and advanced
# random number capabilities
import numpy as np 

### Scipy
# SciPy performs a host of statistical and scientific calculations and is built on top of NumPy; it does not replace NumPy,
# which is why NumPy is still imported on its own above
# https://oneau.wordpress.com/2011/02/28/simple-statistics-with-scipy/
import scipy as sp

### sklearn
# Sklearn contains basic statistical models
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# As well as a module to calculate model performance statistics
from sklearn import metrics, model_selection, tree

### Statsmodels
# Statsmodels contains statistical models, statistical tests and example data sets
import statsmodels.api as sm

### Matplotlib
# Matplotlib is a Python based plotting library with complete 2D support and limited 3D support
%matplotlib inline
import matplotlib as mlb
import matplotlib.pyplot as plt

### Seaborn
# Seaborn is a Python visualization library based on Matplotlib, providing a high-level interface for statistical graphing
# Seaborn supports numpy and pandas data structures as well as statistical routines from scipy and statsmodels
# Note: https://seaborn.pydata.org/introduction.html
import seaborn as sns

### String
# Allows for more flexible solutions for dealing with string characters
import string as st

In [None]:
######################################################################
########                     Import Data                      ########
######################################################################

### Iris Data
# Load the iris data set through scikit-learn's load_iris helper
iris = load_iris()

### Keep the names scikit-learn gives to the flower measurements (the features)
### and show them so we know which columns we are working with
features = iris.feature_names
print('Iris Feature Names in ndarry', features, sep = '\n')

######################################################################
########                  Data Exploration                    ########
######################################################################

### If the iris measurements were already a data frame we could rename the columns in the following manner
# x = x.rename(columns = {'sepal length (cm)':'sepal_length',
#                        'sepal width (cm)':'sepal_width',
#                        'petal length (cm)':'petal_length',
#                        'petal width (cm)':'petal_width'})

### Since iris.data and iris.target are passed in as raw arrays we build the data frames ourselves,
### supplying friendlier column names for the features (df) and the target (y)
df = pd.DataFrame(data = iris.data, columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y = pd.DataFrame(data = iris.target, columns = ['species'])

### Let's list the feature columns of this data frame
print()
print("Iris Feature Names in Data Frame")
for k in df.keys():
    print(k)

print()
print("Are there any null values in our data frame?")
# A bare expression in the middle of a cell is never displayed, so the answer has to be printed explicitly
print(df.isnull().any())

print()
print("What are the data types of Iris' Features?")
print(df.dtypes)

print()
print("Summary of Iris Data")
print(df.describe())

### petal_width has a very large range between the maximum and minimum, let's view this
# Title and axis label are set so the figure can be read on its own when the notebook is skimmed
ax = df['petal_width'].plot.hist()
ax.set(title = 'Distribution of petal_width', xlabel = 'petal_width')
plt.show()

### Does not seem to be too odd given that it is probably a characteristic of the flower species
### We should proceed

In [None]:
######################################################################
########                  Data Manipulation                   ########
######################################################################

### Species is stored as an integer code; swap each code (0, 1, 2) for the matching species name
for species_code, species_name in enumerate(('setosa', 'versicolor', 'virginica')):
    y = y.replace(species_code, species_name)

### Attach the dependent variable to the independent variables so we have one complete data frame
iris = df.join(y)
print(iris.head(3))
print()
######## How the Features Interact ######## 
sns.pairplot(iris, hue = 'species')

In [None]:
######################################################################
########                  Model Preparation                   ########
######################################################################

### Split the data set into independent variables (x) and the dependent variable (y), both as numpy arrays
x = iris.drop(['species'], axis = 1).values
y = iris['species'].values

### Split the data set into training and testing data sets; 70 - 30 split
### random_state takes an integer seed (it is not a binary switch): a fixed seed reproduces the same split on every
### run, while leaving it out gives a different random split each time. We fix it so the results below are reproducible
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.3, random_state = 0)

In [None]:
######################################################################
########                     Modeling                         ########
######################################################################

### First we will create a Decision Tree object to use in our model
dtc = DecisionTreeClassifier()

### Then we will train the Decision Tree model using our training data set
dtc.fit(x_train, y_train)

### Lastly, we will score the model using our testing data set
# score() may hand back either a numpy scalar or a plain Python float depending on the scikit-learn version;
# the built-in round() handles both, whereas calling .round() on a plain float raises AttributeError
accuracy = dtc.score(x_test, y_test)
print("Decision Tree Accuracy: " + str(round(accuracy * 100, 2)) + "%")

In [None]:
######################################################################
########                  Model Validation                    ########
######################################################################

### The shuffle validator applies the same 70-30 split as before, but draws 20 independent random splits of the data
### By passing the shuffle validator as the cv parameter of cross_val_score we can score our classifier against each
### of the different splits and compute their accuracy
# Note: the old sklearn.cross_validation module no longer exists; ShuffleSplit now lives in sklearn.model_selection,
# takes n_splits instead of n_iter, and no longer needs the number of samples up front
shuffle_validator = model_selection.ShuffleSplit(n_splits = 20, test_size = 0.3, train_size = 0.7, random_state = 0)
def test_classifier(clf):
    """Cross-validate clf on the notebook-level x and y using shuffle_validator.

    Prints the mean accuracy across the splits followed by the standard deviation
    of the per-split accuracies.

    Parameters
    ----------
    clf : estimator
        Classifier handed to cross_val_score.
    """
    scores = model_selection.cross_val_score(clf, x, y, cv = shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))

### Across the random splits this is the average share of correctly predicted species, together with the
### standard deviation of that share from split to split
test_classifier(dtc)

In [None]:
### Score a fresh decision tree with 10-fold cross-validation, using accuracy as the metric
scores = cross_val_score(DecisionTreeClassifier(), x, y, cv = 10, scoring = 'accuracy')

### Express the per-fold accuracies and their average as whole percentages
fold_percentages = (scores * 100).round(0)
average_percentage = (scores.mean() * 100).round(0)

### Report the individual folds first, then the average
print('Cross Validation Scores')
print(fold_percentages)
print()
print('Average Scores')
print(str(average_percentage) + '%')