In [68]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import env 
from util import get_db_url
import acquire
import prepare
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import graphviz

Use Iris data set.

** Use my prep_iris function to pull in my prepared iris dataset.

In [2]:
df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [3]:
x = df[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']]
y = df[['Species']]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.7, random_state=123)
x_train.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
115,5.8,2.8,5.1,2.4
137,6.3,3.4,5.6,2.4
54,5.5,2.3,4.0,1.3
20,5.1,3.8,1.5,0.3
39,4.4,3.0,1.3,0.2


### Logistic Regression

1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample

In [5]:
# Create the object
logit = LogisticRegression(random_state=123)
# Fit the object to the training data
logit.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [6]:
print('Coefficient:', logit.coef_)
print('Intercept:', logit.intercept_)

Coefficient: [[ 0.38421538  1.32718255 -2.11307588 -0.94269552]
 [ 0.43099717 -1.34596217  0.4506587  -1.07117492]
 [-1.517952   -1.52141607  2.26046444  2.12613123]]
Intercept: [ 0.25726194  0.58107381 -0.87235291]


Estimate which species it is.

In [7]:
y_pred = logit.predict(x_train)
y_pred

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor', 'virginica',
       'setosa', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'versicolor', 'virginica', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'virginica', 'virginica',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'setosa', 'setosa', 'versicolor', 'setosa', 'virginica',
       'virginica', 'setosa', 'virginica', 'setosa', 'setosa',
       'versicolor', 'setosa', 'setosa', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'setosa', 'setosa',
       'versicolor', 'virginica', 'setosa', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'se

Estimate probability it is a species.

In [8]:
y_pred_proba = logit.predict_proba(x_train)
y_pred_proba

array([[8.82438477e-04, 2.27909291e-01, 7.71208271e-01],
       [9.04840755e-04, 1.89452140e-01, 8.09643019e-01],
       [1.61592108e-02, 6.53564224e-01, 3.30276565e-01],
       [8.89718868e-01, 1.10252519e-01, 2.86122251e-05],
       [8.02996773e-01, 1.96869865e-01, 1.33362269e-04],
       [4.01764007e-03, 3.66910909e-01, 6.29071451e-01],
       [8.27624301e-01, 1.72231115e-01, 1.44583818e-04],
       [7.68419373e-01, 2.31515414e-01, 6.52132008e-05],
       [2.05897310e-02, 7.67212016e-01, 2.12198253e-01],
       [1.73946587e-02, 6.62952984e-01, 3.19652358e-01],
       [2.43782930e-02, 6.69593647e-01, 3.06028060e-01],
       [1.28790640e-01, 8.21403635e-01, 4.98057249e-02],
       [1.16916979e-03, 3.32890868e-01, 6.65939962e-01],
       [4.72232958e-02, 8.47334030e-01, 1.05442674e-01],
       [1.54834217e-03, 2.44290941e-01, 7.54160717e-01],
       [6.75565848e-01, 3.24060721e-01, 3.73431196e-04],
       [9.74637491e-04, 3.03672885e-01, 6.95352477e-01],
       [4.98965898e-02, 7.39623

Model Score:

In [9]:
format(logit.score(x_train, y_train))

'0.9523809523809523'

Confusion Matrix:

In [10]:
print(confusion_matrix(y_train, y_pred))

[[32  0  0]
 [ 0 36  4]
 [ 0  1 32]]


Classification Report:

In [11]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.90      0.94        40
   virginica       0.89      0.97      0.93        33

    accuracy                           0.95       105
   macro avg       0.95      0.96      0.95       105
weighted avg       0.95      0.95      0.95       105



3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [12]:
sorted(y_train.Species.unique())
labels = sorted(y_train.Species.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,32,0,0
versicolor,0,36,4
virginica,0,1,32


**Establish my true positive, false positive, true negative, false negative, precision, f1-score and support.**

In [55]:
cm = confusion_matrix(y_train, y_pred)
cm

# axis sums the column then take the diag out of it, everything other than that diag is a fp
# same applies to fn 
FP = cm.sum(axis=0) - np.diag(cm)
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

**The default for the 'solver' parameter is 'liblinear'. It is better for smaller datasets, whereas 'sag' and 'saga' are better for larger datasets.**

5. Run through steps 2-4 using another solver (from question 5)

In [13]:
# Create the object
logit = LogisticRegression(solver='saga', random_state=123)
# Fit the object to the training data
logit.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
print('Coefficient:', logit.coef_)
print('Intercept:', logit.intercept_)

Coefficient: [[ 0.3389176   1.28789683 -2.11394313 -0.94719182]
 [ 0.39414558 -1.38109062  0.44038036 -1.02856616]
 [-1.4275007  -1.42459066  2.29209122  2.06589358]]
Intercept: [ 0.63153571  0.89381264 -1.78105991]


In [15]:
y_pred = logit.predict(x_train)
y_pred

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor', 'virginica',
       'setosa', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'versicolor', 'virginica', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'virginica', 'virginica',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'setosa', 'setosa', 'versicolor', 'setosa', 'virginica',
       'virginica', 'setosa', 'virginica', 'setosa', 'setosa',
       'versicolor', 'setosa', 'setosa', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'setosa', 'setosa',
       'versicolor', 'virginica', 'setosa', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'se

In [16]:
y_pred_proba = logit.predict_proba(x_train)
y_pred_proba

array([[8.67435478e-04, 2.35498121e-01, 7.63634443e-01],
       [8.49600933e-04, 1.89809203e-01, 8.09341196e-01],
       [1.68970084e-02, 6.79014303e-01, 3.04088689e-01],
       [8.90674998e-01, 1.09297754e-01, 2.72485150e-05],
       [7.98513598e-01, 2.01377139e-01, 1.09263285e-04],
       [3.76174403e-03, 3.62859457e-01, 6.33378799e-01],
       [8.26275367e-01, 1.73594665e-01, 1.29967566e-04],
       [7.67130626e-01, 2.32811936e-01, 5.74384239e-05],
       [1.94116097e-02, 7.57332196e-01, 2.23256195e-01],
       [1.69466971e-02, 6.63546313e-01, 3.19506990e-01],
       [2.51800794e-02, 6.92193366e-01, 2.82626554e-01],
       [1.29063863e-01, 8.25508228e-01, 4.54279085e-02],
       [1.16188682e-03, 3.39547548e-01, 6.59290565e-01],
       [4.49767463e-02, 8.43034019e-01, 1.11989235e-01],
       [1.45979119e-03, 2.45077195e-01, 7.53463014e-01],
       [6.69370908e-01, 3.30345134e-01, 2.83958352e-04],
       [8.99995318e-04, 2.97905501e-01, 7.01194504e-01],
       [4.96681113e-02, 7.46365

In [17]:
format(logit.score(x_train, y_train))

'0.9619047619047619'

In [18]:
print(confusion_matrix(y_train, y_pred))

[[32  0  0]
 [ 0 37  3]
 [ 0  1 32]]


In [19]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.97      0.93      0.95        40
   virginica       0.91      0.97      0.94        33

    accuracy                           0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105



In [20]:
sorted(y_train.Species.unique())
labels = sorted(y_train.Species.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,32,0,0
versicolor,0,37,3
virginica,0,1,32


**Test the model:**

In [66]:
print('Test data model score:', format(logit.score(x_test, y_test)))

Test data model score: 0.9777777777777777


6. Which performs better on your in-sample data?

**The solver 'saga' performs better with the in-sample data.**

### Decision Tree 

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [20]:
df_iris = data('iris')
df_iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [21]:
df_iris.columns=[col.lower().replace('.', '_')for col in df_iris]
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [22]:
x = df_iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df_iris[['species']]

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.7, random_state=123)
print(x_train.head())
print(y_train.head())

     sepal_length  sepal_width  petal_length  petal_width
115           5.8          2.8           5.1          2.4
137           6.3          3.4           5.6          2.4
54            5.5          2.3           4.0          1.3
20            5.1          3.8           1.5          0.3
39            4.4          3.0           1.3          0.2
        species
115   virginica
137   virginica
54   versicolor
20       setosa
39       setosa


Create my object:

In [24]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)
tree

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

Fit the model:

In [25]:
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

Estimate the species:

In [26]:
y_pred = tree.predict(x_train)
y_pred[0:5]
# This is done to just look at the first row of predictions

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa'],
      dtype=object)

Estimate the probabilities of species:

In [27]:
y_pred_proba = tree.predict_proba(x_train)
y_pred_proba

array([[0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [0.

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

Model Score:

In [28]:
format(tree.score(x_train, y_train))

'0.9809523809523809'

Confusion Matrix:

In [29]:
confusion_matrix(y_train, y_pred)

array([[32,  0,  0],
       [ 0, 40,  0],
       [ 0,  2, 31]])

In [30]:
sorted(y_train.species.unique())

['setosa', 'versicolor', 'virginica']

In [31]:
y_train.species.value_counts()

versicolor    40
virginica     33
setosa        32
Name: species, dtype: int64

In [33]:
labels = sorted(y_train.species.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,32,0,0
versicolor,0,40,0
virginica,0,2,31


Classification Report:

In [34]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.95      1.00      0.98        40
   virginica       1.00      0.94      0.97        33

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps 2-4 using entropy as your measure of impurity.

In [36]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)
tree

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [37]:
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [38]:
y_pred = tree.predict(x_train)
y_pred[0:5]

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa'],
      dtype=object)

In [39]:
y_pred_proba = tree.predict_proba(x_train)
y_pred_proba

array([[0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [0.

In [40]:
format(tree.score(x_train, y_train))

'0.9809523809523809'

In [41]:
confusion_matrix(y_train, y_pred)

array([[32,  0,  0],
       [ 0, 40,  0],
       [ 0,  2, 31]])

In [42]:
sorted(y_train.species.unique())

['setosa', 'versicolor', 'virginica']

In [43]:
y_train.species.value_counts()

versicolor    40
virginica     33
setosa        32
Name: species, dtype: int64

In [44]:
labels = sorted(y_train.species.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,32,0,0
versicolor,0,40,0
virginica,0,2,31


In [45]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.95      1.00      0.98        40
   virginica       1.00      0.94      0.97        33

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



5. Which performs better on your in-sample data?

**There is no major difference between the gini and entropy when I run my model.**

Visualize the accuracy:

In [69]:
## need to install graphviz to anaconda
## example: 

from sklearn.datasets import load_iris

# iris = load_iris()
# clf = DecisionTreeClassifier()
# clf = clf.fit(iris.data, iris.target)

# import graphviz

# from graphviz import Graph

# dot_data = export_graphviz(clf, out_file=None) 
# graph = graphviz.Source(dot_data) 

# graph.render('iris_decision_tree', view=True)

### Random Forest

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?