In [1]:
# Imports and pip installations (if needed)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Function display_prob_predict displays the probabilities predicted for each possible class

def display_prob_predict(prob_predict, labels=('setosa', 'versicolor', 'virginica')):
    """Display the class probabilities predicted for a single datapoint.

    Parameters
    ----------
    prob_predict : array-like
        Output of a classifier's ``predict_proba``. Either a 2-D array of
        shape (n_samples, n_classes), of which only the first row is shown,
        or a 1-D array of per-class probabilities.
    labels : sequence of str, optional
        Display name for each probability column, in column order.
        Defaults to the three iris species.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of probabilities.

    Notes
    -----
    The table is rendered with the notebook's ``display``; nothing is returned.
    """
    # Normalise to 2-D and keep only the row that is actually displayed.
    first_row = np.atleast_2d(np.asarray(prob_predict, dtype=float))[0]
    if len(labels) != len(first_row):
        raise ValueError(
            'expected %d probabilities (one per label), got %d'
            % (len(labels), len(first_row))
        )

    rows = [[label, '%.5f' % prob] for label, prob in zip(labels, first_row)]
    # Sum the displayed row only: summing the whole array would report
    # n_samples instead of ~1 when several datapoints are passed in.
    rows.append(['sum', '%.5f' % first_row.sum()])

    df = pd.DataFrame(rows, columns=['Labels', 'Probabilities'])
    display(df)

# Part 1: Load the dataset

In [3]:
# Load the dataset (load remotely, not locally)

# header=None tells pandas the file has no header row: the first record is
# kept as data and the columns are labelled with integer positions.
iris = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
)


In [4]:
# Output the first 15 rows of the data as a quick sanity check of the load.
# The frame was read with header=None, so the columns carry integer labels.

iris.head(15)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [5]:
# Display a summary of the table information (number of datapoints, etc.):
# row count, per-column non-null counts and dtypes, and memory usage.

iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       150 non-null    float64
 1   1       150 non-null    float64
 2   2       150 non-null    float64
 3   3       150 non-null    float64
 4   4       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


## About the dataset
### Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?



# Part 2: Split the dataset into train and test

In [6]:
# Take the dataset and split it into our features (X) and label (y)

# Columns 0-3 (the four float64 columns) form the feature matrix X.
X = iris.iloc[:, 0:4].values

# Column 4 (the object column holding the class name) is the label vector y.
y = iris.iloc[:, 4].values

In [7]:
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    train_size=0.9,
    shuffle=True,
    random_state=1,  # fixed seed so the shuffled split is reproducible
)

# Part 3: Logistic Regression

In [8]:
# i. Use sklearn to train a LogisticRegression model on the training set

# Only random_state is set; every other hyperparameter keeps sklearn's default.
# C (not set here) is the inverse of the regularization strength: smaller
# values regularize more strongly, which is the main knob against overfitting.
lr = LogisticRegression(random_state=1)
# fit() is the last expression in the cell, so the fitted estimator's repr is shown.
lr.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [9]:
# ii. For a sample datapoint, predict the probabilities for each possible class

# X[:1, :] slices rather than indexes, so the result stays 2-D (one row),
# which is the shape predict_proba takes.
# NOTE(review): the sample comes from the full X, not X_test, so it may be a
# point the model was trained on.
sample_datapoint = X[:1, :] # pick the first data point as the sample datapoint
prob_predict = lr.predict_proba(sample_datapoint)

display_prob_predict(prob_predict)

Unnamed: 0,Labels,Probabilities
0,setosa,0.98102
1,versicolor,0.01898
2,virginica,0.0
3,sum,1.0


In [10]:
# iii. Report on the score for Logistic regression model, what does the score measure?

# score() returns the mean accuracy: the fraction of samples whose predicted
# label equals the true label.
lr_train_accuracy = lr.score(X_train, y_train)
print('Logistic Regression Training Accuracy:', '%.5f'%lr_train_accuracy)

# Accuracy on the held-out 10% -- labelled as testing, not training, accuracy.
lr_test_accuracy = lr.score(X_test, y_test)
print('Logistic Regression Testing Accuracy:', '%.5f'%lr_test_accuracy)

Logistic Regression Training Accuracy: 0.97778
Logistic Regression Training Accuracy: 1.00000


## What does the score measure?

In [11]:
# iv. Extract the coefficients and intercepts for the boundary line(s)

# coef_ has one row of per-feature weights per class and intercept_ has one
# bias per class.
coefficients = lr.coef_
intercepts = lr.intercept_

coeff_list = ['coefficents']
intercept_list = ['intercepts']

# Flatten coef_ row by row. Iterating over every row (instead of indexing
# rows 0, 1 and 2 explicitly) gives the same list for the three iris classes
# and keeps working if the number of classes changes.
for class_coefficients in coefficients:
    coeff_list.extend(class_coefficients)
intercept_list.extend(intercepts.tolist())

# NOTE: set_option changes pandas' display precision globally, so it also
# applies to every frame displayed by later cells.
pd.set_option("display.precision", 5)
display(pd.DataFrame([coeff_list]))
display(pd.DataFrame([intercept_list]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,coefficents,-0.44135,0.85286,-2.4574,-1.005,0.55059,-0.31499,-0.17441,-0.93051,-0.10925,-0.53787,2.63181,1.93551


Unnamed: 0,0,1,2,3
0,intercepts,9.97994,1.85203,-11.83197


# Part 4: Support Vector Machine

In [12]:
# i. Use sklearn to train a Support Vector Classifier on the training set

# probability=True must be set before fit() so that predict_proba is available
# afterwards; per the sklearn docs this adds an internal cross-validation step
# and makes fitting slower. random_state seeds that step.
svm_linear = SVC(kernel='linear', probability=True, random_state=1)
# fit() is the last expression in the cell, so the fitted estimator's repr is shown.
svm_linear.fit(X_train, y_train)

SVC(kernel='linear', probability=True, random_state=1)

In [13]:
# ii. For a sample datapoint, predict the probabilities for each possible class

# Same sample as for the other models (first row of the full X, kept 2-D by
# slicing) so the predicted probabilities are directly comparable.
sample_datapoint = X[:1, :] # pick the first data point as the sample datapoint
# predict_proba works here because the SVC was built with probability=True.
prob_predict = svm_linear.predict_proba(sample_datapoint)

display_prob_predict(prob_predict)

Unnamed: 0,Labels,Probabilities
0,setosa,0.97293
1,versicolor,0.01774
2,virginica,0.00933
3,sum,1.0


In [14]:
# iii. Report on the score for the SVM, what does the score measure?

# score() returns the mean accuracy: the fraction of samples whose predicted
# label equals the true label.
svm_linear_train_accuracy = svm_linear.score(X_train, y_train)
print('Support Vector Machine Training Accuracy (linear kernel):', '%.5f'%svm_linear_train_accuracy)

# Accuracy on the held-out 10% -- labelled as SVM testing accuracy.
svm_linear_test_accuracy = svm_linear.score(X_test, y_test)
print('Support Vector Machine Testing Accuracy (linear kernel):', '%.5f'%svm_linear_test_accuracy)

Logistic Regression Training Accuracy (linear kernel): 0.98519
Logistic Regression Training Accuracy (linear kernel): 1.00000


## What does the score measure?

# Part 5: Neural Network

In [15]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set

# A single hidden layer of 15 units with logistic activation, trained with
# the L-BFGS solver for at most 600 iterations; random_state=1 makes the
# run repeatable.
nn = MLPClassifier(
    hidden_layer_sizes=(15,),
    activation='logistic',
    solver='lbfgs',
    max_iter=600,
    random_state=1,
)
nn.fit(X_train, y_train)

MLPClassifier(activation='logistic', hidden_layer_sizes=(15,), max_iter=600,
              random_state=1, solver='lbfgs')

In [16]:
# ii. For a sample datapoint, predict the probabilities for each possible class

# Same sample as for the other models (first row of the full X, kept 2-D by
# slicing) so the predicted probabilities are directly comparable.
sample_datapoint = X[:1, :] # pick the first data point as the sample datapoint
prob_predict = nn.predict_proba(sample_datapoint)

display_prob_predict(prob_predict)

Unnamed: 0,Labels,Probabilities
0,setosa,0.99986
1,versicolor,0.00014
2,virginica,0.0
3,sum,1.0


In [17]:
# iii. Report on the score for the Neural Network, what does the score measure?
# iv: Experiment with different options for the neural network, report on your best configuration 
# (the highest score I was able to achieve was 0.8666)

# Score both splits up front, then print the two results.
nn_training_accuracy = nn.score(X_train, y_train)
nn_testing_accuracy = nn.score(X_test, y_test)

print('Neural Network Training Accuracy:', '%.5f' % nn_training_accuracy)
print('Neural Network Testing Accuracy:', '%.5f' % nn_testing_accuracy)

Neural Network Training Accuracy: 1.00000
Neural Network Testing Accuracy: 1.00000


## What does the score measure?

# Part 6: K-Nearest Neighbors

In [18]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training.
# fit() essentially just loads the training data into the model; see the link
# below for more information:
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier

# n_neighbors=1: each prediction is decided by the single closest stored point.
knn = KNeighborsClassifier(n_neighbors=1)
# fit() is the last expression in the cell, so the fitted estimator's repr is shown.
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [19]:
# ii. For a sample datapoint, predict the probabilities for each possible class

# Same sample as for the other models (first row of the full X, kept 2-D by slicing).
sample_datapoint = X[:1, :] # pick the first data point as the sample datapoint
# With n_neighbors=1 there is a single vote, so presumably every probability
# is exactly 0 or 1 rather than a graded estimate.
prob_predict = knn.predict_proba(sample_datapoint)

display_prob_predict(prob_predict)

Unnamed: 0,Labels,Probabilities
0,setosa,1.0
1,versicolor,0.0
2,virginica,0.0
3,sum,1.0


In [20]:
# iii. Report on the score for kNN, what does the score measure?

# Score both splits up front, then print the two results.
knn_training_accuracy = knn.score(X_train, y_train)
knn_testing_accuracy = knn.score(X_test, y_test)

print('K-Nearest Neighbors Training Accuracy:', '%.5f' % knn_training_accuracy)
print('K-Nearest Neighbors Testing Accuracy:', '%.5f' % knn_testing_accuracy)

K-Nearest Neighbors Training Accuracy: 1.00000
K-Nearest Neighbors Testing Accuracy: 1.00000


# Part 7: Conclusions and takeaways