In [1]:
# Imports and pip installations (if needed)
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from IPython.display import Markdown as md
from sklearn.datasets import load_iris

# Part 1: Load the dataset

In [2]:
# Load the dataset via scikit-learn's loader rather than from a local file path
iris_data = load_iris()

# Unpack the measurement matrix and the class-label vector in one step
features, target = iris_data.data, iris_data.target

In [3]:
# Output the first 15 rows of the data
# Measurements get the dataset's own feature names as column headers;
# the class labels go into a single 'target' column.
df_features = pd.DataFrame(data=features, columns=iris_data.feature_names)
df_target = pd.DataFrame({'target': target})

# Place the label column to the right of the four feature columns.
df = pd.concat((df_features, df_target), axis='columns')
df.head(n=15)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [4]:
# Display a summary of the table information (number of datapoints, etc.)
# Descriptive statistics (count, mean, std, quartiles) for every column.
display(df.describe())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after the report.
df.info()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
target               150 non-null int64
dtypes: float64(4), int64(1)
memory usage: 5.9 KB


None

## About the dataset

Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?


In [5]:
# Show the column names of the feature matrix and the class names that the
# integer labels stand for.
for heading, names in (
    ("Feature Names : ", iris_data.feature_names),
    ("Target Names  : ", iris_data.target_names),
):
    print(heading, names)

Feature Names :  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target Names  :  ['setosa' 'versicolor' 'virginica']


> After exploring the iris dataset, it can be observed that there are 4 features (4 columns)
> 1. Sepal Length (cm)
> 2. Sepal Width  (cm)
> 3. Petal Length  (cm)
> 4. Petal Width   (cm)

> The label in the dataset is the species class (3 categories),
> where the values 0, 1 and 2 correspond to the following species:
> 1. Iris setosa (value = 0)
> 2. Iris versicolor (value = 1)
> 3. Iris virginica (value = 2)


# Part 2: Split the dataset into train and test


In [6]:
# Take the dataset and split it into our features (X) and label (y)
X, y = features, target

# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
# random_state is fixed so the same rows land in each set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)


# Part 3: Logistic Regression


In [7]:
# i. Use sklearn to train a LogisticRegression model on the training set
# The estimator is created with its default hyper-parameters. fit() returns the
# fitted estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Echo the fitted estimator so the cell still displays its configuration.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
# ii. For a sample datapoint, predict the probabilities for each possible class
# One hand-made flower, with values in the same order as the feature columns.
sample_point = [3, 1, 1, 2]

# Wrap the point in an outer list so the array has one row and four columns.
sample_dataset = np.array([sample_point])

# One probability per class for the single sample row.
model.predict_proba(sample_dataset)

array([[0.27017898, 0.24918983, 0.4806312 ]])

In [19]:
# iii. Report on the score for Logistic regression model, what does the score measure?
# Predict a class for every held-out sample; y_pred is reused by the next cell.
y_pred = model.predict(X_test)

# Here is the classification report for the model
# (per-class precision, recall, f1-score and support on the test set)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         3

   micro avg       1.00      1.00      1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15



In [26]:
# Here is the confusion matrix
# Uses y_pred from the classification-report cell above.
# The label is printed on its own line: when an array follows a prefix on the
# same line only its first row is shifted, so the columns no longer line up.
# Rows are the true classes, columns are the predicted classes.
print("Confusion Matrix :")
print(confusion_matrix(y_test, y_pred))
print("\n")
# For a classifier, score() is the mean accuracy on the given data and labels.
print("Score : ", model.score(X_test, y_test))

Confusion Matrix :  [[6 0 0]
 [0 6 0]
 [0 0 3]]


Score :  1.0


#### Report on the score:
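> I printed a classification report for the model, which shows the precision, recall and F1-score for each class. All of them are 1.00, which means every prediction made on the test set was correct. The score is also 1.0: `score` measures the mean accuracy, i.e. the fraction of test samples whose predicted class matches the true class, so the Logistic Regression classifier classified 100% of the test samples correctly. Note that the test set contains only 15 samples, so this is a rough estimate of how the model would perform on new data.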


> I also printed confusion matrix for this model and here are the results:
> * 6 out of 6 True positives for setosa
> * 6 out of 6 True positives for versicolor
> * 3 out of 3 True positives for virginica

In [14]:
# iv. Extract the coefficients and intercepts for the boundary line(s)
# coef_ holds one row per class and one column per feature. The label is
# printed on its own line so that the rows of the matrix stay aligned.
print("Coefficients :")
print(model.coef_)
print("\n")
# One intercept per class, in the same class order as the rows of coef_.
print("Intercepts: ", model.intercept_)

Coefficients :  [[ 0.39031289  1.4402029  -2.20275823 -0.98006197]
 [ 0.45354842 -1.57084924  0.51006305 -1.31515818]
 [-1.59624782 -1.65120099  2.47051565  2.31572047]]


Intercepts:  [ 0.26063178  0.93734277 -1.13607679]



# Part 4: Support Vector Machine


In [None]:
# i. Use sklearn to train a Support Vector Classifier on the training set

# ii. For a sample datapoint, predict the probabilities for each possible class

# iii. Report on the score for the SVM, what does the score measure?

#  Part 5: Neural Network


In [None]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set

# ii. For a sample datapoint, predict the probabilities for each possible class

# iii. Report on the score for the Neural Network, what does the score measure?

# iv: Experiment with different options for the neural network, report on your best configuration (the highest score I was able to achieve was 0.8666)

# Part 6: K-Nearest Neighbors
 

In [None]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier

# ii. For a sample datapoint, predict the probabilities for each possible class

# iii. Report on the score for kNN, what does the score measure?

# Part 7: Conclusions and takeaways

In your own words describe the results of the notebook. Which model(s) performed the best on the dataset? Why do you think that is? Did anything surprise you about the exercise?
