In [1]:
# Imports:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import random as r

## Part 1: Load the dataset

In [2]:
# Load the dataset (load remotely, not locally)

csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

In [3]:
col_names = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width','Species']

In [4]:
# The raw file has no header row, so the column names are supplied explicitly.
df = pd.read_csv(csv_url, header=None, names=col_names)

In [5]:
# Output the first 15 rows of the data

df.head(15)

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [6]:
# Check how many distinct classes (species labels) we are working with

df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [7]:
# Display a summary of the table information (number of datapoints, etc.)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal_Length  150 non-null    float64
 1   Sepal_Width   150 non-null    float64
 2   Petal_Length  150 non-null    float64
 3   Petal_Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [8]:
df.shape

(150, 5)

In [9]:
df.isnull().sum()

Sepal_Length    0
Sepal_Width     0
Petal_Length    0
Petal_Width     0
Species         0
dtype: int64

In [10]:
df.duplicated().sum()

3

In [11]:
# df.duplicated().sum() found 3 fully duplicated rows, so remove them
# (150 -> 147 rows). drop_duplicates() keeps the first occurrence of each row
# by default and does not renumber the index.

df = df.drop_duplicates()

In [12]:
df.shape

(147, 5)

In [13]:
df.describe()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
count,147.0,147.0,147.0,147.0
mean,5.856463,3.055782,3.780272,1.208844
std,0.8291,0.437009,1.759111,0.757874
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [14]:
# This helper is used repeatedly throughout this assignment.
# It generates a synthetic data point by drawing a random value for each of the
# 4 features, uniformly from within that feature's [min, max] range.

def generate_sample_datapoint(rng=None):
    """Return one random iris-like sample as a list of four floats.

    The values are ordered [sepal length, sepal width, petal length,
    petal width]. Each one is drawn uniformly between the minimum and maximum
    listed for that feature below and rounded to 2 decimal places.

    Parameters
    ----------
    rng : object with a ``uniform(a, b)`` method, optional
        Random source to draw from, e.g. ``random.Random(42)`` for a
        reproducible sample. When omitted, the module-level ``random``
        generator (imported as ``r``) is used, exactly as before.

    Returns
    -------
    list of float
        ``[sepal_length, sepal_width, petal_length, petal_width]``.
    """
    if rng is None:
        rng = r

    # (min, max) per feature, matching the min/max rows of df.describe().
    # The order here fixes both the order of the random draws and the order
    # of the values in the returned list.
    feature_bounds = (
        (4.3, 7.9),  # Sepal_Length
        (2.0, 4.4),  # Sepal_Width
        (1.0, 6.9),  # Petal_Length
        (0.1, 2.5),  # Petal_Width
    )

    return [round(rng.uniform(low, high), 2) for low, high in feature_bounds]

In [15]:
# testing the function

generate_sample_datapoint()

[4.41, 2.7, 6.73, 1.96]

### About the dataset:
- The features are sepal length, sepal width, petal length, and petal width
- The label is the species of iris
- There are 3 possible classifications (labels) of iris species: 
    - setosa
    - versicolor
    - virginica
- Each row is mapped to one of three possible types of irises


## Part 2: Split the dataset into train and test

In [16]:
df.columns

Index(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width',
       'Species'],
      dtype='object')

In [17]:
# create a list of feature columns

features = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 
            'Petal_Width']

In [18]:
# define my X(features) and y(the thing I am trying to predict)

X = df[features]
y = df['Species']

In [19]:
# With the features (X) and label (y) separated, use sklearn to split them into
# a training set (90%) and a test set (10%). random_state pins the shuffle so
# the same rows land in each set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

## Part 3: Logistic Regression

In [20]:
# i. Use sklearn to train a LogisticRegression model on the training set

lr = LogisticRegression(solver='lbfgs', max_iter=200)

In [21]:
# fit my model

lr.fit(X_train, y_train)

LogisticRegression(max_iter=200)

In [22]:
y_pred = lr.predict(X_test)

In [23]:
# ii. For a sample datapoint, predict the probabilities for each possible class

# Create a sample datapoint using the function from earlier. It is wrapped in a
# one-row DataFrame with the training column names so the model receives input
# in the same form it was fitted on (a bare list has no feature names).
sample = pd.DataFrame([generate_sample_datapoint()], columns=features)

# predict on the sample
prediction = lr.predict(sample)
print('Prediction on sample:', prediction[0])

# Predicted probabilities for each class for this sample. The columns returned
# by predict_proba follow lr.classes_, so the table is labelled from that
# attribute instead of a hand-typed list that could drift out of order.
prediction_probas = lr.predict_proba(sample)

predict_columns = list(lr.classes_)
probs_df = pd.DataFrame(prediction_probas, columns=predict_columns)
probs_df

Prediction on sample: Iris-virginica




Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,3.5e-05,0.104954,0.895011


In [24]:
# iii. Report on the score for Logistic regression model, what does the score measure?

accuracy = lr.score(X_test, y_test)
print(accuracy)

0.8666666666666667


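This score is the mean accuracy: the ratio of correct predictions to total predictions on the test set. A score of 1.0 would mean that 100% of the test data was predicted correctly; here 0.867 means that 13 of the 15 test samples were classified correctly.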

In [25]:
print(metrics.classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         5
Iris-versicolor       0.80      0.80      0.80         5
 Iris-virginica       0.80      0.80      0.80         5

       accuracy                           0.87        15
      macro avg       0.87      0.87      0.87        15
   weighted avg       0.87      0.87      0.87        15



In [26]:
# iv. Extract the coefficients and intercepts for the boundary line(s)

# The task asks for both, so keep the intercepts alongside the coefficients.
# The coefficients remain the cell's displayed value; evaluate
# `intercept_values` in its own cell to display the intercepts.
intercept_values = lr.intercept_
coefficient_values = lr.coef_
coefficient_values

array([[-0.47319118,  0.87943959, -2.47445836, -0.99907691],
       [ 0.42635048, -0.30015835, -0.2550637 , -0.82522932],
       [ 0.0468407 , -0.57928124,  2.72952206,  1.82430624]])

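Reference notes: the general pattern for reading predictions and class probabilities from a fitted sklearn model. The variable names below come from a two-class abusive / not-abusive example, not from the iris data.

- predictions = model.predict(X)
- prediction = predictions[0]
- predicted_probas = model.predict_proba(X)
- predicted_proba = predicted_probas[0]
- percent_abusive = predicted_proba[0]
- percent_notAbusive = predicted_proba[1]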

## Part 4: Support Vector Machine

In [27]:
from sklearn import svm

In [28]:
# probability=True enables predict_proba. Per the scikit-learn docs, the
# probability estimates involve shuffling controlled by random_state, so it is
# pinned here to make the reported probabilities reproducible across runs.
supvec = svm.SVC(max_iter=100, probability=True, random_state=42)

In [29]:
# i. Use sklearn to train a Support Vector Classifier on the training set

supvec.fit(X_train, y_train)

SVC(max_iter=100, probability=True)

In [30]:
# iii. Report on the score for the SVM, what does the score measure?

supvec.score(X_test, y_test)

0.8666666666666667

This score is the mean accuracy on the test set: the model predicted about 86.7% of the test samples (13 of 15) correctly.

In [31]:
y_pred = supvec.predict(X_test)

In [32]:
print(metrics.classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         5
Iris-versicolor       0.80      0.80      0.80         5
 Iris-virginica       0.80      0.80      0.80         5

       accuracy                           0.87        15
      macro avg       0.87      0.87      0.87        15
   weighted avg       0.87      0.87      0.87        15



In [33]:
# ii. For a sample datapoint, predict the probabilities for each possible class

# Wrap the generated values in a one-row DataFrame with the training column
# names so the model receives input in the same form it was fitted on.
sample = pd.DataFrame([generate_sample_datapoint()], columns=features)

In [34]:
prediction = supvec.predict(sample)



In [35]:
prediction[0]

'Iris-virginica'

In [36]:
prediction_probas = supvec.predict_proba(sample)

# predict_proba columns follow supvec.classes_, so label the table from that
# attribute rather than a hand-typed list.
predict_columns = list(supvec.classes_)
probs_df = pd.DataFrame(prediction_probas, columns=predict_columns)
probs_df



Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,0.013506,0.015195,0.971299


## Part 5: Neural Network

In [37]:
from sklearn.neural_network import MLPClassifier

In [38]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set

# hidden_layer_sizes takes one entry per hidden layer. `(3)` is just the int 3;
# a one-element tuple needs the trailing comma. This is a single hidden layer
# of 3 units.
mlp_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3,), random_state=1)

In [39]:
mlp_classifier.fit(X_train, y_train)     

MLPClassifier(alpha=1e-05, hidden_layer_sizes=3, random_state=1, solver='lbfgs')

In [40]:
mlp_classifier.score(X_train, y_train)

0.3409090909090909

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
# Accuracy of the fitted network on the data it was trained on and on the
# held-out test set.
predictions_train = mlp_classifier.predict(X_train)
predictions_test = mlp_classifier.predict(X_test)

# accuracy_score's signature is (y_true, y_pred): the ground truth goes first.
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)

score on train data:  0.3409090909090909
score on test data:  0.3333333333333333


In [43]:
# ii. For a sample datapoint, predict the probabilities for each possible class

# Wrap the generated values in a one-row DataFrame with the training column
# names so the model receives input in the same form it was fitted on.
sample = pd.DataFrame([generate_sample_datapoint()], columns=features)

In [44]:
prediction = mlp_classifier.predict(sample)



In [45]:
prediction[0]

'Iris-versicolor'

In [46]:
prediction_probas = mlp_classifier.predict_proba(sample)

# predict_proba columns follow mlp_classifier.classes_, so label the table
# from that attribute rather than a hand-typed list.
predict_columns = list(mlp_classifier.classes_)
probs_df = pd.DataFrame(prediction_probas, columns=predict_columns)
probs_df



Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,0.325756,0.340908,0.333337


In [47]:
# iii. Report on the score for the Neural Network, what does the score measure?

In [48]:
# iv: Experiment with different options for the neural network, report on your best configuration (the highest score I was able to achieve was 0.8666)

## Part 6: K-Nearest Neighbors

In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [50]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier

In [51]:
# k_range = range(1, 26)
# scores = {}
# scores_list = []

In [52]:
# for k in k_range:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train, y_train)
#     y_pred = knn.predict(X_test)
#     scores[k] = metrics.accuracy_score(y_test, y_pred)
#     scores_list.append(metrics.accuracy_score(y_test, y_pred))

In [53]:
#plt.plot(k_range, scores_list)

In [54]:
# Instantiate learning model (k = 3)
knn = KNeighborsClassifier(n_neighbors=3)


In [55]:
# Fitting the model
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [56]:
# ii. For a sample datapoint, predict the probabilities for each possible class

In [57]:
# Wrap the generated values in a one-row DataFrame with the training column
# names so the model receives input in the same form it was fitted on.
sample = pd.DataFrame([generate_sample_datapoint()], columns=features)

In [58]:
# Integer code -> species label lookup. The middle label was misspelt as
# 'Iris-versilicolor', which matches no value in the Species column.
classses = {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'}

In [59]:
y_predict = knn.predict(sample)



In [60]:
y_predict

array(['Iris-virginica'], dtype=object)

In [61]:
probas = knn.predict_proba(sample)



In [62]:
# predict_proba columns follow knn.classes_, so label the table from that
# attribute rather than a hand-typed list.
predict_columns = list(knn.classes_)
probs_df = pd.DataFrame(probas, columns=predict_columns)
probs_df

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,0.0,0.0,1.0


In [63]:
# iii. Report on the score for kNN, what does the score measure?

In [64]:
knn.score(X_test,y_test)  

0.8666666666666667