In [24]:
# Imports and pip installations (if needed)
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures as poly
from sklearn.model_selection import ShuffleSplit
from sklearn.datasets import load_iris
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Part 1: Load the Dataset

In [6]:
# Load the Iris dataset through scikit-learn's loader (no local file path needed).
iris = datasets.load_iris()

# One column per feature (named from iris.feature_names), plus the class label
# taken from iris.target in a "Species" column.
iris_df = pd.DataFrame(iris['data'], columns=iris.feature_names)
iris_df["Species"] = iris.target

# NOTE: an earlier `iris_df.rename(columns={'Speal Length': '0'})` call was removed.
# Its key matched no existing column and its return value was discarded, so it
# never changed iris_df.

# Output the first 15 rows of the data
iris_df.head(15)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [8]:
# Show the description text that ships with the dataset
# (number of instances, attributes, summary statistics, class distribution).
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

# Part 2: Split the dataset into train and test

In [14]:
# Pull the feature matrix (x) and the label vector (y) straight from the loader
x, y = load_iris(return_X_y=True)

# Hold out 10% of the rows for testing (90% train); the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=2,
)


# Part 3: Logistic Regression

In [31]:
# i. Fit a LogisticRegression model on the training split
logistic_reg = LogisticRegression(max_iter=1000, random_state=42)
logistic_reg.fit(X_train, y_train)

# ii. Class probabilities for one sample datapoint
#     (`sample` is reused by the later model cells)
sample = np.array([[5.4, 3.7, 1.5, 0.2]])
print(f'Probabilites of each classes: {logistic_reg.predict_proba(sample)}')

# iii. Score of the model on the held-out test split
score = logistic_reg.score(X_test, y_test)
print(f'Score of the Logistic regression model: {score}')

# iv. Coefficients and intercepts of the fitted decision boundaries
print(f'Coefficents: {logistic_reg.coef_}')
print(f'Intercept: {logistic_reg.intercept_}')

Probabilites of each classes: [[9.71327627e-01 2.86723321e-02 4.12295649e-08]]
Score of the Logistic regression model: 1.0
Coefficents: [[-0.39122448  0.93904954 -2.43765333 -1.02704598]
 [ 0.53482607 -0.39022414 -0.18049269 -0.97676208]
 [-0.14360159 -0.5488254   2.61814602  2.00380806]]
Intercept: [  9.3326014    2.33171215 -11.66431355]


The predicted probabilities for the three classes are approximately 97.1%, 2.9%, and 0.000004%.

# Part 4: Support Vector Machine

In [34]:
# i. Use sklearn to train a Support Vector Classifier on the training set.
# The fitted model is stored as `svc_model` instead of `SVC`, so it no longer
# shadows the `SVC` class imported at the top of the notebook.
# random_state seeds the data shuffling used for the probability estimates
# (enabled by probability=True), so predict_proba is reproducible across runs.
svc_model = svm.SVC(probability=True, random_state=42)
svc_model.fit(X_train, y_train)

# ii. For a sample datapoint, predict the probabilities for each possible class
print(f'Probabilites of each class: {svc_model.predict_proba(sample)}')
# iii. Report on the score for the SVM (evaluated on the held-out test split)
score = svc_model.score(X_test, y_test)
print(f'Score of the Support Vector Classifier: {score}')

Probabilites of each classes: [[0.96810627 0.02053567 0.01135806]]
Score of the Support Vector Classifier: 1.0


The predicted probabilities for the three classes are 96.8%, 2.1%, and 1.1%.


# Part 5: Neural Network

In [35]:
# i. Baseline Neural Network (MLP Classifier) fitted on the training split
MLP = MLPClassifier(max_iter=600, random_state=0)
MLP.fit(X_train, y_train)

# ii. Class probabilities for the sample datapoint
print(f'Probabilites of each class: {MLP.predict_proba(sample)}')

# iii. Score of the baseline network on the held-out test split
score = MLP.score(X_test, y_test)
print(f'Score of the Neural Network: {score}')

# iv: Experiment with different options for the neural network, report on your best configuration
#     (the highest score I was able to achieve was 0.8666)
# This variant raises the iteration cap and changes the seed; it replaces the baseline `MLP`.
MLP = MLPClassifier(max_iter=1000, random_state=42)
MLP.fit(X_train, y_train)
print(f'New probabilites of each classes: {MLP.predict_proba(sample)}')
score = MLP.score(X_test, y_test)
print(f'Score of different options for the Neural Network: {score}')

Probabilites of each class: [[9.99253837e-01 7.46163311e-04 7.89636488e-12]]
Score of the Neural Network: 1.0
New probabilites of each classes: [[9.99428786e-01 5.71214112e-04 1.97122347e-14]]
Score of different options for the Neural Network: 1.0


The score for the neural network is 1.0, which means it classified 100% of the test samples correctly.

# Part 6: K-Nearest Neighbors

In [39]:
# i. 'Train' a k-Neighbors Classifier with sklearn.
# KNN is a nonparametric model, so there is technically nothing to train:
# fit essentially loads the data into the model. More background:
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

# ii. Class probabilities for the sample datapoint
print(f'Probabilites of each classes: {KNN.predict_proba(sample)}')

# iii. Score of kNN on the held-out test split
score = KNN.score(X_test, y_test)
print(f'Score of the k-Neighbors Classifier: {score}')

Probabilites of each classes: [[1. 0. 0.]]
Score of the k-Neighbors Classifier: 1.0


The sample is predicted to belong to the first class with 100% probability.
The score function returns the mean accuracy on the given test features and labels; here the score is 100%.


# Part 7: Conclusions and takeaways

In this notebook, we classified the three iris species. Looking at the results on this dataset, we can tell that the models perform well.
