<a href="https://colab.research.google.com/github/angelsmreyes/Hands-on-machine-learning-with-scikit-learn-keras-and-tensorflow/blob/main/Iris_project_for_review_in_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [None]:
# Load the bundled iris dataset. as_frame=True asks scikit-learn to expose the
# data as pandas objects, which is what makes `iris.frame` available below.
iris = datasets.load_iris(as_frame=True)

In [None]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [None]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [None]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

### **Two ways to convert the scikit-learn dataset to a pandas DataFrame**

Short way: pass the argument `as_frame=True` to `load_iris()` and read the `frame` attribute

In [None]:
iris.frame

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [None]:
# Keep a handle on the ready-made DataFrame (features + target) from load_iris.
df_method = iris.frame

In [None]:
df_method.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [None]:
df_method.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [None]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

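Long way: build the DataFrame manually from `iris.data` and `iris.feature_names`, then add the `target` column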

In [None]:
# Build the feature table by hand: one column per entry in iris.feature_names.
# The target column is not part of this frame yet; it is added in a later cell.
df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [None]:
df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


In [None]:
# Append the class labels (0, 1, 2) as a 'target' column next to the features.
df['target'] = iris.target

In [None]:
df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [None]:
df.shape

(150, 5)

In [None]:
df.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
dtype: object

In [None]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [None]:
# Predictors: only the two petal measurements are used as model inputs.
petal_features = ['petal length (cm)', 'petal width (cm)']
X = df[petal_features]

In [None]:
X

Unnamed: 0,petal length (cm),petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


In [None]:
X.shape

(150, 2)

In [None]:
# Target: binarize the three-class label into "is Iris virginica?".
# Class index 2 is virginica, so y is 1.0 for virginica and 0.0 otherwise.
y = df['target'].eq(2).astype(np.float64)

In [None]:
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    1.0
146    1.0
147    1.0
148    1.0
149    1.0
Name: target, Length: 150, dtype: float64

In [None]:
# Two-step model: scale the features with RobustScaler, then classify with a
# linear SVM (hinge loss, C=1). The step names are reused in the pipeline repr.
scaling_step = ('scaler', RobustScaler())
classifier_step = ('linear_svc', LinearSVC(C=1, loss='hinge'))
svm_clf = Pipeline(steps=[scaling_step, classifier_step])

In [None]:
# First fit uses every sample (no hold-out); it only serves the single-point
# prediction demo that follows, not model evaluation.
svm_clf.fit(X, y)

Pipeline(steps=[('scaler', RobustScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [None]:
# Predict for one flower with petal length 5.5 cm and petal width 1.7 cm.
# The pipeline was fitted on a DataFrame, so the sample is wrapped in a
# DataFrame with the same column names; a bare nested list makes scikit-learn
# warn "X does not have valid feature names".
svm_clf.predict(pd.DataFrame([[5.5, 1.7]], columns=X.columns))

  "X does not have valid feature names, but"


array([1.])

In [None]:
# Store the prediction for one flower (petal length 5.5 cm, petal width 1.7 cm).
# The sample carries the training column names so scikit-learn does not warn
# "X does not have valid feature names".
y_pred = svm_clf.predict(pd.DataFrame([[5.5, 1.7]], columns=X.columns))

  "X does not have valid feature names, but"


In [None]:
y_pred

array([1.])

**Train test split**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing.
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible across runs. stratify=y keeps the virginica /
# non-virginica ratio (1:2) identical in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Report the (rows, columns) of each partition as a sanity check on the split.
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('Train dimension: ', *train_shapes)
print('-----------')
print('Test dimension: ', *test_shapes)

Train dimension:  (120, 2) (120,)
-----------
Test dimension:  (30, 2) (30,)


In [None]:
# Fit the pipeline again, this time on the training partition only, so the
# test partition stays unseen for the evaluation below.
svm_clf.fit(X_train, y_train)

Pipeline(steps=[('scaler', RobustScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [None]:
# Predict the binary label for every held-out sample; this overwrites the
# single-sample y_pred from the earlier demo cell.
y_pred = svm_clf.predict(X_test)

In [None]:
y_pred

array([0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.])

**Model evaluation**

In [None]:
# Fraction of held-out samples whose predicted label equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9


In [None]:
# Same metric via the estimator API: score() predicts on X_test and returns
# the mean accuracy, so it matches the accuracy_score value printed above.
svm_clf.score(X_test, y_test)

0.9

In [None]:
# Per-class precision, recall, F1 and support for the two labels
# (0.0 = not virginica, 1.0 = virginica), plus the averaged rows.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.87      1.00      0.93        20
         1.0       1.00      0.70      0.82        10

    accuracy                           0.90        30
   macro avg       0.93      0.85      0.88        30
weighted avg       0.91      0.90      0.89        30



In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels,
# both in sorted label order (0.0 first, then 1.0).
metrics.confusion_matrix(y_test, y_pred)

array([[20,  0],
       [ 3,  7]])

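For this case I think accuracy is the most important metric. Note, however, that although the iris dataset has three classes of flower, the target used here was binarized (virginica vs. not virginica), so the model is actually a binary classifier; the per-class precision and recall in the report above are therefore worth checking alongside accuracy.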