### Support Vector Machine - Multi class classification

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
import joblib

In [2]:
# Import the dataset
from sklearn.datasets import load_iris

In [3]:
# Load the iris dataset that ships with scikit-learn. The returned object
# exposes the feature matrix (.data), the integer class labels (.target) and
# the matching names (.feature_names, .target_names) used in the cells below.
data = load_iris()

In [4]:
# Display the loaded object to see which fields it provides.
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [5]:
# load_iris() returns the measurements as a bare 2D array, so wrap them in a
# DataFrame and label each column with the corresponding feature name.
df = pd.DataFrame(data=data["data"], columns=data["feature_names"])

In [6]:
# Preview the first five rows of the feature table.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# Class label of every sample, encoded as an integer.
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [8]:
# Append the class labels to the DataFrame as a new 'target' column.
df["target"] = data["target"]

In [9]:
# Check that the new 'target' column was appended to the feature columns.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [10]:
# Look up the class names that the integer targets stand for
data["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [11]:
# Preview the label -> class-name mapping before storing it as a column.
# The lookup table is built by numbering the entries of data.target_names from 0.
df["target"].map(dict(enumerate(data.target_names)))

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: target, Length: 150, dtype: object

In [12]:
# Store the mapped class names in a new 'target_names' column.
label_to_name = dict(enumerate(data.target_names))
df["target_names"] = df["target"].map(label_to_name)

In [13]:
# Confirm the DataFrame now carries both the numeric target and its class name.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


### Exploratory Data Analysis

In [14]:
# Summarize the DataFrame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
 5   target_names       150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [15]:
# Count the missing values in each column (isna is the same check as isnull)
df.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
target_names         0
dtype: int64

In [16]:
# Count the samples per class (by numeric label) to check the class balance
df['target'].value_counts()

target
0    50
1    50
2    50
Name: count, dtype: int64

In [17]:
# Same per-class counts, shown with the class names instead of the numeric labels
df['target_names'].value_counts()

target_names
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

### ML Model creation steps

- As there is no feature engineering required, we can proceed directly to the model creation steps.
- We need to follow the steps below to create the model:
    - Step1: Create x (independent) and y (dependent) variables
        - Note: Remember the <font color = "#68B6FF">**x variable should be a 2D array or DataFrame** whereas **y can be a Series or 1D array**</font>
    - Step2: Create train and test data for both x and y variables
    - Step3: Scale the independent variables (both x_train and x_test) using StandardScaler()
        - For x_train: do the "`scaler.fit_transform(x_train)`"
        - For x_test: do the "`scaler.transform(x_test)`" -- the scaler must take its mean and standard deviation from the training data only, so that no information from the test set leaks into the model
    - Step4: Create the model object `svm_multi = SVC(kernel="linear")`
    - Step5: Train the model by passing the "x_train_scaled" and "y_train" data to it
    - Step6: Predict the output for unseen data using the model we have built
    - Step7: Test the model using various metrics
    - Step8: Finally, create the pickle file to use in the application and predict the outcome for any unseen dataset
    

#### Multiclass classification

#### Step1: Create x (independent) and y (dependent) variables

- Note: 
Remember the <font color = "#68B6FF">**x variable should be a 2D array or DataFrame** whereas **y can be a Series or 1D array**</font>

In [18]:
# Look at the full table again before separating the features from the target.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [19]:
# Independent variables: the four measurement columns, kept as a 2D DataFrame.
feature_columns = [
    "sepal length (cm)",
    "sepal width (cm)",
    "petal length (cm)",
    "petal width (cm)",
]
x_multi = df[feature_columns]

In [20]:
# Dependent variable: the numeric class label, kept as a 1D Series.
y_multi = df['target']

##### Step2: Create train and test data for both x and y variables

In [21]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
x_train_multi, x_test_multi, y_train_multi, y_test_multi = train_test_split(
    x_multi,
    y_multi,
    test_size=0.2,
    random_state=12,
)

##### Step3: Scale the independent variables (both x_train and x_test) using StandardScaler()
- For x_train: do the "`scaler.fit_transform(x_train_multi)`"
- For x_test: do the "`scaler.transform(x_test_multi)`" -- the scaler must take its mean and standard deviation from the training data only, so that no information from the test set leaks into the model

In [22]:
# Learn the per-feature mean and standard deviation from the training rows only,
# then apply that same transformation to both the training and the test rows.
scaler = StandardScaler().fit(x_train_multi)
x_train_multi_scaled = scaler.transform(x_train_multi)
x_test_multi_scaled = scaler.transform(x_test_multi)

In [23]:
# Inspect the standardized training features (a NumPy array, one row per sample).
x_train_multi_scaled

array([[-9.23010404e-01,  9.92828352e-01, -1.37449891e+00,
        -1.19654889e+00],
       [-4.32373645e-01, -1.45676683e+00, -5.72707879e-02,
        -2.75283018e-01],
       [ 5.48899875e-01, -3.43314477e-01,  1.03087418e+00,
         7.77592269e-01],
       [-4.32373645e-01,  2.55166165e+00, -1.37449891e+00,
        -1.32815830e+00],
       [-6.43960747e-02, -7.88695420e-01,  7.44520243e-01,
         9.09201679e-01],
       [-1.87055265e-01,  2.99704259e+00, -1.31722812e+00,
        -1.06493948e+00],
       [-3.09714455e-01, -3.43314477e-01, -1.14541576e-01,
         1.19545215e-01],
       [-4.32373645e-01, -1.45676683e+00, -2.54333389e-16,
        -1.43673607e-01],
       [-6.43960747e-02,  2.10628071e+00, -1.48904049e+00,
        -1.32815830e+00],
       [ 1.80922305e-01, -1.90214778e+00,  1.14541576e-01,
        -2.75283018e-01],
       [-9.23010404e-01, -1.23407636e+00, -4.58166303e-01,
        -1.43673607e-01],
       [-1.04566959e+00,  7.70137880e-01, -1.25995733e+00,
      

In [24]:
# Inspect the test features after applying the scaler fitted on the training data.
x_test_multi_scaled

array([[-1.04566959,  0.99282835, -1.4317697 , -1.19654889],
       [ 0.54889988, -1.23407636,  0.68724945,  0.90920168],
       [-1.78162473, -0.12062401, -1.4317697 , -1.3281583 ],
       [-0.18705526, -0.56600495,  0.17181236,  0.11954521],
       [ 1.16219582,  0.32475694,  1.20268655,  1.43563932],
       [ 0.79421825, -0.12062401,  0.97360339,  0.77759227],
       [ 0.42624069,  0.77013788,  0.91633261,  1.43563932],
       [-0.9230104 ,  0.77013788, -1.31722812, -1.3281583 ],
       [ 2.26612853, -0.12062401,  1.31722812,  1.43563932],
       [-0.9230104 ,  0.54744741, -1.20268655, -0.93333007],
       [-0.30971445, -0.12062401,  0.40089552,  0.38276404],
       [-0.55503283,  1.88359024, -1.4317697 , -1.06493948],
       [-0.55503283,  0.77013788, -1.31722812, -1.06493948],
       [-0.9230104 ,  1.43820929, -1.31722812, -1.06493948],
       [ 1.4075142 ,  0.32475694,  0.51543709,  0.25115463],
       [ 1.65283258,  0.32475694,  1.25995733,  0.77759227],
       [ 1.03953663, -1.

##### Step4: Create the model object `svm_multi = SVC(kernel="linear")`

In [25]:
# Support vector classifier with a linear kernel; every other hyperparameter
# keeps its scikit-learn default.
svm_multi = SVC(kernel="linear")

#### Step5: Create the model by passing the "x_train_scaled" and "y_train" data to the model


In [26]:
svm_multi.fit(x_train_multi, y_train_multi)

##### Step6: Predict the output of an unseen data using the model we have build

In [27]:
svm_multi.predict(x_test_multi)

array([0, 2, 0, 1, 2, 2, 2, 0, 2, 0, 1, 0, 0, 0, 1, 2, 2, 1, 0, 1, 0, 1,
       2, 1, 0, 2, 1, 1, 0, 0])

##### Step7: Test the model using various metrices

In [28]:
print("Classification report: \n",classification_report(y_test_multi, svm_multi.predict(x_test_multi)))

Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00         9

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [29]:
confusion_matrix(y_test_multi, svm_multi.predict(x_test_multi))

array([[12,  0,  0],
       [ 0,  9,  0],
       [ 0,  0,  9]])

##### Step8: Finally, create the pickle file to use in the application and predict the outcome for any unseen dataset
    

In [30]:
# Save the classifier and the fitted scaler together as one tuple in a single file.
joblib.dump(value=(svm_multi, scaler), filename="svm_multi.pkl")

['svm_multi.pkl']