### Logistic Regression Multi class classification - OVR

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import joblib


In [6]:
# Import the dataset
from sklearn.datasets import load_iris

In [7]:
# Save the data in the variable
data = load_iris()

In [8]:
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [9]:
# This is a 2D array, we need to create a DataFrame out of it
df = pd.DataFrame(data.data, columns=data.feature_names)

In [10]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [11]:
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [12]:
df['target'] = data.target

In [13]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [14]:
# Let's see the names of the target variables
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [15]:
# Let's create one more column with target variable names
# we need to do a mapping
df['target'].map({i:name for i,name in enumerate(data.target_names)})

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: target, Length: 150, dtype: object

In [16]:
df['target_names'] = df['target'].map({i:name for i,name in enumerate(data.target_names)})

In [17]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


### ML Model creation steps

- As there is no feature engineering required we need to perform the model creation steps. 
- We need to follow th below steps to create the model
    - Step1: Create x(indipendent) and y(dependent) variables
        - Note: Remember the <font color = "#68B6FF">**x variable should be a 2D array or DataFrame** whereas **y can be a Series or 1D array**</font>
    - Step2: Create train and test data for both x and y variables
    - Step3: Scale the indipendent variable(both x_train and x_test) using StandardScaler() 
        - For x_train: do the "`scaler.fit_transform(x_train)`"
        - for x_test: do the "`scaler.transform(x_test)`" --  Because we dont want the model to know about my testing dataset
    - Step4: Create the model object 
    - Step5: Create the model by passing the "x_train_scaled" and "y_train" data to the model
    - Step6: Predict the output of an unseen data using the model we have build
    - Step7: Test the model using various metrices
    - Step8: Finally crete the pickle file to use it in the application and predect the outcome of any unseen dataset
    

#### Step1: Create x(indipendent) and y(dependent) variables

- Note: 
Remember the <font color = "#68B6FF">**x variable should be a 2D array or DataFrame** whereas **y can be a Series or 1D array**</font>

In [18]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [19]:
x_multi = df[['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']]

In [20]:
y_multi = df['target']

##### Step2: Create train and test data for both x and y variables

In [21]:
x_train_multi, x_test_multi, y_train_multi, y_test_multi = train_test_split(x_multi, y_multi, test_size=0.2)

##### Step3: Scale the indipendent variable(both x_train and x_test) using StandardScaler() 
- For x_train: do the "`scaler.fit_transform(x_train_multi)`"
- for x_test: do the "`scaler.transform(x_test_multi)`" --  Because we dont want the model to know about my testing dataset

In [22]:
scaler = StandardScaler()
x_train_multi_scaled = scaler.fit_transform(x_train_multi)
x_test_multi_scaled = scaler.transform(x_test_multi)

In [23]:
x_train_multi_scaled

array([[-0.33570511,  1.0149715 , -1.30521323, -1.22678267],
       [-0.45777969, -0.1297332 ,  0.49767123,  0.49889899],
       [ 0.88504074, -0.1297332 ,  1.06107262,  0.89713323],
       [-1.67852554,  0.32814868, -1.30521323, -1.22678267],
       [ 1.37333908,  0.09920774,  0.83571207,  1.56085694],
       [-1.06815262, -1.50337884, -0.17841044, -0.16482472],
       [-0.21363052, -0.58761508,  0.72303179,  1.16262271],
       [-1.1902272 ,  0.09920774, -1.13619281, -1.22678267],
       [ 0.15259323, -0.1297332 ,  0.32865081,  0.49889899],
       [-0.45777969,  1.93073526, -1.07985267, -0.96129318],
       [ 0.3967424 , -0.58761508,  0.21597053,  0.23340951],
       [-0.33570511, -1.04549696,  0.44133109,  0.10066476],
       [ 1.37333908,  0.09920774,  1.00473248,  1.29536746],
       [-0.33570511, -1.73231978,  0.21597053,  0.23340951],
       [ 1.2512645 , -0.1297332 ,  1.06107262,  1.29536746],
       [-0.57985428,  1.47285338, -1.19253295, -1.22678267],
       [ 0.64089157, -0.

In [24]:
x_test_multi_scaled

array([[ 1.12918991,  0.09920774,  1.11741276,  1.69360169],
       [ 2.34993576, -0.1297332 ,  1.39911346,  1.56085694],
       [-0.70192886, -0.81655602,  0.15963039,  0.36615425],
       [ 0.64089157,  0.55708962,  1.34277332,  1.82634643],
       [ 0.51881699,  0.78603056,  1.00473248,  1.56085694],
       [ 0.15259323,  0.32814868,  0.66669165,  0.89713323],
       [ 1.12918991,  0.09920774,  0.44133109,  0.36615425],
       [-0.45777969,  1.93073526, -1.30521323, -0.96129318],
       [-1.43437637,  1.24391244, -1.47423365, -1.22678267],
       [-0.09155594, -0.58761508,  0.27231067,  0.23340951],
       [-1.31230179,  0.32814868, -1.30521323, -1.22678267],
       [ 0.03051865, -0.58761508,  0.83571207,  1.69360169],
       [ 2.34993576,  1.70179432,  1.7371543 ,  1.4281122 ],
       [ 1.00711533, -0.35867414,  0.55401137,  0.23340951],
       [-0.94607803,  1.0149715 , -1.30521323, -1.09403793],
       [ 0.3967424 , -0.58761508,  0.61035151,  0.10066476],
       [ 1.12918991,  0.

##### Step4: Create the LogisticRegression Multi Class model object using OVR

In [25]:
logistic_ovr = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=2000)

##### Step5: Create the model by passing the "x_train_multi_scaled" and "y_train_multi" data to the model

In [26]:
logistic_ovr.fit(x_train_multi_scaled, y_train_multi)



##### Step6: Predict the output of an unseen data using the model we have build

In [27]:
logistic_ovr.predict(x_test_multi_scaled)

array([2, 2, 1, 2, 2, 2, 1, 0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 1, 1, 2, 1, 1,
       0, 2, 1, 2, 2, 2, 1, 2])

##### Step7: Test the model using various metrices

In [28]:
print("Logistic Regression Classification Report:\n",classification_report(y_test_multi, logistic_ovr.predict(x_test_multi_scaled)))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      0.85      0.92        13
           2       0.86      1.00      0.92        12

    accuracy                           0.93        30
   macro avg       0.95      0.95      0.95        30
weighted avg       0.94      0.93      0.93        30



In [29]:
confusion_matrix(y_test_multi, logistic_ovr.predict(x_test_multi_scaled))

array([[ 5,  0,  0],
       [ 0, 11,  2],
       [ 0,  0, 12]])

##### Step8: Finally crete the pickle file to use it in the application and predect the outcome of any unseen dataset

In [30]:
joblib.dump((logistic_ovr, scaler), 'logistic_ovr.pkl')

['logistic_ovr.pkl']