### Logistic Regression Multi class classification - multinomial

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import joblib

In [6]:
# Import the dataset
from sklearn.datasets import load_iris

In [7]:
# Save the data in the variable
data = load_iris()

In [8]:
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [9]:
# This is a 2D array, we need to create a DataFrame out of it
df = pd.DataFrame(data.data, columns=data.feature_names)

In [10]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [11]:
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [12]:
df['target'] = data.target

In [13]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [14]:
# Let's see the names of the target variables
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [15]:
# Let's create one more column with target variable names
# we need to do a mapping
df['target'].map({i:name for i,name in enumerate(data.target_names)})

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: target, Length: 150, dtype: object

In [16]:
df['target_names'] = df['target'].map({i:name for i,name in enumerate(data.target_names)})

In [17]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


### ML Model creation steps

- As there is no feature engineering required we need to perform the model creation steps. 
- We need to follow th below steps to create the model
    - Step1: Create x(indipendent) and y(dependent) variables
        - Note: Remember the <font color = "#68B6FF">**x variable should be a 2D array or DataFrame** whereas **y can be a Series or 1D array**</font>
    - Step2: Create train and test data for both x and y variables
    - Step3: Scale the indipendent variable(both x_train and x_test) using StandardScaler() 
        - For x_train: do the "`scaler.fit_transform(x_train)`"
        - for x_test: do the "`scaler.transform(x_test)`" --  Because we dont want the model to know about my testing dataset
    - Step4: Create the model object 
    - Step5: Create the model by passing the "x_train_scaled" and "y_train" data to the model
    - Step6: Predict the output of an unseen data using the model we have build
    - Step7: Test the model using various metrices
    - Step8: Finally crete the pickle file to use it in the application and predect the outcome of any unseen dataset
    

#### Step1: Create x(indipendent) and y(dependent) variables

- Note: 
Remember the <font color = "#68B6FF">**x variable should be a 2D array or DataFrame** whereas **y can be a Series or 1D array**</font>

In [18]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [19]:
x_multi = df[['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']]

In [20]:
y_multi = df['target']

##### Step2: Create train and test data for both x and y variables

In [21]:
x_train_multi, x_test_multi, y_train_multi, y_test_multi = train_test_split(x_multi, y_multi, test_size=0.2)

##### Step3: Scale the indipendent variable(both x_train and x_test) using StandardScaler() 
- For x_train: do the "`scaler.fit_transform(x_train_multi)`"
- for x_test: do the "`scaler.transform(x_test_multi)`" --  Because we dont want the model to know about my testing dataset

In [22]:
scaler = StandardScaler()
x_train_multi_scaled = scaler.fit_transform(x_train_multi)
x_test_multi_scaled = scaler.transform(x_test_multi)

In [23]:
x_train_multi_scaled

array([[-1.040672  ,  1.18375166, -1.30249145, -1.27000868],
       [ 0.51286706, -1.34464995,  0.64498598,  0.38929914],
       [-1.27967801, -0.19537649, -1.30249145, -1.14236962],
       [ 1.70789711, -0.42523118,  1.42397695,  0.77221634],
       [-0.08464796, -0.88494056,  0.7562704 ,  0.8998554 ],
       [-1.040672  ,  0.72404228, -1.24684924, -1.27000868],
       [ 1.82740011, -0.65508587,  1.31269252,  0.8998554 ],
       [-1.51868402,  1.18375166, -1.5250603 , -1.27000868],
       [ 0.99087908, -0.19537649,  0.70062819,  0.64457727],
       [-0.92116899,  1.41360635, -1.24684924, -1.01473056],
       [ 0.87137608, -0.42523118,  0.47805934,  0.13402102],
       [-0.20415097, -0.42523118,  0.25549049,  0.13402102],
       [ 0.51286706, -1.80435933,  0.36677492,  0.13402102],
       [-1.39918101,  0.2643329 , -1.19120703, -1.27000868],
       [ 0.39336406, -2.03421402,  0.42241713,  0.38929914],
       [ 0.39336406,  0.72404228,  0.92319704,  1.41041165],
       [-1.39918101,  0.

In [24]:
x_test_multi_scaled

array([[ 0.27386105, -0.42523118,  0.53370155,  0.26166008],
       [-1.160175  , -1.34464995,  0.42241713,  0.64457727],
       [ 0.27386105, -0.65508587,  0.14420607,  0.13402102],
       [ 2.18590913, -0.65508587,  1.6465458 ,  1.02749446],
       [ 1.11038209,  0.2643329 ,  1.2014081 ,  1.41041165],
       [-0.44315698, -1.57450464, -0.02272057, -0.24889617],
       [-1.87719303, -0.19537649, -1.46941809, -1.39764775],
       [-0.56265998,  0.72404228, -1.13556481, -1.27000868],
       [ 0.63237007, -0.88494056,  0.86755482,  0.8998554 ],
       [-1.040672  , -1.80435933, -0.24528942, -0.24889617],
       [-0.32365397, -0.42523118, -0.07836278,  0.13402102],
       [-0.92116899,  0.95389697, -1.30249145, -1.14236962],
       [ 0.75187307, -0.65508587,  0.47805934,  0.38929914],
       [-1.040672  ,  0.95389697, -1.19120703, -0.75945243],
       [-0.44315698, -1.80435933,  0.14420607,  0.13402102],
       [-1.75769003,  0.2643329 , -1.35813366, -1.27000868],
       [ 0.27386105, -0.

##### Step4: Create the LogisticRegression Multi Class model object using Multinomial

In [25]:
logistic_multinomial = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)

##### Step5: Create the model by passing the "x_train_multi_scaled" and "y_train_multi" data to the model

In [26]:
logistic_multinomial.fit(x_train_multi_scaled, y_train_multi)



##### Step6: Predict the output of an unseen data using the model we have build

In [27]:
logistic_multinomial.predict(x_test_multi_scaled)

array([1, 1, 1, 2, 2, 1, 0, 0, 2, 1, 1, 0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 0,
       2, 2, 2, 1, 1, 1, 1, 0])

##### Step7: Test the model using various metrices

In [28]:
print("Logistic Regression Multinomial testing: \n:", classification_report(y_test_multi, logistic_multinomial.predict(x_test_multi_scaled)))

Logistic Regression Multinomial testing: 
:               precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.93      1.00      0.96        13
           2       1.00      0.89      0.94         9

    accuracy                           0.97        30
   macro avg       0.98      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



In [29]:
confusion_matrix(y_test_multi, logistic_multinomial.predict(x_test_multi_scaled))

array([[ 8,  0,  0],
       [ 0, 13,  0],
       [ 0,  1,  8]])

##### Step8: Finally crete the pickle file to use it in the application and predect the outcome of any unseen dataset

In [30]:
joblib.dump((logistic_multinomial, scaler), 'logistic_multinomial.pkl')

['logistic_multinomial.pkl']