In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = sns.load_dataset('iris')

In [3]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [6]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [7]:
# remove setosa classified rows to make df binary classified for simplification for now

In [8]:
df = df[df['species'] != 'setosa']

In [9]:
df.species.unique()

array(['versicolor', 'virginica'], dtype=object)

In [10]:
df.species = df.species.map({'versicolor': 0, 'virginica': 1})

In [11]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,0
51,6.4,3.2,4.5,1.5,0
52,6.9,3.1,4.9,1.5,0
53,5.5,2.3,4.0,1.3,0
54,6.5,2.8,4.6,1.5,0


In [12]:
df.species.unique()

array([0, 1], dtype=int64)

In [13]:
# split df into independent and dependent features

In [14]:
x = df.iloc[:,:-1]

In [15]:
y = df.iloc[:,-1]

In [16]:
x.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
50,7.0,3.2,4.7,1.4
51,6.4,3.2,4.5,1.5
52,6.9,3.1,4.9,1.5
53,5.5,2.3,4.0,1.3
54,6.5,2.8,4.6,1.5


In [17]:
y.head()

50    0
51    0
52    0
53    0
54    0
Name: species, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.25, random_state = 42
)

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
classifier = LogisticRegression()

In [22]:
# grid search cv is used for hyperparameter tuning
# it tries all possible combinations to check for which combination it gives the best accuracy
from sklearn.model_selection import GridSearchCV

In [23]:
# these are the parameters and their values to make combinations with in GridSearchCV
parameter = {
    'penalty': [
        'l1',
        'l2',
        'elasticnet'
    ],
    'C': [
        1,
        2,
        3,
        4,
        5,
        6,
        10,
        20,
        30,
        40,
        50
    ],
    'max_iter': [
        100,
        200,
        300
    ]
}

In [24]:
classifier_regressor = GridSearchCV(
    classifier,
    param_grid = parameter,
    scoring = 'accuracy',
    cv = 5
)

In [25]:
classifier_regressor.fit(x_train, y_train)

330 fits failed out of a total of 495.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
165 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Ayush\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ayush\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Ayush\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solve

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50],
                         'max_iter': [100, 200, 300],
                         'penalty': ['l1', 'l2', 'elasticnet']},
             scoring='accuracy')

In [26]:
classifier_regressor.best_params_

{'C': 1, 'max_iter': 100, 'penalty': 'l2'}

In [27]:
print(classifier_regressor.best_score_)

0.9733333333333334


In [28]:
# prediction
y_pred = classifier_regressor.predict(x_test)

In [29]:
y_pred

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1], dtype=int64)

In [30]:
# accuracy_score
from sklearn.metrics import accuracy_score, classification_report

In [31]:
score = accuracy_score(y_pred, y_test)
print(score)

0.92


In [32]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        14
           1       0.91      0.91      0.91        11

    accuracy                           0.92        25
   macro avg       0.92      0.92      0.92        25
weighted avg       0.92      0.92      0.92        25

