In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [33]:
from sklearn.datasets import make_classification

X,y = make_classification(n_samples=10000, n_features=10, n_clusters_per_class=2, random_state=42)

In [34]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
print(type(X_train), type(y_train))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [37]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(type(X_train), type(y_train))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [38]:
# Removing correleated features

def correlation(dataset, threshold):
    col_corr = set()
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i,j]) > threshold:
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

corr_features = correlation(pd.DataFrame(X_train), 0.8)

print('correlated features: ', len(set(corr_features)))

X_train = pd.DataFrame(X_train)

X_train.drop(labels=corr_features, axis=1, inplace=True)

X_test = pd.DataFrame(X_test)

X_test.drop(labels=corr_features, axis=1, inplace=True)



correlated features:  2


In [39]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(multi_class='ovr')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)





In [40]:
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold()

In [41]:
# Hiperparameter tuning

from sklearn.model_selection import GridSearchCV

penalty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

params = dict(penalty=penalty, C=c_values, solver=solver)

grid_search = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

In [42]:
grid_search.fit(X_train, y_train)



200 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mac/Desktop/MLOps/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mac/Desktop/MLOps/venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mac/Desktop/MLOps/venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
       

In [43]:
# Getting best parameters

print(grid_search.best_params_)

# Getting best score

print(grid_search.best_score_)

{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.891375


In [44]:
y_pred = grid_search.predict(X_test)

In [45]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(f"Model accuracy: {accuracy_score(y_test, y_pred)}")

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

Model accuracy: 0.8945
              precision    recall  f1-score   support

           0       0.89      0.90      0.89       997
           1       0.90      0.89      0.89      1003

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000

[[894 103]
 [108 895]]


In [46]:
from sklearn.metrics import accuracy_score

print(f"Model accuracy: {accuracy_score(y_test, y_pred)}")

Model accuracy: 0.8945
