In [82]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold

In [7]:
iris = load_iris()
X = iris.data
y = iris.target

In [10]:
print(iris.feature_names)
print(iris.target_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state=42)

# Logistic Regression without scaling, grid search, hyperparameter tuning, and pipeline

In [21]:
logreg = LogisticRegression()

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(
  logreg.score(X_test, y_test)
)

print(
  mean_squared_error(y_pred, y_test)
)

0.9333333333333333
0.06666666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Logistic Regression with scaling, but without grid search, hyperparameter tuning, and pipeline

In [52]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [62]:
logreg = LogisticRegression()

logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_test_scaled)

y_score = logreg.score(X_test_scaled, y_test)

print(f"Accuracy: {y_score}")
print(f"MSE: {mean_squared_error(y_pred, y_test)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_pred, y_test))}")

Accuracy: 0.9111111111111111
MSE: 0.08888888888888889
RMSE: 0.29814239699997197


# Logistic Regression with scaling and grid search, but without hyperparameter tuning and pipeline

In [61]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle = True, random_state=42)
logreg_kf_score = cross_val_score(logreg, X_scaled, y, cv = kf)

print(f"Accuracies: {logreg_kf_score}")
print(f"Average accuracy: {np.mean(logreg_kf_score)}")

Accuracies: [1.         0.96666667 0.93333333 0.93333333 0.96666667]
Average accuracy: 0.9600000000000002


# Logistic Regression with scaling, grid search, and hyperparameter tuning, but without pipeline

In [65]:
logreg_params = {
  "tol": np.linspace(0.01, 1.0, 20),
  "C": np.linspace(0.01, 1.0, 20),
  "class_weight": ["balanced", {0:0.8, 1:0.2}],
  "solver": ["newton-cg", "newton-cholesky", "lbfgs", "liblinear", "sag", "saga"]
}

logreg_cv = RandomizedSearchCV(logreg, logreg_params, cv = kf)
logreg_cv.fit(X_scaled, y)

print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Best Accuracy Score: {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameters: {'tol': 0.16631578947368422, 'solver': 'newton-cg', 'class_weight': 'balanced', 'C': 0.4268421052631579}
Tuned Logistic Regression Best Accuracy Score: 0.9533333333333334


# Logistic Regression with pipeline

In [80]:
steps = [
  (
    "standardization",
    StandardScaler()
  ),
  (
    "logistic_regression",
    LogisticRegression(
      tol = 0.16631578947368422,
      solver = "newton-cg",
      class_weight = "balanced",
      C = 0.4268421052631579
    )
  )
]

pipeline = Pipeline(steps=steps)
pipeline.fit(X_train_scaled, y_train)
y_pred = pipeline.predict(X_test_scaled)
r2_score = pipeline.score(X_test_scaled, y_test)

correct_predictions = (y_pred == y_test)
correct_percentage = (sum(correct_predictions) / len(y_test)) * 100
print(correct_percentage)

91.11111111111111


Since we are classifiying the data, so `pipeline.score(...)` will return the accuracy of the model. Unlike regression models, use `pipeline.r2_score(...)` instead.

In [83]:
print(f"Classification report: {classification_report(y_test, y_pred)}")

Classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.82      0.93      0.87        15
           2       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45

