In [6]:
import pandas as pd

df = pd.read_csv('data/lpsa.data', header=None)

df[0] = (df[0] > 0).astype(int)

X = df.drop(0, axis=1)

y = df[0]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0, dual=False, C=10, fit_intercept=True).fit(X_train, y_train)

lr_y_pred = lr.predict(X_test)

df = DecisionTreeClassifier(random_state=0, max_depth=5, min_samples_split=2).fit(X_train, y_train)

df_y_pred = df.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score, recall_score

print(accuracy_score(y_test, lr_y_pred))

print(accuracy_score(y_test, df_y_pred))

print(recall_score(y_test, lr_y_pred, average='macro'))

print(recall_score(y_test, df_y_pred, average='macro'))



0.9130434782608695
0.9130434782608695
0.4772727272727273
0.9545454545454546


In [13]:
from sklearn.model_selection import GridSearchCV

lr_param_grid = {'C': [10, 100, 1000], 'penalty': ['l1', 'l2'], 'solver': ['saga', 'sag']}

df_param_grid = {'max_depth': [3, 4, 5], 'min_samples_split': [2, 3, 4]}

lr_cv = GridSearchCV(estimator=lr, param_grid=lr_param_grid, cv=5, scoring='accuracy').fit(X_train, y_train)

df_cv = GridSearchCV(estimator=df, param_grid=df_param_grid, cv=5, scoring='accuracy').fit(X_train, y_train)



15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

 0

In [14]:
best_lr_model = lr_cv.best_estimator_

best_df_model = df_cv.best_estimator_

print(lr_cv.best_params_)

print(df_cv.best_params_)

{'C': 100, 'penalty': 'l2', 'solver': 'sag'}
{'max_depth': 3, 'min_samples_split': 2}


In [15]:
best_lr_y_pred = best_lr_model.predict(X_test)

best_df_y_pred = best_df_model.predict(X_test)

print(accuracy_score(y_test, best_lr_y_pred))

print(accuracy_score(y_test, best_df_y_pred))

print(recall_score(y_test, best_lr_y_pred, average='macro'))

print(recall_score(y_test, best_df_y_pred, average='macro'))


0.9130434782608695
0.9130434782608695
0.4772727272727273
0.9545454545454546


In [16]:
best_df_y_pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0])

In [17]:
from sklearn.metrics import classification_report

print(classification_report(y_test, best_df_y_pred))

              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       1.00      0.91      0.95        22

    accuracy                           0.91        23
   macro avg       0.67      0.95      0.73        23
weighted avg       0.97      0.91      0.93        23

