In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing as prep
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn import set_config

In [97]:
set_config(display="diagram")

https://optuna.readthedocs.io/en/stable/index.html

In [98]:
!pip install optuna



In [99]:
import optuna

In [100]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
           'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

# Load the dataset into a pandas DataFrame
heart_data = pd.read_csv(url, names=columns)


In [101]:
heart_data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [102]:
# Assuming you have a DataFrame called `df`
for column in heart_data.columns:
    unique_values = heart_data[column].unique()
    print(f"Unique values in column '{column}':")
    print(unique_values)
    print("\n")

Unique values in column 'age':
[63. 67. 37. 41. 56. 62. 57. 53. 44. 52. 48. 54. 49. 64. 58. 60. 50. 66.
 43. 40. 69. 59. 42. 55. 61. 65. 71. 51. 46. 45. 39. 68. 47. 34. 35. 29.
 70. 77. 38. 74. 76.]


Unique values in column 'sex':
[1. 0.]


Unique values in column 'cp':
[1. 4. 3. 2.]


Unique values in column 'trestbps':
[145. 160. 120. 130. 140. 172. 150. 110. 132. 117. 135. 112. 105. 124.
 125. 142. 128. 170. 155. 104. 180. 138. 108. 134. 122. 115. 118. 100.
 200.  94. 165. 102. 152. 101. 126. 174. 148. 178. 158. 192. 129. 144.
 123. 136. 146. 106. 156. 154. 114. 164.]


Unique values in column 'chol':
[233. 286. 229. 250. 204. 236. 268. 354. 254. 203. 192. 294. 256. 263.
 199. 168. 239. 275. 266. 211. 283. 284. 224. 206. 219. 340. 226. 247.
 167. 230. 335. 234. 177. 276. 353. 243. 225. 302. 212. 330. 175. 417.
 197. 198. 290. 253. 172. 273. 213. 305. 216. 304. 188. 282. 185. 232.
 326. 231. 269. 267. 248. 360. 258. 308. 245. 270. 208. 264. 321. 274.
 325. 235. 257. 164. 141. 252. 2

In [103]:
# Replace '?' with NaN
heart_data.replace('?', pd.NA, inplace=True)
# Convert columns to numeric where possible
heart_disease_data = heart_data.apply(pd.to_numeric, errors='coerce')
# Drop rows with missing values (or handle as needed)
heart_data.dropna(inplace=True)


In [104]:

num_cols =heart_data.select_dtypes(include=np.number).columns
cat_cols = heart_data.columns[heart_data.dtypes == 'object']

print(f"We have {len(num_cols)} numeric columns: {', '.join(num_cols)}")
print(f"And {len(cat_cols)} categorical columns: {', '.join(cat_cols)}")

We have 12 numeric columns: age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, target
And 2 categorical columns: ca, thal


In [105]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    float64
 1   sex       297 non-null    float64
 2   cp        297 non-null    float64
 3   trestbps  297 non-null    float64
 4   chol      297 non-null    float64
 5   fbs       297 non-null    float64
 6   restecg   297 non-null    float64
 7   thalach   297 non-null    float64
 8   exang     297 non-null    float64
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    float64
 11  ca        297 non-null    object 
 12  thal      297 non-null    object 
 13  target    297 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 34.8+ KB


In [106]:
heart_data['ca'] = pd.to_numeric(heart_data['ca'], errors='coerce').astype('Int64')
heart_data['thal'] = pd.to_numeric(heart_data['thal'], errors='coerce').astype('Int64')

# Check if the columns have been converted
display(heart_data.dtypes)

Unnamed: 0,0
age,float64
sex,float64
cp,float64
trestbps,float64
chol,float64
fbs,float64
restecg,float64
thalach,float64
exang,float64
oldpeak,float64


In [107]:
heart_data.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,160
1,54
2,35
3,35
4,13


In [108]:
heart_data['target'] =heart_data['target'].apply(lambda x: 1 if x != 0 else 0)

In [109]:
heart_data.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,160
1,137


In [110]:
data = heart_data.drop(columns = ['target'])
target = heart_data['target']

In [111]:
num_cols =data.select_dtypes(include=np.number).columns
num_cols

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [112]:
random_state = 42
X_train, X_val, y_train, y_val = train_test_split(data,target,
                                                  test_size = 0.2,
                                                  random_state = random_state)


The LogisticRegression class can be configured for multinomial logistic regression by setting the “multi_class” argument to “multinomial” and the “solver” argument to a solver that supports multinomial logistic regression, such as “lbfgs“.

define the multinomial logistic regression model

model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

The multinomial logistic regression model will be fit using cross-entropy loss and will predict the integer value for each integer encoded class labe

In [113]:

def objective(trial):

    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])

    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()


    degree = trial.suggest_int("degree", 1, 5)
    interaction = trial.suggest_categorical("interaction", [True, False])

    PF = PolynomialFeatures(degree=degree,
                        include_bias=False,
                        interaction_only=interaction)

    numerical_pipe_poly = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('poly_features', PF),
        ('scaler', scaler),
    ])

    preprocessors = ColumnTransformer(transformers=[
        ('num', numerical_pipe_poly, cat_cols),
    ])

    C = trial.suggest_float("С", 1e-2, 1e2, log=True)
    penalty = trial.suggest_categorical("penalty", ['l1','l2'])

    estimator = LogisticRegression(C = C,
                                   penalty = penalty,
                                   random_state=42,
                                   max_iter = 10000,
                                   solver='liblinear')
    pipeline = make_pipeline(preprocessors, estimator)


    scorer = make_scorer(f1_score,
                         average='macro', zero_division = 0)

    cv=ShuffleSplit(n_splits=5,
                    random_state=42)

    score = cross_val_score(pipeline,
                            X_train, y_train,
                            scoring=scorer,
                            cv = cv
                            )
    trial.set_user_attr("score", score)

    final_score = score.mean()

    return final_score



In [114]:
result = optuna.create_study(direction="maximize",
                            study_name="Log_Reg"
                            )
result.optimize(objective,
               n_trials=10,
               )

[I 2024-10-11 15:28:28,171] A new study created in memory with name: Log_Reg
[I 2024-10-11 15:28:28,400] Trial 0 finished with value: 0.7100837997964172 and parameters: {'scalers': 'robust', 'degree': 3, 'interaction': False, 'С': 0.11282313308795684, 'penalty': 'l2'}. Best is trial 0 with value: 0.7100837997964172.
[I 2024-10-11 15:28:28,714] Trial 1 finished with value: 0.7684581093087367 and parameters: {'scalers': 'robust', 'degree': 2, 'interaction': False, 'С': 0.019232303988855236, 'penalty': 'l2'}. Best is trial 1 with value: 0.7684581093087367.
[I 2024-10-11 15:28:28,991] Trial 2 finished with value: 0.7509285202933116 and parameters: {'scalers': 'robust', 'degree': 2, 'interaction': True, 'С': 41.8966682825672, 'penalty': 'l2'}. Best is trial 1 with value: 0.7684581093087367.
[I 2024-10-11 15:28:29,364] Trial 3 finished with value: 0.7509285202933116 and parameters: {'scalers': 'robust', 'degree': 4, 'interaction': True, 'С': 0.2545933202308035, 'penalty': 'l1'}. Best is tria

In [115]:
result.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_degree,params_interaction,params_penalty,params_scalers,params_С,user_attrs_score,state
0,0,0.710084,2024-10-11 15:28:28.173869,2024-10-11 15:28:28.400069,0 days 00:00:00.226200,3,False,l2,robust,0.112823,"[0.6243478260869566, 0.7037037037037037, 0.695...",COMPLETE
1,1,0.768458,2024-10-11 15:28:28.411091,2024-10-11 15:28:28.713555,0 days 00:00:00.302464,2,False,l2,robust,0.019232,"[0.8693284936479129, 0.8222222222222222, 0.664...",COMPLETE
2,2,0.750929,2024-10-11 15:28:28.720798,2024-10-11 15:28:28.990925,0 days 00:00:00.270127,2,True,l2,robust,41.896668,"[0.8285714285714285, 0.7037037037037037, 0.695...",COMPLETE
3,3,0.750929,2024-10-11 15:28:28.994456,2024-10-11 15:28:29.363090,0 days 00:00:00.368634,4,True,l1,robust,0.254593,"[0.8285714285714285, 0.7037037037037037, 0.695...",COMPLETE
4,4,0.750929,2024-10-11 15:28:29.369178,2024-10-11 15:28:29.688169,0 days 00:00:00.318991,5,False,l2,minmax,60.132799,"[0.8285714285714285, 0.7037037037037037, 0.695...",COMPLETE
5,5,0.750929,2024-10-11 15:28:29.699801,2024-10-11 15:28:29.951424,0 days 00:00:00.251623,4,True,l1,standard,0.044509,"[0.8285714285714285, 0.7037037037037037, 0.695...",COMPLETE
6,6,0.750929,2024-10-11 15:28:29.954755,2024-10-11 15:28:30.093629,0 days 00:00:00.138874,5,True,l2,robust,4.322942,"[0.8285714285714285, 0.7037037037037037, 0.695...",COMPLETE
7,7,0.750929,2024-10-11 15:28:30.101127,2024-10-11 15:28:30.446844,0 days 00:00:00.345717,4,True,l1,robust,7.433249,"[0.8285714285714285, 0.7037037037037037, 0.695...",COMPLETE
8,8,0.750929,2024-10-11 15:28:30.454552,2024-10-11 15:28:30.703844,0 days 00:00:00.249292,4,False,l1,minmax,0.499362,"[0.8285714285714285, 0.7037037037037037, 0.695...",COMPLETE
9,9,0.720627,2024-10-11 15:28:30.707340,2024-10-11 15:28:31.233922,0 days 00:00:00.526582,3,False,l1,robust,9.285822,"[0.7078260869565217, 0.7037037037037037, 0.664...",COMPLETE


In [116]:
result.best_trial

FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.7684581093087367], datetime_start=datetime.datetime(2024, 10, 11, 15, 28, 28, 411091), datetime_complete=datetime.datetime(2024, 10, 11, 15, 28, 28, 713555), params={'scalers': 'robust', 'degree': 2, 'interaction': False, 'С': 0.019232303988855236, 'penalty': 'l2'}, user_attrs={'score': array([0.86932849, 0.82222222, 0.66433566, 0.79130435, 0.69509982])}, system_attrs={}, intermediate_values={}, distributions={'scalers': CategoricalDistribution(choices=('minmax', 'standard', 'robust')), 'degree': IntDistribution(high=5, log=False, low=1, step=1), 'interaction': CategoricalDistribution(choices=(True, False)), 'С': FloatDistribution(high=100.0, log=True, low=0.01, step=None), 'penalty': CategoricalDistribution(choices=('l1', 'l2'))}, trial_id=1, value=None)

Save cross-validation results

In [117]:
result.trials[6].user_attrs

{'score': array([0.82857143, 0.7037037 , 0.69509982, 0.83216783, 0.69509982])}

In [118]:
optuna.visualization.plot_optimization_history(result)

In [119]:
optuna.visualization.plot_slice(result)

In [121]:
optuna.visualization.plot_contour(result,['С','degree'])

In [122]:
optuna.visualization.plot_param_importances(result)

In [123]:
result.best_params

{'scalers': 'robust',
 'degree': 2,
 'interaction': False,
 'С': 0.019232303988855236,
 'penalty': 'l2'}

In [124]:
scalers =result.best_params['scalers']
if scalers == "minmax":
    scaler = MinMaxScaler()
elif scalers == "standard":
    scaler = StandardScaler()
else:
    scaler = RobustScaler()

Polynom = PolynomialFeatures(degree=result.best_params['degree'],
                        include_bias=False,
                        interaction_only=result.best_params['interaction'])

numerical_pipe_poly = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('poly_features', Polynom),
    ('scaler', scaler),
])

categorical_pipe = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent', )),
('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
])

preprocessors = ColumnTransformer(transformers=[
    ('num', numerical_pipe_poly, num_cols),
    #('cat', categorical_pipe, cat_columns)
])

estimator = LogisticRegression(C = result.best_params['С'],
                               penalty = result.best_params['penalty'],
                                random_state=42, max_iter = 10000,
                               solver = 'liblinear')
pipeline = make_pipeline(preprocessors, estimator)
pipeline