In [105]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from pandas import Series, read_csv
from imblearn.under_sampling import RandomUnderSampler

In [106]:
def print_all_scores(y_true: Series, y_pred: Series) -> None:
    acc_score = accuracy_score(y_true, y_pred)
    print("accuracy: ", acc_score)
    pre_score = precision_score(y_true, y_pred)
    print("precision: ", pre_score)
    rec_score = recall_score(y_true, y_pred)
    print("recall: ", rec_score)
    f_score = f1_score(y_true, y_pred)
    print("f1_score: ", f_score)
    conf_matrix = confusion_matrix(y_true, y_pred)
    print("Confusion matrix: \n", conf_matrix)

In [107]:
DATASET = read_csv('../data/train/analysis.csv')
DATASET.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58989 entries, 0 to 58988
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   cardio      58989 non-null  bool   
 1   is_healthy  58989 non-null  bool   
 2   ap_lo       58989 non-null  float64
 3   gluc        58989 non-null  object 
dtypes: bool(2), float64(1), object(1)
memory usage: 1.0+ MB


In [108]:
DATASET[DATASET.duplicated()].count()

cardio        58750
is_healthy    58750
ap_lo         58750
gluc          58750
dtype: int64

In [109]:
CATEGORICAL_FEATURES = ["gluc"]

In [110]:
X, Y = DATASET.drop(["cardio"], axis=1), DATASET['cardio'] 

In [111]:
DATASET.shape

(58989, 4)

In [112]:
from sklearn.preprocessing import OneHotEncoder
from pandas import DataFrame, concat

one_hot_encoder = OneHotEncoder(sparse_output=False)
dataset_encoded = one_hot_encoder.fit_transform(X[CATEGORICAL_FEATURES])
hot_dataset = DataFrame(dataset_encoded, columns=one_hot_encoder.get_feature_names_out(CATEGORICAL_FEATURES))
X = concat([X, hot_dataset], axis=1)

X = X.drop(CATEGORICAL_FEATURES, axis=1)
X.head()

Unnamed: 0,is_healthy,ap_lo,gluc_ABOVE_NORMAL,gluc_NORMAL,gluc_WELL_ABOVE_NORMAL
0,True,80.0,0.0,1.0,0.0
1,False,90.0,0.0,1.0,0.0
2,False,70.0,0.0,1.0,0.0
3,True,60.0,0.0,1.0,0.0
4,False,80.0,1.0,0.0,0.0


In [113]:




X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
scale = StandardScaler()
under_sampler = RandomUnderSampler()


X_train, X_test = scale.fit_transform(X_train), scale.fit_transform(X_test)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)
print(X_train)

[[-0.66586221  1.34046904 -0.26602127  0.40173164 -0.28035487]
 [-0.66586221  0.05880946 -0.26602127  0.40173164 -0.28035487]
 [-0.66586221  0.05880946 -0.26602127  0.40173164 -0.28035487]
 ...
 [ 1.50181221  0.05880946 -0.26602127  0.40173164 -0.28035487]
 [ 1.50181221  0.05880946 -0.26602127  0.40173164 -0.28035487]
 [ 1.50181221  0.05880946 -0.26602127  0.40173164 -0.28035487]]


In [114]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_predict = log_reg.predict(X_test)
print_all_scores(y_predict, y_test)

accuracy:  0.6160931231282138
precision:  0.7366511145671333
recall:  0.5441317250622247
f1_score:  0.6259222552582314
Confusion matrix: 
 [[5219 2032]
 [4762 5684]]


In [115]:
ran_for = RandomForestClassifier()
ran_for.fit(X_train, y_train)
y_predict = ran_for.predict(X_test)
print_all_scores(y_predict, y_test)

accuracy:  0.6173362716844663
precision:  0.7253758424053914
recall:  0.5460487804878049
f1_score:  0.6230657909384393
Confusion matrix: 
 [[5328 2119]
 [4653 5597]]
