In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pandas import Series, read_csv
from imblearn.under_sampling import RandomUnderSampler

from logistic_regression.linear_model import CustomLogisticRegression
from tree_models.decision_tree import CustomDecisionTree

In [57]:
def print_all_scores(y_true: Series, y_pred: Series) -> None:
    acc_score = accuracy_score(y_true, y_pred)
    print("accuracy: ", acc_score)
    pre_score = precision_score(y_true, y_pred)
    print("precision: ", pre_score)
    rec_score = recall_score(y_true, y_pred)
    print("recall: ", rec_score)
    f_score = f1_score(y_true, y_pred)
    print("f1_score: ", f_score)
    conf_matrix = confusion_matrix(y_true, y_pred)
    print("Confusion matrix: \n", conf_matrix)

In [58]:
DATASET = read_csv('../data/train/analysis.csv')
DATASET.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58989 entries, 0 to 58988
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   cardio      58989 non-null  bool   
 1   is_healthy  58989 non-null  bool   
 2   ap_lo       58989 non-null  float64
 3   gluc        58989 non-null  object 
dtypes: bool(2), float64(1), object(1)
memory usage: 1.0+ MB


In [59]:
DATASET[DATASET.duplicated()].count()

cardio        58750
is_healthy    58750
ap_lo         58750
gluc          58750
dtype: int64

In [60]:
CATEGORICAL_FEATURES = ["gluc"]

In [61]:
X, Y = DATASET.drop(["cardio"], axis=1), DATASET['cardio'] 

In [62]:
DATASET.shape

(58989, 4)

In [63]:
from sklearn.preprocessing import OneHotEncoder
from pandas import DataFrame, concat

one_hot_encoder = OneHotEncoder(sparse_output=False)
dataset_encoded = one_hot_encoder.fit_transform(X[CATEGORICAL_FEATURES])
hot_dataset = DataFrame(dataset_encoded, columns=one_hot_encoder.get_feature_names_out(CATEGORICAL_FEATURES))
X = concat([X, hot_dataset], axis=1)

X = X.drop(CATEGORICAL_FEATURES, axis=1)
X.head()

Unnamed: 0,is_healthy,ap_lo,gluc_ABOVE_NORMAL,gluc_NORMAL,gluc_WELL_ABOVE_NORMAL
0,True,80.0,0.0,1.0,0.0
1,False,90.0,0.0,1.0,0.0
2,False,70.0,0.0,1.0,0.0
3,True,60.0,0.0,1.0,0.0
4,False,80.0,1.0,0.0,0.0


In [64]:

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
scale = StandardScaler()
under_sampler = RandomUnderSampler()

X_train, X_test = scale.fit_transform(X_train), scale.fit_transform(X_test)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)
print(X_train)

[[-0.66889467  0.06005484 -0.26742764  0.40165032 -0.27889499]
 [-0.66889467  0.06005484 -0.26742764  0.40165032 -0.27889499]
 [ 1.49500369  0.06005484 -0.26742764  0.40165032 -0.27889499]
 ...
 [-0.66889467  1.33618924 -0.26742764  0.40165032 -0.27889499]
 [-0.66889467  1.33618924 -0.26742764  0.40165032 -0.27889499]
 [-0.66889467 -1.21607956  3.73932928 -2.48972785 -0.27889499]]


In [65]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_predict = log_reg.predict(X_test)
print_all_scores(y_predict, y_test)

accuracy:  0.6194270215290727
precision:  0.7234975496517926
recall:  0.5499460837172826
f1_score:  0.6248955722639933
Confusion matrix: 
 [[5352 2144]
 [4591 5610]]


In [66]:
ran_for = RandomForestClassifier()
ran_for.fit(X_train, y_train)
y_predict = ran_for.predict(X_test)
print_all_scores(y_predict, y_test)

accuracy:  0.6167146974063401
precision:  0.7436161980913077
recall:  0.5459710254710728
f1_score:  0.6296478296478296
Confusion matrix: 
 [[5148 1988]
 [4795 5766]]


In [67]:
custom_log = CustomLogisticRegression()
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

custom_log.fit(X_train, y_train)
y_predict = custom_log.predict(X_test)
print_all_scores(y_predict, y_test)

accuracy:  0.61027292761485
precision:  0.7549651792623162
recall:  0.5394894479771449
f1_score:  0.6292932007524858
Confusion matrix: 
 [[4946 1900]
 [4997 5854]]


In [87]:
custom_tree = CustomDecisionTree(max_depth=5)
custom_tree.fit(X_train, y_train)
y_predict = custom_tree.predict(X_test)
print_all_scores(y_predict, y_test)