# In [9]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
def load_and_prepare_data(sample_frac=0.3, path='train_data.csv', sep=';'):
    """Load the training CSV and return a model-ready DataFrame.

    Steps: read the file, draw a reproducible random sample, drop the
    identifier columns, encode ``Stay`` as an integer class, impute numeric
    gaps with column means and one-hot encode the remaining text columns.

    Args:
        sample_frac: Fraction of rows to keep (sampled with
            ``random_state=42``).
        path: CSV file to read. Defaults to ``'train_data.csv'``.
        sep: Field delimiter of the CSV. Defaults to ``';'``.

    Returns:
        A DataFrame in which ``Stay`` (when it held text labels) is an
        integer class in 0..4 and every text column has been expanded into
        dummy columns. Rows whose ``Stay`` label is missing or not one of
        the known ranges are removed.
    """
    data = pd.read_csv(path, sep=sep)
    data = data.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
    data = data.drop(columns=['case_id', 'patientid'], errors='ignore')

    # Encode Stay only when it still holds text labels. Checking for
    # "not numeric" (rather than dtype == object) also covers pandas
    # versions that infer a dedicated string dtype for text columns.
    if 'Stay' in data.columns and not pd.api.types.is_numeric_dtype(data['Stay']):
        stay_mapping = {
            '0-10': 0,
            '11-20': 1,
            '21-30': 2,
            '31-40': 3,
            # Every range above 40 days is merged into the single class 4.
            '41-50': 4,
            '51-60': 4,
            '61-70': 4,
            '71-80': 4,
            '81-90': 4,
            '91-100': 4,
            'More than 100 Days': 4
        }
        stay_codes = data['Stay'].map(stay_mapping)
        # A missing or unrecognised label carries no usable target. Filling
        # it with 0 would silently relabel the row as '0-10', so drop it.
        has_label = stay_codes.notna()
        data = data.loc[has_label].reset_index(drop=True)
        data['Stay'] = stay_codes[has_label].astype(int).to_numpy()

    # Impute numeric gaps with the column mean of the sampled data.
    data = data.fillna(data.mean(numeric_only=True))
    # With no explicit column list, get_dummies encodes every
    # object/string/category column, i.e. all remaining text features.
    data = pd.get_dummies(data)

    return data


def tune_knn(X_train, y_train, X_test, y_test):
    """Grid-search a k-nearest-neighbours classifier and report the best setting.

    Every combination of ``n_neighbors``, ``metric`` and ``weights`` from the
    fixed grids below is fitted on the training split and scored with
    accuracy on the evaluation split.

    NOTE: the winner is picked on the same split that produces the reported
    accuracy, so that number is an optimistic estimate.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Evaluation feature matrix.
        y_test: Evaluation labels.

    Returns:
        ``(best_params, best_acc)`` where ``best_params`` is the tuple
        ``(k, metric, weights)`` of the best-scoring configuration.
    """
    best_acc = None
    best_params = None
    for k in [5, 10, 15, 20]:
        for metric in ['euclidean', 'manhattan']:
            for weights in ['uniform', 'distance']:
                knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights=weights, n_jobs=-1)
                knn.fit(X_train, y_train)
                y_pred = knn.predict(X_test)
                acc = accuracy_score(y_test, y_pred)
                # Always record the first candidate: starting from
                # best_acc = 0 with a strict ">" left best_params as None
                # (and crashed the print below) when every score was 0.0.
                if best_params is None or acc > best_acc:
                    best_acc = acc
                    best_params = (k, metric, weights)
    print(f"Best KNN params: k={best_params[0]}, metric={best_params[1]}, weights={best_params[2]}, accuracy={best_acc:.4f}")
    return best_params, best_acc

def tune_decision_tree(X_train, y_train, X_test, y_test):
    """Grid-search a decision tree classifier and report the best setting.

    Every combination of ``max_depth``, ``min_samples_leaf`` and
    ``min_samples_split`` from the fixed grids below is fitted on the
    training split (with ``random_state=42``) and scored with accuracy on
    the evaluation split.

    NOTE: the winner is picked on the same split that produces the reported
    accuracy, so that number is an optimistic estimate.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Evaluation feature matrix.
        y_test: Evaluation labels.

    Returns:
        ``(best_params, best_acc)`` where ``best_params`` is the tuple
        ``(max_depth, min_samples_leaf, min_samples_split)`` of the
        best-scoring configuration.
    """
    best_acc = None
    best_params = None
    for depth in [5, 7, 10, 15]:
        for min_leaf in [1, 5, 10]:
            for min_split in [2, 5, 10]:
                dt = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=min_leaf,
                                            min_samples_split=min_split, random_state=42)
                dt.fit(X_train, y_train)
                y_pred = dt.predict(X_test)
                acc = accuracy_score(y_test, y_pred)
                # Always record the first candidate: starting from
                # best_acc = 0 with a strict ">" left best_params as None
                # (and crashed the print below) when every score was 0.0.
                if best_params is None or acc > best_acc:
                    best_acc = acc
                    best_params = (depth, min_leaf, min_split)
    print(f"Best Decision Tree params: max_depth={best_params[0]}, min_samples_leaf={best_params[1]}, min_samples_split={best_params[2]}, accuracy={best_acc:.4f}")
    return best_params, best_acc

def main():
    """Run the whole experiment: load data, split, scale, then compare models.

    Loads a 10% sample, holds out 30% of it for evaluation, standardises
    the features using statistics from the training part only, tunes KNN
    and a decision tree, and finally scores an untuned Gaussian Naive Bayes.
    """
    from sklearn.naive_bayes import GaussianNB

    frame = load_and_prepare_data(sample_frac=0.1)
    target = frame['Stay'].to_numpy()
    features = frame.drop(columns='Stay').to_numpy().astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    # Fit the scaler on the training part only, then apply it to both parts.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    print("Tuning KNN...")
    tune_knn(X_train, y_train, X_test, y_test)

    print("Tuning Decision Tree...")
    tune_decision_tree(X_train, y_train, X_test, y_test)

    # Naive Bayes is evaluated without tuning because it has fewer parameters.
    bayes = GaussianNB().fit(X_train, y_train)
    bayes_accuracy = accuracy_score(y_test, bayes.predict(X_test))
    print(f"Naive Bayes Accuracy: {bayes_accuracy:.4f}")

# Run the experiment only when this file is executed as a script, so that
# importing it (e.g. to reuse the helpers) has no side effects.
if __name__ == "__main__":
    main()


# Tuning KNN...
# Best KNN params: k=20, metric=manhattan, weights=uniform, accuracy=0.3753
# Tuning Decision Tree...
# Best Decision Tree params: max_depth=7, min_samples_leaf=10, min_samples_split=2, accuracy=0.4578
# Naive Bayes Accuracy: 0.3804
