In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import warnings

# Silence every warning category for the rest of the process.
# NOTE(review): this is a global filter, so deprecation and other library
# warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

def load_and_prepare_data(sample_frac=0.99, path='train_data.csv', sep=';'):
    """Load the dataset, sample it, and return an all-numeric DataFrame.

    Processing steps, in order:

    1. Read the CSV at ``path`` using ``sep`` as the delimiter.
    2. Keep a reproducible random share of the rows (``random_state=42``)
       and renumber the index from zero.
    3. Label-encode each listed categorical column that is present,
       including the target column ``Stay``.
    4. Drop the identifier columns ``case_id`` and ``patientid`` if present.
    5. Fill the remaining missing values with the per-column mean.

    Args:
        sample_frac: Fraction of rows to keep. Note that the default keeps
            99% of the data; pass ``0.1`` explicitly for a 10% sample.
        path: Location of the CSV file to read.
        sep: Field delimiter used in the CSV file.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    data = pd.read_csv(path, sep=sep)
    data = data.sample(frac=sample_frac, random_state=42).reset_index(drop=True)

    categorical_columns = ['Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code',
                           'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission',
                           'Severity of Illness', 'Age', 'Stay']

    for col in categorical_columns:
        if col in data.columns:
            # Values are stringified first, so missing entries are encoded as
            # their own class rather than being left as NaN. A fresh encoder
            # per column makes it explicit that no fitted state is shared.
            # LabelEncoder numbers classes in sorted order of the strings.
            data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    data = data.drop(['case_id', 'patientid'], axis=1, errors='ignore')

    # Only numeric columns contribute a mean; columns encoded above contain
    # no NaN any more, so this affects the untouched numeric columns.
    data = data.fillna(data.mean(numeric_only=True))

    # 'Stay' is already integer-coded by the loop above. The former
    # string-to-int mapping that ran here was guarded by an `object` dtype
    # check that could never be true at this point, so it has been removed.
    return data

def calculate_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to NumPy arrays first. Without that, comparing
    two plain lists with ``==`` yields a single bool, and the "accuracy"
    silently collapses to 0.0 or 1.0.

    Args:
        y_true: Array-like of reference labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality mask (a NumPy float).

    Raises:
        ValueError: If the two inputs do not have the same shape. This
            prevents accidental broadcasting such as (n,) against (n, 1).
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"shape mismatch: y_true {actual.shape} vs y_pred {predicted.shape}"
        )
    return np.mean(actual == predicted)

def main():
    """Fit three scikit-learn classifiers on a 10% sample and print accuracy.

    The prepared data is split 70/30 into train and test parts with a fixed
    seed. K-nearest neighbours, a depth-limited decision tree and Gaussian
    naive Bayes are then each fitted on the training part and scored on the
    test part with ``calculate_accuracy``.
    """
    dataset = load_and_prepare_data(sample_frac=0.1)  # 10% of the rows

    # 'Stay' is the target; every other column is used as a feature.
    features = dataset.drop('Stay', axis=1).values.astype(np.float32)
    target = dataset['Stay'].values

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    print("Szybki test trzech modeli (scikit-learn):")

    # (printed label, estimator) pairs, evaluated in this order.
    candidates = (
        # KNN, using all available cores for the neighbour search
        ("KNN Accuracy:",
         KNeighborsClassifier(n_neighbors=5, weights='uniform', n_jobs=-1)),
        # Decision tree
        ("Decision Tree Accuracy:",
         DecisionTreeClassifier(max_depth=5, random_state=42)),
        # Naive Bayes
        ("Naive Bayes Accuracy:", GaussianNB()),
    )

    for label, model in candidates:
        model.fit(X_train, y_train)
        print(label, calculate_accuracy(y_test, model.predict(X_test)))

# Run the model comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


Szybki test trzech modeli (scikit-learn):
KNN Accuracy: 0.2414695415532761
Decision Tree Accuracy: 0.39669248482311076
Naive Bayes Accuracy: 0.3604772870002093
