In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load and prepare the data
def load_and_prepare_data(path='train_data.csv', sep=';'):
    """Read the training CSV and return a numeric DataFrame ready for modelling.

    Steps, in order:
      1. read ``path`` using ``sep`` as the field delimiter;
      2. label-encode every listed categorical feature column that is present;
      3. drop the ``case_id`` / ``patientid`` columns if they exist;
      4. fill missing numeric values with the column mean;
      5. map the ``Stay`` target labels to the integers 0..10.

    Args:
        path: CSV file to read. Defaults to ``'train_data.csv'``.
        sep: Field delimiter passed to ``pandas.read_csv``. Defaults to ``';'``.

    Returns:
        The prepared ``pandas.DataFrame``. When a ``Stay`` column exists it
        holds integer class codes; labels missing from the mapping become 0.
    """
    data = pd.read_csv(path, sep=sep)

    # 'Stay' is intentionally NOT in this list. Label-encoding it first turned
    # the labels into integers, so the string-keyed mapping below matched
    # nothing and the whole target collapsed to 0 after fillna(0).
    categorical_columns = ['Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code',
                           'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission',
                           'Severity of Illness', 'Age']
    for col in categorical_columns:
        if col in data.columns:
            # A fresh encoder per column keeps the columns independent.
            data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    data = data.drop(columns=['case_id', 'patientid'], errors='ignore')
    data = data.fillna(data.mean(numeric_only=True))

    if 'Stay' in data.columns:
        stay_mapping = {'0-10': 0, '11-20': 1, '21-30': 2, '31-40': 3, '41-50': 4,
                        '51-60': 5, '61-70': 6, '71-80': 7, '81-90': 8, '91-100': 9,
                        'More than 100 Days': 10}
        # Unknown or missing labels fall back to class 0 (as before); the cast
        # keeps the target integer-typed even when that fallback is used.
        data['Stay'] = data['Stay'].map(stay_mapping).fillna(0).astype(int)

    return data

# KNN
class KNN:
    """k-nearest-neighbours classifier with selectable metric and voting.

    Args:
        k: Number of neighbours consulted for each prediction.
        distance_metric: ``'euclidean'``, ``'manhattan'`` or ``'cosine'``.
            Any other value falls back to the Euclidean distance.
        weight_type: ``'uniform'`` for a plain majority vote; any other value
            selects inverse-distance weighted voting.
    """

    def __init__(self, k=3, distance_metric='euclidean', weight_type='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.weight_type = weight_type

    def calculate_distance(self, x1, x2):
        """Return the distance between two 1-D vectors under the configured metric.

        For the cosine metric, a zero-norm vector has no defined angle; the
        similarity is then taken to be 0, so the distance is 1.0 instead of NaN.
        """
        x1 = np.asarray(x1, dtype=float)
        x2 = np.asarray(x2, dtype=float)
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))
        if self.distance_metric == 'cosine':
            denom = np.linalg.norm(x1) * np.linalg.norm(x2)
            if denom == 0:
                return 1.0
            return 1 - np.dot(x1, x2) / denom
        # 'euclidean' and the fallback for unrecognised metric names.
        return np.linalg.norm(x1 - x2)

    def fit(self, X, y):
        """Store the training samples; arrays and nested lists are both accepted."""
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)

    def _distances_to_train(self, x, train_norms=None):
        """Return the distances from query ``x`` to every training row at once."""
        if self.distance_metric == 'manhattan':
            return np.abs(self.X_train - x).sum(axis=1)
        if self.distance_metric == 'cosine':
            if train_norms is None:
                train_norms = np.linalg.norm(self.X_train, axis=1)
            denom = train_norms * np.linalg.norm(x)
            # Rows where the denominator is zero keep similarity 0 (distance 1).
            similarity = np.zeros(len(self.X_train))
            np.divide(self.X_train @ x, denom, out=similarity, where=denom > 0)
            return 1.0 - similarity
        return np.linalg.norm(self.X_train - x, axis=1)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns:
            ``numpy.ndarray`` with one predicted label per query row.
        """
        queries = np.asarray(X, dtype=float)
        # Training-row norms do not depend on the query, so compute them once.
        train_norms = (np.linalg.norm(self.X_train, axis=1)
                       if self.distance_metric == 'cosine' else None)

        predictions = []
        for x in queries:
            distances = self._distances_to_train(x, train_norms)
            # Stable sort: equally distant neighbours keep training order,
            # which makes tie handling deterministic.
            nearest_indices = np.argsort(distances, kind='stable')[:self.k]
            k_nearest = self.y_train[nearest_indices]

            if self.weight_type == 'uniform':
                prediction = Counter(k_nearest).most_common(1)[0][0]
            else:
                # The small constant avoids division by zero for exact matches.
                weights = 1 / (distances[nearest_indices] + 1e-8)
                vote_weight = {}
                for label, weight in zip(k_nearest, weights):
                    vote_weight[label] = vote_weight.get(label, 0) + weight
                prediction = max(vote_weight, key=vote_weight.get)

            predictions.append(prediction)
        return np.array(predictions)

# Decision tree
class DecisionTree:
    """Binary classification tree with threshold splits of the form ``feature <= t``.

    The fitted tree is stored in ``self.tree``. An internal node is a dict with
    keys ``'f'`` (feature index), ``'t'`` (threshold), ``'l'`` (left subtree)
    and ``'r'`` (right subtree); a leaf is the predicted label itself.

    Args:
        max_depth: Maximum number of splits along any root-to-leaf path.
        min_samples_leaf: Minimum number of samples on each side of a split.
        criterion: ``'gini'`` for Gini impurity; any other value uses entropy.
    """

    def __init__(self, max_depth=5, min_samples_leaf=1, criterion='gini'):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion

    def fit(self, X, y):
        """Build the tree from samples ``X`` (2-D) and labels ``y`` (1-D)."""
        self.tree = self._build(np.asarray(X), np.asarray(y))

    @staticmethod
    def _majority(y):
        """Return the most frequent label in ``y`` (first seen wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def _build(self, X, y, depth=0):
        """Recursively grow a subtree and return it (a dict node or a leaf label)."""
        n = len(y)
        if depth >= self.max_depth or np.unique(y).size == 1 or n < 2 * self.min_samples_leaf:
            return self._majority(y)

        # A split must leave at least one sample on each side even when
        # min_samples_leaf is 0; otherwise the empty child cannot form a leaf.
        min_leaf = max(self.min_samples_leaf, 1)

        best_feature, best_thresh, best_gain = None, None, -np.inf
        parent_impurity = self._impurity(y)

        for f in range(X.shape[1]):
            column = X[:, f]
            for t in np.unique(column):
                go_left = column <= t
                n_left = int(go_left.sum())
                n_right = n - n_left
                if n_left < min_leaf or n_right < min_leaf:
                    continue
                gain = (parent_impurity
                        - n_left / n * self._impurity(y[go_left])
                        - n_right / n * self._impurity(y[~go_left]))
                if gain > best_gain:
                    best_feature, best_thresh, best_gain = f, t, gain

        if best_feature is None:
            return self._majority(y)

        mask = X[:, best_feature] <= best_thresh
        return {
            'f': best_feature,
            't': best_thresh,
            'l': self._build(X[mask], y[mask], depth + 1),
            'r': self._build(X[~mask], y[~mask], depth + 1),
        }

    def _impurity(self, y):
        """Return the Gini impurity or the entropy (in bits) of the labels ``y``.

        Class frequencies come from ``np.unique``, so labels may be of any
        sortable type (ints, floats, strings), not only non-negative integers.
        """
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        if self.criterion == 'gini':
            return 1 - np.sum(probs ** 2)
        # Every entry of probs is > 0, so the logarithm needs no epsilon.
        return -np.sum(probs * np.log2(probs))

    def _predict_one(self, x, tree):
        """Walk ``tree`` from the root down to a leaf for one sample ``x``."""
        while isinstance(tree, dict):
            tree = tree['l'] if x[tree['f']] <= tree['t'] else tree['r']
        return tree

    def predict(self, X):
        """Return a ``numpy.ndarray`` with the predicted label for each row of ``X``."""
        return np.array([self._predict_one(x, self.tree) for x in np.asarray(X)])

# Naive Bayes
class NaiveBayes:
    """Naive Bayes classifier with Gaussian or categorical feature likelihoods.

    Args:
        smoothing: Additive (Laplace) smoothing constant used by the
            categorical model. The Gaussian model does not use it.
        distribution: ``'gaussian'`` for per-class normal densities; any other
            value selects the categorical (value-frequency) model.
    """

    def __init__(self, smoothing=1.0, distribution='gaussian'):
        self.smoothing = smoothing
        self.distribution = distribution

    def fit(self, X, y):
        """Estimate class priors and per-class feature statistics.

        After fitting, ``self.stats[c]`` is ``(mean, std)`` for the Gaussian
        model, or a list with one ``{value: probability}`` dict per feature
        for the categorical model.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.priors = {c: np.mean(y == c) for c in self.classes}
        self.stats = {}
        self.class_counts = {}

        gaussian = self.distribution == 'gaussian'
        if not gaussian:
            # Distinct values per feature are class-independent: compute once.
            feature_values = [np.unique(X[:, i]) for i in range(X.shape[1])]

        for c in self.classes:
            X_c = X[y == c]
            n_c = len(X_c)
            self.class_counts[c] = n_c
            if gaussian:
                # The small constant keeps sigma non-zero for constant features.
                self.stats[c] = (np.mean(X_c, axis=0), np.std(X_c, axis=0) + 1e-8)
            else:
                self.stats[c] = [
                    {v: (np.sum(X_c[:, i] == v) + self.smoothing) /
                        (n_c + self.smoothing * len(values))
                     for v in values}
                    for i, values in enumerate(feature_values)
                ]

    def _log_likelihood(self, x, c):
        """Return ``log P(x | c)`` under the feature-independence assumption."""
        if self.distribution == 'gaussian':
            mu, sigma = self.stats[c]
            z = (np.asarray(x, dtype=float) - mu) / sigma
            return np.sum(-0.5 * np.log(2 * np.pi) - np.log(sigma) - 0.5 * z ** 2)

        n_c = self.class_counts[c]
        probs = []
        for i, table in enumerate(self.stats[c]):
            # A value never seen in training gets the same probability as a
            # value with a zero count in class c under the smoothing formula.
            unseen = self.smoothing / (n_c + self.smoothing * len(table))
            probs.append(table.get(x[i], unseen))
        with np.errstate(divide='ignore'):  # log(0) -> -inf when smoothing == 0
            return np.sum(np.log(probs))

    def _likelihood(self, x, c):
        """Return ``P(x | c)``; may underflow to 0 when there are many features."""
        return np.exp(self._log_likelihood(x, c))

    def predict(self, X):
        """Return the most probable class for every row of ``X``.

        Scores are compared in log space. Multiplying many small densities
        underflows to 0 for every class, which would make the first class win
        regardless of the data.
        """
        predictions = []
        for x in X:
            scores = [np.log(self.priors[c]) + self._log_likelihood(x, c)
                      for c in self.classes]
            predictions.append(self.classes[int(np.argmax(scores))])
        return np.array(predictions)

# Accuracy
def calculate_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to arrays first, so plain lists are compared
    element-wise rather than as whole objects.

    Args:
        y_true: Ground-truth labels (array-like).
        y_pred: Predicted labels (array-like) of the same shape.

    Returns:
        The mean of the element-wise equality, a float in [0, 1].

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

# Tests

def main():
    """Train the three classifiers on a 70/30 split and print each test accuracy."""
    frame = load_and_prepare_data()
    features = frame.drop('Stay', axis=1).values
    target = frame['Stay'].values
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    print("Szybki test trzech modeli:")

    # Each entry pairs the printed caption with the model to evaluate;
    # models are fitted and scored one after another, in this order.
    experiments = (
        ("KNN Accuracy:", KNN(k=5)),
        ("Decision Tree Accuracy:", DecisionTree(max_depth=5)),
        ("Naive Bayes Accuracy:", NaiveBayes()),
    )
    for caption, model in experiments:
        model.fit(X_train, y_train)
        print(caption, calculate_accuracy(y_test, model.predict(X_test)))


if __name__ == '__main__':
    main()


Szybki test trzech modeli:
