# In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Wczytywanie i przygotowanie danych
def load_and_prepare_data(path='train_data.csv'):
    """Load the semicolon-separated training CSV and make it fully numeric.

    Steps, in order:

    1. Drop the identifier columns ``case_id`` and ``patientid`` if present.
    2. Convert the ``Stay`` target from its textual range to an ordinal
       class index 0..10.
    3. Replace every listed categorical feature with integer codes.
    4. Fill remaining missing values with the column mean.

    Args:
        path: Location of the CSV file. Defaults to ``'train_data.csv'``.

    Returns:
        pandas.DataFrame with the columns described above.
    """
    data = pd.read_csv(path, sep=';')

    # Identifier columns are removed before any further processing.
    data = data.drop(['case_id', 'patientid'], axis=1, errors='ignore')

    # The target must be mapped while it still holds the original text
    # labels. Label-encoding it first (as the feature columns are) would
    # leave integers that match none of the keys below, and the whole target
    # would collapse to a single class.
    if 'Stay' in data.columns:
        stay_mapping = {'0-10': 0, '11-20': 1, '21-30': 2, '31-40': 3, '41-50': 4,
                        '51-60': 5, '61-70': 6, '71-80': 7, '81-90': 8, '91-100': 9,
                        'More than 100 Days': 10}
        # Labels outside the mapping fall back to class 0.
        data['Stay'] = data['Stay'].map(stay_mapping).fillna(0).astype(int)

    categorical_columns = ['Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code',
                           'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission',
                           'Severity of Illness', 'Age']

    for col in categorical_columns:
        if col in data.columns:
            # Codes follow the sorted order of the values' string form.
            codes, _ = pd.factorize(data[col].astype(str), sort=True)
            data[col] = codes

    # numeric_only keeps the mean well defined if a text column remains.
    data = data.fillna(data.mean(numeric_only=True))

    return data

# 1. METODA K-NAJBLIŻSZYCH SĄSIADÓW (KNN)
class KNN:
    """k-nearest-neighbours classifier with selectable metric and voting.

    Attributes:
        k: Number of neighbours consulted per prediction.
        distance_metric: ``'euclidean'``, ``'manhattan'`` or ``'cosine'``;
            any other value is treated as Euclidean.
        weight_type: ``'uniform'`` for a plain majority vote; any other
            value selects inverse-distance weighted voting.
    """

    def __init__(self, k=3, distance_metric='euclidean', weight_type='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.weight_type = weight_type

    def calculate_distance(self, x1, x2):
        """Return the distance between two points under the chosen metric.

        For the cosine metric a zero-length vector has no direction; the
        distance is then defined as 1.0 instead of dividing by zero.
        """
        x1 = np.asarray(x1, dtype=float)
        x2 = np.asarray(x2, dtype=float)

        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))

        if self.distance_metric == 'cosine':
            denominator = np.linalg.norm(x1) * np.linalg.norm(x2)
            if denominator == 0:
                return 1.0
            return 1 - np.dot(x1, x2) / denominator

        # 'euclidean' and any unrecognised metric name.
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def _distances_to_train(self, x, train_norms=None):
        """Return the distances from ``x`` to every stored training row."""
        train = self.X_train

        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(train - x), axis=1)

        if self.distance_metric == 'cosine':
            if train_norms is None:
                train_norms = np.linalg.norm(train, axis=1)
            denominator = train_norms * np.linalg.norm(x)
            # Similarity stays 0 (distance 1) wherever a norm is zero.
            similarity = np.zeros(train.shape[0])
            defined = denominator > 0
            similarity[defined] = (train[defined] @ x) / denominator[defined]
            return 1 - similarity

        return np.sqrt(np.sum((train - x) ** 2, axis=1))

    def fit(self, X, y):
        """Store the training samples; no model is built ahead of time.

        Inputs are converted to NumPy arrays so positional indexing works
        for any array-like container.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns:
            numpy.ndarray with one predicted label per row.
        """
        X = np.asarray(X, dtype=float)

        # Training-row norms do not depend on the query, so compute them once.
        train_norms = None
        if self.distance_metric == 'cosine':
            train_norms = np.linalg.norm(self.X_train, axis=1)

        predictions = []
        for x in X:
            distances = self._distances_to_train(x, train_norms)
            # A stable sort keeps equally distant neighbours in training order.
            nearest = np.argsort(distances, kind='stable')[:self.k]
            labels = self.y_train[nearest]

            if self.weight_type == 'uniform':
                # Majority vote.
                prediction = Counter(labels).most_common(1)[0][0]
            else:
                # Inverse-distance weighting; the epsilon avoids division by
                # zero when the query coincides with a training point.
                weighted_votes = {}
                for dist, vote in zip(distances[nearest], labels):
                    weighted_votes[vote] = weighted_votes.get(vote, 0.0) + 1 / (dist + 1e-8)
                prediction = max(weighted_votes, key=weighted_votes.get)

            predictions.append(prediction)
        return np.array(predictions)

# 2. DRZEWO DECYZYJNE
class DecisionTree:
    """Binary classification tree grown greedily by impurity reduction.

    Internal nodes are dicts with the keys ``'feature'``, ``'threshold'``,
    ``'left'`` and ``'right'``; a leaf is the bare class label.

    Attributes:
        max_depth: Maximum number of splits on any root-to-leaf path.
        min_samples_leaf: Minimum number of samples on each side of a split.
        criterion: ``'gini'`` for Gini impurity; any other value selects
            Shannon entropy.
        tree: Root of the fitted tree, or ``None`` before ``fit``.
    """

    def __init__(self, max_depth=5, min_samples_leaf=1, criterion='gini'):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.tree = None

    def calculate_impurity(self, y):
        """Return the impurity of the label collection ``y`` (0 if empty)."""
        total = len(y)
        if total == 0:
            return 0

        proportions = [count / total for count in Counter(y).values()]

        if self.criterion == 'gini':
            return 1 - sum(p ** 2 for p in proportions)

        # Every proportion comes from a count of at least one, so the
        # logarithm is always defined and a pure node scores exactly zero.
        return -sum(p * np.log2(p) for p in proportions)

    def find_best_split(self, X, y):
        """Search all features and thresholds for the highest impurity gain.

        Returns:
            ``(feature, threshold, gain)``; ``feature`` and ``threshold`` are
            ``None`` (and ``gain`` is -1) when no admissible split exists.
        """
        best_feature = None
        best_threshold = None
        best_gain = -1

        n_samples = len(y)
        current_impurity = self.calculate_impurity(y)

        for feature in range(X.shape[1]):
            column = X[:, feature]
            # The largest value would send every sample to the left child,
            # so it can never produce a useful split.
            thresholds = np.unique(column)[:-1]

            for threshold in thresholds:
                left_mask = column <= threshold
                n_left = int(np.count_nonzero(left_mask))
                n_right = n_samples - n_left

                if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
                    continue

                left_impurity = self.calculate_impurity(y[left_mask])
                right_impurity = self.calculate_impurity(y[~left_mask])

                gain = current_impurity - (n_left / n_samples * left_impurity +
                                           n_right / n_samples * right_impurity)

                # Strict comparison: the first best split found is kept.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold, best_gain

    def build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)``.

        Growth stops, returning the majority label, when the depth limit is
        reached, the node is pure, the node is too small to be split into two
        admissible leaves, or no split yields a positive gain.
        """
        if (depth >= self.max_depth or
                len(np.unique(y)) == 1 or
                len(y) < 2 * self.min_samples_leaf):
            return Counter(y).most_common(1)[0][0]

        feature, threshold, gain = self.find_best_split(X, y)

        if feature is None or gain <= 0:
            return Counter(y).most_common(1)[0][0]

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        return {
            'feature': feature,
            'threshold': threshold,
            'left': self.build_tree(X[left_mask], y[left_mask], depth + 1),
            'right': self.build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def fit(self, X, y):
        """Grow the tree from training features ``X`` and labels ``y``.

        Inputs are converted to NumPy arrays so column slicing and boolean
        masking work for any array-like container.
        """
        self.tree = self.build_tree(np.asarray(X), np.asarray(y))

    def predict_single(self, x, tree):
        """Route one sample ``x`` down ``tree`` and return the leaf label."""
        node = tree
        while isinstance(node, dict):
            branch = 'left' if x[node['feature']] <= node['threshold'] else 'right'
            node = node[branch]
        return node

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self.predict_single(x, self.tree) for x in np.asarray(X)])

# 3. NAIWNY KLASYFIKATOR BAYESA
class NaiveBayes:
    """Naive Bayes classifier with Gaussian or discrete feature models.

    Attributes:
        smoothing: Additive (Laplace) smoothing constant. It is used only by
            the discrete model; the Gaussian model does not read it.
        distribution: ``'gaussian'`` for per-class normal densities; any
            other value selects the discrete value-frequency model.
        class_priors: Maps each class label to its relative frequency.
        feature_stats: Per-class model parameters. For the Gaussian model a
            dict with ``'mean'`` and ``'std'`` arrays; for the discrete model
            a dict mapping feature index to ``{value: probability}``.
    """

    def __init__(self, smoothing=1.0, distribution='gaussian'):
        self.smoothing = smoothing
        self.distribution = distribution
        self.class_priors = {}
        self.feature_stats = {}

    def fit(self, X, y):
        """Estimate class priors and per-class feature distributions."""
        X = np.asarray(X)
        y = np.asarray(y)

        self.classes = np.unique(y)
        # Start from a clean slate so a refit never keeps stale classes.
        self.class_priors = {}
        self.feature_stats = {}
        n_samples = len(y)

        gaussian = self.distribution == 'gaussian'
        if not gaussian:
            # The distinct values of each feature over the whole training
            # set are the same for every class, so compute them once.
            observed_values = [np.unique(X[:, feature]) for feature in range(X.shape[1])]

        for cls in self.classes:
            class_mask = (y == cls)
            self.class_priors[cls] = np.sum(class_mask) / n_samples

            X_class = X[class_mask]

            if gaussian:
                self.feature_stats[cls] = {
                    'mean': np.mean(X_class, axis=0),
                    # The epsilon keeps constant features from having zero spread.
                    'std': np.std(X_class, axis=0) + 1e-8,
                }
                continue

            self.feature_stats[cls] = {}
            total_count = X_class.shape[0]
            for feature, values in enumerate(observed_values):
                value_counts = Counter(X_class[:, feature])
                # Additive smoothing over all values observed for the feature.
                denominator = total_count + self.smoothing * len(values)
                self.feature_stats[cls][feature] = {
                    value: (value_counts.get(value, 0) + self.smoothing) / denominator
                    for value in values
                }

    def _log_likelihood(self, x, cls):
        """Return log P(x | cls) under the naive independence assumption."""
        if self.distribution == 'gaussian':
            mean = self.feature_stats[cls]['mean']
            std = self.feature_stats[cls]['std']
            z = (x - mean) / std
            return float(np.sum(-0.5 * np.log(2 * np.pi) - np.log(std) - 0.5 * z ** 2))

        total = 0.0
        for feature, value in enumerate(x):
            table = self.feature_stats[cls][feature]
            probability = table.get(value)
            if probability is None:
                # Value never seen in training: fall back to a small
                # smoothing-based probability.
                probability = self.smoothing / (sum(table.values()) * len(table) + self.smoothing)
            total += np.log(probability) if probability > 0 else -np.inf
        return total

    def calculate_likelihood(self, x, cls):
        """Return P(x | cls), the product of the per-feature likelihoods.

        The value is computed in log space and exponentiated, so it may
        underflow to 0.0 for very unlikely samples; ``predict`` therefore
        compares log scores instead of calling this method.
        """
        return np.exp(self._log_likelihood(x, cls))

    def predict(self, X):
        """Return the most probable class for every row of ``X``.

        Scores are compared as ``log prior + log likelihood``. Multiplying
        raw densities underflows to zero for every class once a sample is far
        from all class means or has many features, which would make the first
        class win by accident.
        """
        predictions = []

        for x in np.asarray(X):
            class_scores = {
                cls: np.log(self.class_priors[cls]) + self._log_likelihood(x, cls)
                for cls in self.classes
            }
            predictions.append(max(class_scores, key=class_scores.get))

        return np.array(predictions)

# FUNKCJE POMOCNICZE
def calculate_accuracy(y_true, y_pred):
    """Return the fraction of positions where prediction and truth agree.

    Both arguments are converted to NumPy arrays first, so plain lists are
    compared element by element rather than as two whole objects.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

def test_knn_parameters(X_train, X_test, y_train, y_test):
    """Evaluate KNN over a sweep of k, distance metrics and weighting modes.

    Each configuration is fitted on the training split and scored on the
    test split; one line per configuration is printed.

    Returns:
        Tuple ``(k_results, distance_results, weight_results)`` of accuracy
        lists, in the order the configurations were tried.
    """
    def evaluate(model):
        # Fit on the training split and score on the held-out split.
        model.fit(X_train, y_train)
        return calculate_accuracy(y_test, model.predict(X_test))

    print("=== ANALIZA PARAMETRÓW KNN ===")

    # Sweep over the number of neighbours.
    k_results = []
    for k in (3, 5, 7, 9):
        accuracy = evaluate(KNN(k=k))
        k_results.append(accuracy)
        print(f"k={k}: Dokładność = {accuracy:.4f}")

    # Sweep over distance metrics at a fixed k.
    print("\nTest metryk odległości:")
    distance_results = []
    for metric in ('euclidean', 'manhattan', 'cosine'):
        accuracy = evaluate(KNN(k=5, distance_metric=metric))
        distance_results.append(accuracy)
        print(f"{metric}: Dokładność = {accuracy:.4f}")

    # Sweep over voting schemes at a fixed k.
    print("\nTest typów wag:")
    weight_results = []
    for weight in ('uniform', 'distance'):
        accuracy = evaluate(KNN(k=5, weight_type=weight))
        weight_results.append(accuracy)
        print(f"{weight}: Dokładność = {accuracy:.4f}")

    return k_results, distance_results, weight_results

def test_tree_parameters(X_train, X_test, y_train, y_test):
    """Evaluate the decision tree over depth, leaf-size and criterion sweeps.

    Each configuration is fitted on the training split and scored on the
    test split; one line per configuration is printed.

    Returns:
        Tuple ``(depth_results, samples_results, criteria_results)`` of
        accuracy lists, in the order the configurations were tried.
    """
    def evaluate(model):
        # Fit on the training split and score on the held-out split.
        model.fit(X_train, y_train)
        return calculate_accuracy(y_test, model.predict(X_test))

    print("\n=== ANALIZA PARAMETRÓW DRZEWA DECYZYJNEGO ===")

    # Sweep over the depth limit.
    depth_results = []
    for depth in (3, 5, 7, 10):
        accuracy = evaluate(DecisionTree(max_depth=depth))
        depth_results.append(accuracy)
        print(f"max_depth={depth}: Dokładność = {accuracy:.4f}")

    # Sweep over the minimum leaf size at a fixed depth.
    print("\nTest min_samples_leaf:")
    samples_results = []
    for min_sample in (1, 3, 5, 10):
        accuracy = evaluate(DecisionTree(max_depth=5, min_samples_leaf=min_sample))
        samples_results.append(accuracy)
        print(f"min_samples_leaf={min_sample}: Dokładność = {accuracy:.4f}")

    # Sweep over the impurity criterion at a fixed depth.
    print("\nTest kryteriów podziału:")
    criteria_results = []
    for criterion in ('gini', 'entropy'):
        accuracy = evaluate(DecisionTree(max_depth=5, criterion=criterion))
        criteria_results.append(accuracy)
        print(f"{criterion}: Dokładność = {accuracy:.4f}")

    return depth_results, samples_results, criteria_results

def test_nb_parameters(X_train, X_test, y_train, y_test):
    """Evaluate Naive Bayes over smoothing values and distribution types.

    Each configuration is fitted on the training split and scored on the
    test split; one line per configuration is printed.

    Returns:
        Tuple ``(smoothing_results, distribution_results)`` of accuracy
        lists, in the order the configurations were tried.
    """
    print("\n=== ANALIZA PARAMETRÓW NAIWNEGO BAYESA ===")

    # Sweep over the smoothing constant. The Gaussian model never reads
    # `smoothing`, so the sweep has to run on the discrete model; with the
    # default distribution all four runs would be identical.
    smoothing_values = [0.1, 0.5, 1.0, 2.0]
    smoothing_results = []

    for smoothing in smoothing_values:
        nb = NaiveBayes(smoothing=smoothing, distribution='multinomial')
        nb.fit(X_train, y_train)
        y_pred = nb.predict(X_test)
        accuracy = calculate_accuracy(y_test, y_pred)
        smoothing_results.append(accuracy)
        print(f"smoothing={smoothing}: Dokładność = {accuracy:.4f}")

    # Compare the two feature models at the default smoothing.
    distributions = ['gaussian', 'multinomial']
    distribution_results = []

    print("\nTest rozkładów:")
    for dist in distributions:
        nb = NaiveBayes(distribution=dist)
        nb.fit(X_train, y_train)
        y_pred = nb.predict(X_test)
        accuracy = calculate_accuracy(y_test, y_pred)
        distribution_results.append(accuracy)
        print(f"{dist}: Dokładność = {accuracy:.4f}")

    return smoothing_results, distribution_results

def main():
    """Run the whole experiment: load data, split it, sweep all three models.

    Prints the dataset summary, the per-configuration accuracies produced by
    the three parameter-sweep helpers, and a fixed closing summary. Returns
    early with a message when the target column 'Stay' is absent.
    """
    separator = "=" * 50

    print("PROJEKT UCZENIA MASZYNOWEGO")
    print(separator)

    # Load and report the prepared dataset.
    data = load_and_prepare_data()
    print(f"Wczytano dane: {data.shape}")
    print(f"Kolumny: {list(data.columns)}")

    # Without the target column there is nothing to train on.
    if 'Stay' not in data.columns:
        print("Nie znaleziono kolumny docelowej 'Stay'")
        return

    X = data.drop('Stay', axis=1).values
    y = data['Stay'].values

    # 70/30 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    print(f"Zbiór treningowy: {X_train.shape}")
    print(f"Zbiór testowy: {X_test.shape}")
    print(f"Klasy: {np.unique(y)}")

    # Parameter sweeps, one helper per model; each prints its own results.
    test_knn_parameters(X_train, X_test, y_train, y_test)
    test_tree_parameters(X_train, X_test, y_train, y_test)
    test_nb_parameters(X_train, X_test, y_train, y_test)

    # Closing summary, printed one line at a time.
    print("\n" + separator)
    print("PODSUMOWANIE I WNIOSKI:")
    print(separator)

    summary_lines = (
        "\n1. K-NAJBLIŻSZYCH SĄSIADÓW (KNN):",
        "   - Metoda oparta na podobieństwie między próbkami",
        "   - Parametr k wpływa na kompromis między overfitting a underfitting",
        "   - Różne metryki odległości mogą dawać różne wyniki",
        "\n2. DRZEWO DECYZYJNE:",
        "   - Metoda tworząca reguły decyzyjne w formie drzewa",
        "   - Głębokość drzewa kontroluje złożoność modelu",
        "   - min_samples_leaf zapobiega przeuczeniu",
        "\n3. NAIWNY KLASYFIKATOR BAYESA:",
        "   - Metoda probabilistyczna oparta na twierdzeniu Bayesa",
        "   - Wygładzanie pomaga w przypadku rzadkich wartości",
        "   - Zakłada niezależność cech (naiwne założenie)",
        "\nOGÓLNE WNIOSKI:",
        "- Wybór parametrów znacząco wpływa na skuteczność modeli",
        "- Każda metoda ma swoje zalety i wady",
        "- Ważne jest testowanie różnych konfiguracji parametrów",
        "- Wyniki mogą się różnić w zależności od charakterystyki danych",
    )
    for line in summary_lines:
        print(line)

# Run the experiments only when the file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()