Датасеты:

https://www.kaggle.com/competitions/titanic

https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/rules

Бейзлайн: обучаем модели KNN из sklearn

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Numeric predictors used for the Titanic baseline.
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Load the training CSV, keep only the modelling columns plus the target, and
# drop every row with a missing value in any of them (nothing is imputed).
train_data_titanic = pd.read_csv('titanic/train.csv')[features + ['Survived']].dropna()

X_titanic, y_titanic = train_data_titanic[features], train_data_titanic['Survived']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
(X_train_titanic, X_test_titanic,
 y_train_titanic, y_test_titanic) = train_test_split(
    X_titanic, y_titanic, test_size=0.2, random_state=42
)

# Standardize features: statistics come from the training part only and are
# then applied unchanged to the held-out part.
scaler = StandardScaler().fit(X_train_titanic)
X_train_titanic = scaler.transform(X_train_titanic)
X_test_titanic = scaler.transform(X_test_titanic)

# Baseline model: 5-nearest-neighbours classifier.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_titanic, y_train_titanic)

# Hard predictions for the label-based metrics, positive-class probabilities
# (column 1 of predict_proba) for ROC AUC.
y_pred_titanic = knn_classifier.predict(X_test_titanic)
y_pred_proba_titanic = knn_classifier.predict_proba(X_test_titanic)[:, 1]

print("Titanic Dataset - Classification Metrics:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test_titanic, y_pred_titanic))
print("ROC AUC Score:", roc_auc_score(y_test_titanic, y_pred_proba_titanic))

Titanic Dataset - Classification Metrics:
Accuracy: 0.6293706293706294
Precision: 0.5333333333333333
Recall: 0.42857142857142855
F1 Score: 0.4752475247524753
ROC AUC Score: 0.6476806239737274


In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Numeric predictors used for the House Prices baseline.
features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF']

# Load the training CSV, keep only the modelling columns plus the target, and
# drop every row with a missing value in any of them (nothing is imputed).
train_data_house = pd.read_csv(
    'house-prices-advanced-regression-techniques/train.csv'
)[features + ['SalePrice']].dropna()

X_house, y_house = train_data_house[features], train_data_house['SalePrice']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
(X_train_house, X_test_house,
 y_train_house, y_test_house) = train_test_split(
    X_house, y_house, test_size=0.2, random_state=42
)

# Standardize features: statistics come from the training part only and are
# then applied unchanged to the held-out part.
scaler = StandardScaler().fit(X_train_house)
X_train_house = scaler.transform(X_train_house)
X_test_house = scaler.transform(X_test_house)

# Baseline model: 5-nearest-neighbours regressor.
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train_house, y_train_house)

# Evaluate on the held-out rows; RMSE is the square root of the same MSE value.
y_pred_house = knn_regressor.predict(X_test_house)
mse_house = mean_squared_error(y_test_house, y_pred_house)

print("\nHouse Prices Dataset - Regression Metrics:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", mse_house)
print("Root Mean Squared Error (RMSE):", np.sqrt(mse_house))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))


House Prices Dataset - Regression Metrics:
Mean Absolute Error (MAE): 21665.454794520545
Mean Squared Error (MSE): 1291465266.301644
Root Mean Squared Error (RMSE): 35936.96239669742
R-squared (R²): 0.8316283778007334


Улучшение бейзлайна

In [49]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search for the KNN classifier: neighbour count 1..19 and the
# vote weighting scheme, 5-fold cross-validation scored by F1.
# NOTE(review): X_train_titanic / y_train_titanic must already be preprocessed.
# If the features were standardized before this search, the scaler statistics
# include every validation fold; a Pipeline(scaler, KNN) would avoid that.
param_grid_classifier = dict(n_neighbors=range(1, 20), weights=['uniform', 'distance'])
grid_classifier = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_classifier,
    scoring='f1',
    cv=5,
)
grid_classifier.fit(X_train_titanic, y_train_titanic)

# Best configuration, refitted on the whole training part by GridSearchCV.
best_knn_classifier = grid_classifier.best_estimator_

# Evaluate the tuned classifier on the held-out rows.
y_pred_titanic = best_knn_classifier.predict(X_test_titanic)
y_pred_proba_titanic = best_knn_classifier.predict_proba(X_test_titanic)[:, 1]

print("Improved Titanic Dataset - Classification Metrics:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test_titanic, y_pred_titanic))
print("ROC AUC Score:", roc_auc_score(y_test_titanic, y_pred_proba_titanic))

# Same search for the KNN regressor, scored by negated MSE (higher is better).
# NOTE(review): X_train_house / y_train_house must already be preprocessed; the
# same caveat about scaling before cross-validation applies.
param_grid_regressor = dict(n_neighbors=range(1, 20), weights=['uniform', 'distance'])
grid_regressor = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid_regressor,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_regressor.fit(X_train_house, y_train_house)

# Best configuration, refitted on the whole training part by GridSearchCV.
best_knn_regressor = grid_regressor.best_estimator_

# Evaluate the tuned regressor on the held-out rows.
y_pred_house = best_knn_regressor.predict(X_test_house)
mse_house = mean_squared_error(y_test_house, y_pred_house)

print("\nImproved House Prices Dataset - Regression Metrics:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", mse_house)
print("Root Mean Squared Error (RMSE):", np.sqrt(mse_house))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))

Improved Titanic Dataset - Classification Metrics:
Accuracy: 0.6643356643356644
Precision: 0.6052631578947368
Recall: 0.4107142857142857
F1 Score: 0.48936170212765956
ROC AUC Score: 0.6841133004926108

Improved House Prices Dataset - Regression Metrics:
Mean Absolute Error (MAE): 20815.701633934554
Mean Squared Error (MSE): 1126672963.2892282
Root Mean Squared Error (RMSE): 33565.94946205497
R-squared (R²): 0.8531127708449308


Собственные имплементации

In [53]:
import numpy as np
from collections import Counter

class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier with Euclidean distance.

    Each prediction computes the distance from the query point to every stored
    training point and returns the most frequent label among the
    ``n_neighbors`` closest ones. When several labels share the top count, the
    one met first while walking the neighbours from nearest to farthest wins
    (``Counter.most_common`` keeps first-seen order for equal counts).

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbours that vote. If it exceeds the number of training
        points, all training points vote.
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. Anything ``np.asarray`` can turn into a numeric
            array is accepted (ndarray, nested lists, DataFrame).
        y : array-like of shape (n_samples,)
            Training labels.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        # Coerce to float so that nested lists work and unsigned-integer input
        # cannot wrap around when points are subtracted in _predict.
        X = np.asarray(X, dtype=float)
        # Plain ndarray gives positional indexing regardless of a pandas index.
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        X = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Predict the label of a single sample ``x``."""
        # Euclidean distance from x to every training point.
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # Positions of the closest training points, nearest first.
        k_indices = np.argsort(distances)[:self.n_neighbors]
        # Majority vote among the neighbours' labels.
        k_nearest_labels = self.y_train[k_indices]
        return Counter(k_nearest_labels).most_common(1)[0][0]

# Regression counterpart of the classifier above.
class KNNRegressor:
    """Brute-force k-nearest-neighbours regressor with Euclidean distance.

    Each prediction is the unweighted mean of the targets of the
    ``n_neighbors`` training points closest to the query point.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbours averaged. If it exceeds the number of training
        points, all training points are averaged.
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. Anything ``np.asarray`` can turn into a numeric
            array is accepted (ndarray, nested lists, DataFrame).
        y : array-like of shape (n_samples,)
            Training targets.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        # Coerce to float so that nested lists work and unsigned-integer input
        # cannot wrap around when points are subtracted in _predict.
        X = np.asarray(X, dtype=float)
        # Plain ndarray gives positional indexing regardless of a pandas index.
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with the predicted target for every row of ``X``."""
        X = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Predict the target of a single sample ``x``."""
        # Euclidean distance from x to every training point.
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # Positions of the closest training points, nearest first.
        k_indices = np.argsort(distances)[:self.n_neighbors]
        # Plain average of the neighbours' targets.
        return np.mean(self.y_train[k_indices])

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Fit the hand-written 5-NN classifier on the Titanic training split prepared
# earlier in the notebook and predict labels for the held-out rows.
classifier = KNNClassifier(n_neighbors=5)
classifier.fit(X_train_titanic, y_train_titanic)
y_pred_titanic = classifier.predict(X_test_titanic)

# Only label-based metrics are reported here.
print("Custom KNNClassifier Titanic Dataset - Classification Metrics:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test_titanic, y_pred_titanic))

Custom KNNClassifier Titanic Dataset - Classification Metrics:
Accuracy: 0.6223776223776224
Precision: 0.5227272727272727
Recall: 0.4107142857142857
F1 Score: 0.46


In [56]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit the hand-written 5-NN regressor on the House Prices training split
# prepared earlier in the notebook and predict prices for the held-out rows.
regressor = KNNRegressor(n_neighbors=5)
regressor.fit(X_train_house, y_train_house)
y_pred_house = regressor.predict(X_test_house)

# RMSE is the square root of the same MSE value, so compute MSE once.
mse_house = mean_squared_error(y_test_house, y_pred_house)

print("\nCustom KNNRegressor House Prices Dataset - Regression Metrics:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", mse_house)
print("Root Mean Squared Error (RMSE):", np.sqrt(mse_house))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))


Custom KNNRegressor House Prices Dataset - Regression Metrics:
Mean Absolute Error (MAE): 21674.221917808216
Mean Squared Error (MSE): 1291750751.233151
Root Mean Squared Error (RMSE): 35940.93420089621
R-squared (R²): 0.831591158401741


Улучшения: взвешивание соседей по расстоянию

In [57]:
# KNN classifier extended with optional distance weighting.
class ImprovedKNNClassifier:
    """Brute-force k-nearest-neighbours classifier with optional distance weighting.

    With ``weighted=False`` the prediction is a plain majority vote among the
    ``n_neighbors`` closest training points. With ``weighted=True`` every
    neighbour votes with weight ``1 / (distance + _EPS)`` and the label with
    the largest total weight wins. In both modes, tied labels resolve to the
    one met first while walking the neighbours from nearest to farthest.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbours that vote. If it exceeds the number of training
        points, all training points vote.
    weighted : bool, default False
        Whether closer neighbours get a larger vote.
    """

    # Added to each distance so an exact match (distance 0) does not divide by zero.
    _EPS = 1e-5

    def __init__(self, n_neighbors=5, weighted=False):
        self.n_neighbors = n_neighbors
        self.weighted = weighted

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. Anything ``np.asarray`` can turn into a numeric
            array is accepted (ndarray, nested lists, DataFrame).
        y : array-like of shape (n_samples,)
            Training labels.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        # Coerce to float so that nested lists work and unsigned-integer input
        # cannot wrap around when points are subtracted in _predict.
        X = np.asarray(X, dtype=float)
        # Plain ndarray gives positional indexing regardless of a pandas index.
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        X = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Predict the label of a single sample ``x``."""
        # Euclidean distance from x to every training point.
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # Positions of the closest training points, nearest first.
        k_indices = np.argsort(distances)[:self.n_neighbors]
        k_nearest_labels = self.y_train[k_indices]

        if not self.weighted:
            # Plain majority vote.
            return Counter(k_nearest_labels).most_common(1)[0][0]

        # Inverse-distance weights; accumulate the total weight per label in
        # nearest-first order so ties keep the nearest label.
        weights = 1 / (distances[k_indices] + self._EPS)
        weighted_votes = {}
        for label, weight in zip(k_nearest_labels, weights):
            weighted_votes[label] = weighted_votes.get(label, 0.0) + weight
        return max(weighted_votes, key=weighted_votes.get)

# KNN regressor extended with optional distance weighting.
class ImprovedKNNRegressor:
    """Brute-force k-nearest-neighbours regressor with optional distance weighting.

    With ``weighted=False`` the prediction is the plain mean of the targets of
    the ``n_neighbors`` closest training points. With ``weighted=True`` it is
    their average weighted by ``1 / (distance + _EPS)``.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbours averaged. If it exceeds the number of training
        points, all training points are averaged.
    weighted : bool, default False
        Whether closer neighbours contribute more to the average.
    """

    # Added to each distance so an exact match (distance 0) does not divide by zero.
    _EPS = 1e-5

    def __init__(self, n_neighbors=5, weighted=False):
        self.n_neighbors = n_neighbors
        self.weighted = weighted

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. Anything ``np.asarray`` can turn into a numeric
            array is accepted (ndarray, nested lists, DataFrame).
        y : array-like of shape (n_samples,)
            Training targets.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        # Coerce to float so that nested lists work and unsigned-integer input
        # cannot wrap around when points are subtracted in _predict.
        X = np.asarray(X, dtype=float)
        # Plain ndarray gives positional indexing regardless of a pandas index.
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with the predicted target for every row of ``X``."""
        X = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Predict the target of a single sample ``x``."""
        # Euclidean distance from x to every training point.
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # Positions of the closest training points, nearest first.
        k_indices = np.argsort(distances)[:self.n_neighbors]
        k_nearest_targets = self.y_train[k_indices]

        if not self.weighted:
            return np.mean(k_nearest_targets)

        # Inverse-distance weighted average of the neighbours' targets.
        weights = 1 / (distances[k_indices] + self._EPS)
        return np.dot(weights, k_nearest_targets) / np.sum(weights)

In [59]:
# Distance-weighted hand-written 5-NN classifier on the Titanic split prepared
# earlier in the notebook.
classifier = ImprovedKNNClassifier(n_neighbors=5, weighted=True)
classifier.fit(X_train_titanic, y_train_titanic)
y_pred_titanic = classifier.predict(X_test_titanic)

print("Improved Custom KNNClassifier with Weights - Classification Metrics:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic))

# Distance-weighted hand-written 5-NN regressor on the House Prices split
# prepared earlier in the notebook.
regressor = ImprovedKNNRegressor(n_neighbors=5, weighted=True)
regressor.fit(X_train_house, y_train_house)
y_pred_house = regressor.predict(X_test_house)

print("\nImproved Custom KNNRegressor with Weights - Regression Metrics:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))
# The stray trailing `...` placeholder was removed: as the cell's last
# expression it only made the cell display a meaningless `Ellipsis` result.

Improved Custom KNNClassifier with Weights - Classification Metrics:
Accuracy: 0.6363636363636364
Precision: 0.5476190476190477
Recall: 0.4107142857142857
F1 Score: 0.46938775510204084

Improved Custom KNNRegressor with Weights - Regression Metrics:
Mean Absolute Error (MAE): 20982.29819728557
Mean Squared Error (MSE): 1089580748.176257
Root Mean Squared Error (RMSE): 33008.79804198052
R-squared (R²): 0.8579485775774027


Ellipsis