In [1]:
# Необходимые библиотеки
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression

# Load and preprocess the Titanic data for classification
def load_and_preprocess_titanic_data(filepath):
    """Read the Titanic CSV and build the feature matrix and target.

    Missing ``Age`` and ``Fare`` values are replaced by the column median
    (computed over all rows of the file), rows without ``Embarked`` are
    dropped, and ``Sex`` / ``Embarked`` are one-hot encoded.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and the ``Survived``
        column.

    Raises:
        KeyError: If a selected column is missing, e.g. when a category such
            as ``Embarked_Q`` never occurs in the file.
    """
    data = pd.read_csv(filepath)

    # Assign the filled column back instead of calling
    # ``data[col].fillna(..., inplace=True)``: the in-place call on a column
    # selection is chained assignment, which pandas deprecates and which has
    # no effect on ``data`` once Copy-on-Write is enabled.
    for column in ('Age', 'Fare'):
        data[column] = data[column].fillna(data[column].median())

    data = data.dropna(subset=['Embarked'])
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])

    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return X, y

# Load and preprocess the House Prices data for regression
def load_and_preprocess_house_data(filepath):
    """Read the House Prices CSV and build the feature matrix and target.

    ``LotFrontage`` gaps are filled with the column median, the ``Alley``,
    ``PoolQC``, ``Fence`` and ``MiscFeature`` columns are dropped, and the
    remaining frame is one-hot encoded before the feature columns are picked.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(X, y)``: the six selected feature columns and ``SalePrice``.

    Raises:
        KeyError: If a dropped or selected column is absent from the file.
    """
    data = pd.read_csv(filepath)

    # Assign back rather than ``data['LotFrontage'].fillna(..., inplace=True)``;
    # the chained in-place form is deprecated and silently does nothing under
    # pandas Copy-on-Write.
    data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].median())
    data = data.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])
    data = pd.get_dummies(data)

    # NOTE(review): none of the selected features is LotFrontage or a dummy
    # column, so the imputation/encoding above does not reach X as written.
    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF']
    X = data[features]
    y = data['SalePrice']
    return X, y

# Prepare the Titanic data: load, then hold out 20% of the rows for testing
X_titanic, y_titanic = load_and_preprocess_titanic_data('titanic/train.csv')
(
    X_train_titanic,
    X_test_titanic,
    y_train_titanic,
    y_test_titanic,
) = train_test_split(X_titanic, y_titanic, test_size=0.2, random_state=42)

# Standardize the features; the scaler statistics come from the training split only
scaler = StandardScaler()
X_train_titanic = scaler.fit_transform(X_train_titanic)
X_test_titanic = scaler.transform(X_test_titanic)

# Train the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_titanic, y_train_titanic)

# Evaluate the model: label-based metrics first, then ROC AUC from probabilities
y_pred_titanic = logistic_model.predict(X_test_titanic)
print("Logistic Regression Titanic Dataset - Classification Metrics:")
for _label, _score_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(_label, _score_fn(y_test_titanic, y_pred_titanic))
# Column 1 of predict_proba holds the probability of the positive class
y_pred_proba_titanic = logistic_model.predict_proba(X_test_titanic)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test_titanic, y_pred_proba_titanic))

# Prepare the House Prices data: load, then hold out 20% of the rows for testing
X_house, y_house = load_and_preprocess_house_data('house-prices-advanced-regression-techniques/train.csv')
X_train_house, X_test_house, y_train_house, y_test_house = train_test_split(X_house, y_house, test_size=0.2, random_state=42)

# Standardize with a dedicated scaler. Re-fitting a scaler that was already
# fitted on another dataset would overwrite its statistics and make any later
# transform of that other dataset silently wrong.
house_scaler = StandardScaler()
X_train_house = house_scaler.fit_transform(X_train_house)
X_test_house = house_scaler.transform(X_test_house)

# Train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train_house, y_train_house)

# Evaluate the model (MSE is computed once and reused for RMSE)
y_pred_house = linear_model.predict(X_test_house)
house_mse = mean_squared_error(y_test_house, y_pred_house)
print("\nLinear Regression House Prices Dataset - Regression Metrics:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", house_mse)
print("Root Mean Squared Error (RMSE):", np.sqrt(house_mse))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))


Logistic Regression Titanic Dataset - Classification Metrics:
Accuracy: 0.7696629213483146
Precision: 0.6794871794871795
Recall: 0.7681159420289855
F1 Score: 0.7210884353741496
ROC AUC Score: 0.8515489961441297

Linear Regression House Prices Dataset - Regression Metrics:
Mean Absolute Error (MAE): 25289.042839905655
Mean Squared Error (MSE): 1591935019.951264
Root Mean Squared Error (RMSE): 39899.06038932827
R-squared (R²): 0.7924553693088549


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score)

# Hypotheses and improvements for Titanic
def preprocess_titanic_data_v2(filepath):
    """Load the Titanic CSV and return an unfitted preprocessor plus X and y.

    Steps performed on the raw frame:
      * ``Age`` gaps are filled with the median age of the passenger's ``Pclass``;
      * ``Cabin`` gaps become ``'U'`` and every cabin is reduced to its first
        character;
      * ``FamilySize`` is added as ``SibSp + Parch``.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(preprocessor, X, y)`` where ``preprocessor`` is a
        ``ColumnTransformer`` that standardizes the numerical columns and
        one-hot encodes the categorical ones, ``X`` is the frame without
        ``Survived``, ``Name`` and ``Ticket``, and ``y`` is ``Survived``.
    """
    data = pd.read_csv(filepath)

    # Missing values. Results are assigned back: ``data[col].fillna(...,
    # inplace=True)`` is chained assignment, deprecated by pandas and a no-op
    # under Copy-on-Write.
    # NOTE(review): the per-class medians are computed on the whole file,
    # i.e. before the caller splits off a test set.
    age_by_class = data.groupby('Pclass')['Age'].transform('median')
    data['Age'] = data['Age'].fillna(age_by_class)
    data['Cabin'] = data['Cabin'].fillna('U').str[0]

    # New feature
    data['FamilySize'] = data['SibSp'] + data['Parch']

    # Columns routed to each transformer
    # NOTE(review): 'Embarked' is not imputed in this function; confirm the
    # encoder copes with missing values in that column.
    categorical_features = ['Sex', 'Embarked', 'Cabin']
    numerical_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']

    # Preprocessing
    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)])

    X = data.drop(columns=['Survived', 'Name', 'Ticket'])
    y = data['Survived']

    return preprocessor, X, y

# Hypotheses and improvements for House Prices
def preprocess_house_data_v2(filepath):
    """Load the House Prices CSV and return an unfitted scaler plus X and y.

    Adds ``TotalSF`` (basement + first floor + second floor area), fills
    ``LotFrontage`` gaps with the column median and selects seven features.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(scaler, X, y)``: a fresh ``StandardScaler``, the selected
        feature columns and the ``SalePrice`` column.
    """
    data = pd.read_csv(filepath)

    # Feature interaction
    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

    # Imputation of missing values. Assigned back because the chained
    # ``data[col].fillna(..., inplace=True)`` form is deprecated and does not
    # modify ``data`` under pandas Copy-on-Write.
    # NOTE(review): LotFrontage is not among the selected features below.
    data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].median())

    # Feature selection
    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF', 'TotalSF', 'YearBuilt']
    X = data[features]
    y = data['SalePrice']

    return StandardScaler(), X, y

# Train and evaluate logistic regression for Titanic
preprocessor, X_titanic, y_titanic = preprocess_titanic_data_v2('titanic/train.csv')
(
    X_train_titanic,
    X_test_titanic,
    y_train_titanic,
    y_test_titanic,
) = train_test_split(X_titanic, y_titanic, test_size=0.2, random_state=42)

# Preprocessing lives inside the pipeline so it is re-fitted on every CV fold
pipeline_lr_titanic = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])
param_grid = {'classifier__C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(pipeline_lr_titanic, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train_titanic, y_train_titanic)

# Best parameters and hold-out scores
print("Best parameters for Titanic:", grid_search.best_params_)
y_pred_titanic = grid_search.predict(X_test_titanic)
# Column 1 of predict_proba holds the probability of the positive class
y_pred_proba_titanic = grid_search.predict_proba(X_test_titanic)[:, 1]

print("Improved Logistic Regression Titanic Dataset - Classification Metrics:")
for _label, _score_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(_label, _score_fn(y_test_titanic, y_pred_titanic))
print("ROC AUC Score:", roc_auc_score(y_test_titanic, y_pred_proba_titanic))

# Train and evaluate linear regression for House Prices
scaler, X_house, y_house = preprocess_house_data_v2('house-prices-advanced-regression-techniques/train.csv')
(
    X_train_house,
    X_test_house,
    y_train_house,
    y_test_house,
) = train_test_split(X_house, y_house, test_size=0.2, random_state=42)

# Scaling and regression are chained so the scaler is fitted on training rows only
pipeline_lr_house = Pipeline(steps=[
    ('scaler', scaler),
    ('regressor', LinearRegression()),
])
pipeline_lr_house.fit(X_train_house, y_train_house)

y_pred_house = pipeline_lr_house.predict(X_test_house)

print("\nImproved Linear Regression House Prices Dataset - Regression Metrics:")
for _label, _score_fn in (
    ("Mean Absolute Error (MAE):", mean_absolute_error),
    ("Mean Squared Error (MSE):", mean_squared_error),
    ("Root Mean Squared Error (RMSE):", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
    ("R-squared (R²):", r2_score),
):
    print(_label, _score_fn(y_test_house, y_pred_house))

Best parameters for Titanic: {'classifier__C': 0.1}
Improved Logistic Regression Titanic Dataset - Classification Metrics:
Accuracy: 0.8212290502793296
Precision: 0.8
Recall: 0.7567567567567568
F1 Score: 0.7777777777777778
ROC AUC Score: 0.8850707850707851

Improved Linear Regression House Prices Dataset - Regression Metrics:
Mean Absolute Error (MAE): 25145.12877530173
Mean Squared Error (MSE): 1567149663.3392212
Root Mean Squared Error (RMSE): 39587.24116857881
R-squared (R²): 0.7956866994951537


In [13]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

class CustomLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Args:
        learning_rate: Step size applied to every gradient update.
        num_iterations: Number of gradient-descent passes over the data.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None  # per-feature coefficients, set by fit()
        self.bias = None     # intercept, set by fit()

    def sigmoid(self, z):
        """Return the logistic function of ``z`` as an ndarray, without overflow.

        ``1 / (1 + np.exp(-z))`` overflows in ``exp`` for large negative ``z``.
        Here ``exp`` is only evaluated on non-positive values, and the
        algebraically equivalent form ``e / (1 + e)`` is used for negative ``z``.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _positive_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        X = np.asarray(X, dtype=float)
        return self.sigmoid(np.dot(X, self.weights) + self.bias)

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from features ``X`` and 0/1 labels ``y``.

        Inputs are converted to float ndarrays so that pandas objects are
        handled positionally rather than through index alignment.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for _ in range(self.num_iterations):
            error = self.sigmoid(np.dot(X, self.weights) + self.bias) - y

            # Gradient of the mean log-loss w.r.t. the weights and the bias
            dw = (1 / num_samples) * np.dot(X.T, error)
            db = (1 / num_samples) * np.sum(error)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of ``[P(class 0), P(class 1)]``."""
        positive = self._positive_proba(X)
        return np.column_stack([1.0 - positive, positive])

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the class-1 probability exceeds ``threshold``."""
        return (self._positive_proba(X) > threshold).astype(int)


class CustomLinearRegression:
    """Linear regression fitted by full-batch gradient descent.

    Args:
        learning_rate: Step size applied to every gradient update.
        num_iterations: Number of gradient-descent passes over the data.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate, self.num_iterations = learning_rate, num_iterations
        # Both parameters stay unset until fit() is called
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from features ``X`` and targets ``y``."""
        n_rows, n_cols = X.shape
        self.weights, self.bias = np.zeros(n_cols), 0
        inv_n = 1 / n_rows

        for _step in range(self.num_iterations):
            # Signed error of the current fit on every row
            residual = (np.dot(X, self.weights) + self.bias) - y

            grad_w = inv_n * np.dot(X.T, residual)
            grad_b = inv_n * np.sum(residual)

            self.weights -= self.learning_rate * grad_w
            self.bias -= self.learning_rate * grad_b

    def predict(self, X):
        """Return the linear prediction ``X · weights + bias``."""
        projection = np.dot(X, self.weights)
        return projection + self.bias

In [15]:
# Titanic data preparation
def preprocess_titanic_data(filepath):
    """Read the Titanic CSV and build the feature matrix and target.

    Missing ``Age`` and ``Fare`` values are replaced by the column median
    (computed over all rows of the file), rows without ``Embarked`` are
    dropped, and ``Sex`` / ``Embarked`` are one-hot encoded.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and ``Survived``.

    Raises:
        KeyError: If a selected column is missing, e.g. when a category such
            as ``Embarked_Q`` never occurs in the file.
    """
    data = pd.read_csv(filepath)

    # Assign the filled column back: ``data[col].fillna(..., inplace=True)``
    # is chained assignment, deprecated by pandas and a no-op under
    # Copy-on-Write.
    for column in ('Age', 'Fare'):
        data[column] = data[column].fillna(data[column].median())

    data = data.dropna(subset=['Embarked'])
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])

    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return X, y

# House Prices data preparation
def preprocess_house_data(filepath):
    """Read the House Prices CSV and build the feature matrix and target.

    ``LotFrontage`` gaps are filled with the column median and the frame is
    one-hot encoded before five feature columns are selected.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and ``SalePrice``.

    Raises:
        KeyError: If a selected column is absent from the file.
    """
    data = pd.read_csv(filepath)

    # Assign back rather than ``data['LotFrontage'].fillna(..., inplace=True)``;
    # the chained in-place form is deprecated and does nothing under pandas
    # Copy-on-Write.
    data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].median())
    data = pd.get_dummies(data)

    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    X = data[features]
    y = data['SalePrice']
    return X, y

# Scale the data, train a model and report its metrics
def prepare_data_and_train_models(X, y, model_class, model_name):
    """Split, standardize, fit ``model_class()`` and print hold-out metrics.

    Args:
        X: Feature matrix.
        y: Target values.
        model_class: Zero-argument callable returning an object with
            ``fit(X, y)`` and ``predict(X)``.
        model_name: ``"Logistic Regression"`` prints classification metrics,
            ``"Linear Regression"`` prints regression metrics; any other value
            prints nothing.

    Returns:
        None. Results are written to stdout.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scaling statistics come from the training split only
    standardizer = StandardScaler()
    X_fit = standardizer.fit_transform(X_fit)
    X_eval = standardizer.transform(X_eval)

    estimator = model_class()
    estimator.fit(X_fit, y_fit)
    predictions = estimator.predict(X_eval)

    # Pick the header and the (label, scorer) pairs for the requested report
    if model_name == "Logistic Regression":
        header = f"{model_name} - Classification Metrics:"
        report = (
            ("Accuracy:", accuracy_score),
            ("Precision:", precision_score),
            ("Recall:", recall_score),
            ("F1 Score:", f1_score),
        )
    elif model_name == "Linear Regression":
        header = f"\n{model_name} - Regression Metrics:"
        report = (
            ("Mean Absolute Error (MAE):", mean_absolute_error),
            ("Mean Squared Error (MSE):", mean_squared_error),
            ("Root Mean Squared Error (RMSE):", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
            ("R-squared (R²):", r2_score),
        )
    else:
        return

    print(header)
    for label, scorer in report:
        print(label, scorer(y_eval, predictions))

# Titanic: custom logistic regression
X_titanic, y_titanic = preprocess_titanic_data('titanic/train.csv')
prepare_data_and_train_models(
    X_titanic,
    y_titanic,
    model_class=CustomLogisticRegression,
    model_name="Logistic Regression",
)

# House Prices: custom linear regression
X_house, y_house = preprocess_house_data('house-prices-advanced-regression-techniques/train.csv')
prepare_data_and_train_models(
    X_house,
    y_house,
    model_class=CustomLinearRegression,
    model_name="Linear Regression",
)

Logistic Regression - Classification Metrics:
Accuracy: 0.7808988764044944
Precision: 0.7027027027027027
Recall: 0.7536231884057971
F1 Score: 0.7272727272727273

Linear Regression - Regression Metrics:
Mean Absolute Error (MAE): 25432.43118285412
Mean Squared Error (MSE): 1596556311.3573067
Root Mean Squared Error (RMSE): 39956.93070491409
R-squared (R²): 0.7918528797561012


In [16]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

class ImprovedCustomLogisticRegression:
    """Binary logistic regression with an L2 penalty, trained by gradient descent.

    Args:
        learning_rate: Step size applied to every gradient update.
        num_iterations: Number of gradient-descent passes over the data.
        regularization_strength: Coefficient of the weight penalty. Its
            gradient term is divided by the number of samples, like the data
            term, so its effect shrinks as the training set grows.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000, regularization_strength=0.0):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.regularization_strength = regularization_strength
        self.weights = None  # per-feature coefficients, set by fit()
        self.bias = None     # intercept, set by fit()

    def sigmoid(self, z):
        """Return the logistic function of ``z`` as an ndarray, without overflow.

        ``1 / (1 + np.exp(-z))`` overflows in ``exp`` for large negative ``z``.
        Here ``exp`` is only evaluated on non-positive values, and the
        algebraically equivalent form ``e / (1 + e)`` is used for negative ``z``.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _positive_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        X = np.asarray(X, dtype=float)
        return self.sigmoid(np.dot(X, self.weights) + self.bias)

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from features ``X`` and 0/1 labels ``y``.

        Inputs are converted to float ndarrays so that pandas objects are
        handled positionally rather than through index alignment.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for _ in range(self.num_iterations):
            error = self.sigmoid(np.dot(X, self.weights) + self.bias) - y

            # Data gradient plus the penalty gradient; the bias is not penalized
            penalty = self.regularization_strength * self.weights
            dw = (1 / num_samples) * (np.dot(X.T, error) + penalty)
            db = (1 / num_samples) * np.sum(error)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of ``[P(class 0), P(class 1)]``."""
        positive = self._positive_proba(X)
        return np.column_stack([1.0 - positive, positive])

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the class-1 probability exceeds ``threshold``."""
        return (self._positive_proba(X) > threshold).astype(int)


class ImprovedCustomLinearRegression:
    """Linear regression with an L2 weight penalty, fitted by gradient descent.

    Args:
        learning_rate: Step size applied to every gradient update.
        num_iterations: Number of gradient-descent passes over the data.
        regularization_strength: Coefficient of the weight penalty added to
            the weight gradient (the bias is not penalized).
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000, regularization_strength=0.0):
        self.learning_rate, self.num_iterations = learning_rate, num_iterations
        self.regularization_strength = regularization_strength
        # Both parameters stay unset until fit() is called
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from features ``X`` and targets ``y``."""
        n_rows, n_cols = X.shape
        self.weights, self.bias = np.zeros(n_cols), 0
        inv_n = 1 / n_rows

        for _step in range(self.num_iterations):
            # Signed error of the current fit on every row
            residual = (np.dot(X, self.weights) + self.bias) - y

            shrinkage = self.regularization_strength * self.weights
            grad_w = inv_n * (np.dot(X.T, residual) + shrinkage)
            grad_b = inv_n * np.sum(residual)

            self.weights -= self.learning_rate * grad_w
            self.bias -= self.learning_rate * grad_b

    def predict(self, X):
        """Return the linear prediction ``X · weights + bias``."""
        projection = np.dot(X, self.weights)
        return projection + self.bias

In [18]:
# Titanic data preparation
def preprocess_titanic_data(filepath):
    """Read the Titanic CSV and build the feature matrix and target.

    Missing ``Age`` and ``Fare`` values are replaced by the column median
    (computed over all rows of the file), rows without ``Embarked`` are
    dropped, and ``Sex`` / ``Embarked`` are one-hot encoded.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and ``Survived``.

    Raises:
        KeyError: If a selected column is missing, e.g. when a category such
            as ``Embarked_Q`` never occurs in the file.
    """
    data = pd.read_csv(filepath)

    # Assign the filled column back: ``data[col].fillna(..., inplace=True)``
    # is chained assignment, deprecated by pandas and a no-op under
    # Copy-on-Write.
    for column in ('Age', 'Fare'):
        data[column] = data[column].fillna(data[column].median())

    data = data.dropna(subset=['Embarked'])
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])

    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return X, y

# House Prices data preparation
def preprocess_house_data(filepath):
    """Read the House Prices CSV and build the feature matrix and target.

    ``LotFrontage`` gaps are filled with the column median and the frame is
    one-hot encoded before five feature columns are selected.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and ``SalePrice``.

    Raises:
        KeyError: If a selected column is absent from the file.
    """
    data = pd.read_csv(filepath)

    # Assign back rather than ``data['LotFrontage'].fillna(..., inplace=True)``;
    # the chained in-place form is deprecated and does nothing under pandas
    # Copy-on-Write.
    data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].median())
    data = pd.get_dummies(data)

    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    X = data[features]
    y = data['SalePrice']
    return X, y

# Scale the data, train a regularized model and report its metrics
def prepare_data_and_train_models(X, y, model_class, model_name, regularization_strength=0.0):
    """Split, standardize, fit a regularized model and print hold-out metrics.

    Args:
        X: Feature matrix.
        y: Target values.
        model_class: Callable accepting a ``regularization_strength`` keyword
            and returning an object with ``fit(X, y)`` and ``predict(X)``.
        model_name: ``"Logistic Regression"`` prints classification metrics,
            ``"Linear Regression"`` prints regression metrics; any other value
            prints nothing.
        regularization_strength: Value forwarded to ``model_class``. Defaults
            to ``0.0`` so the function can also be called with the same four
            arguments as the unregularized variant.

    Returns:
        None. Results are written to stdout.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scaling statistics come from the training split only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = model_class(regularization_strength=regularization_strength)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if model_name == "Logistic Regression":
        print(f"{model_name} - Classification Metrics:")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Precision:", precision_score(y_test, y_pred))
        print("Recall:", recall_score(y_test, y_pred))
        print("F1 Score:", f1_score(y_test, y_pred))
    elif model_name == "Linear Regression":
        print(f"\n{model_name} - Regression Metrics:")
        print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
        # MSE is computed once and reused for RMSE
        mse = mean_squared_error(y_test, y_pred)
        print("Mean Squared Error (MSE):", mse)
        print("Root Mean Squared Error (RMSE):", np.sqrt(mse))
        print("R-squared (R²):", r2_score(y_test, y_pred))

# Titanic: regularized custom logistic regression
X_titanic, y_titanic = preprocess_titanic_data('titanic/train.csv')
prepare_data_and_train_models(
    X_titanic,
    y_titanic,
    ImprovedCustomLogisticRegression,
    "Logistic Regression",
    regularization_strength=0.1,
)

# House Prices: regularized custom linear regression
X_house, y_house = preprocess_house_data('house-prices-advanced-regression-techniques/train.csv')
prepare_data_and_train_models(
    X_house,
    y_house,
    ImprovedCustomLinearRegression,
    "Linear Regression",
    regularization_strength=0.1,
)

Logistic Regression - Classification Metrics:
Accuracy: 0.7808988764044944
Precision: 0.7027027027027027
Recall: 0.7536231884057971
F1 Score: 0.7272727272727273

Linear Regression - Regression Metrics:
Mean Absolute Error (MAE): 25432.0773137528
Mean Squared Error (MSE): 1596580038.4249744
Root Mean Squared Error (RMSE): 39957.227611847324
R-squared (R²): 0.7918497863977451
