In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data_tms.csv')


FileNotFoundError: [Errno 2] No such file or directory: 'data_tms.csv'

In [None]:
def process_data(df):
    data = df.copy()

    def get_title(name):
        match = re.search(r'(Mr|Mrs|Miss|Master|Dr|Rev|Col|Major|Mlle|Ms|Lady|Sir|Don|Mme|Countess|Jonkheer|Capt)', name)
        if match:
            return match.group(0)
        return "Other"

    data["Title"] = data["Name"].apply(get_title)

    median_ages = data.groupby("Title")["Age"].median()

    def fill_age(row):
        if pd.isnull(row["Age"]):
            title = row["Title"]
            if title in median_ages and not np.isnan(median_ages[title]):
                return median_ages[title]
            else:
                return data.groupby(["Sex", "Pclass"])["Age"].transform("median")[row.name]
        return row["Age"]

    data["Age"] = data.apply(fill_age, axis=1)

    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    le_sex = LabelEncoder()
    data['Sex_encoded'] = le_sex.fit_transform(data['Sex'])

    data['AgeGroup'] = pd.cut(data['Age'], bins=[0, 12, 19, 35, 60, 100],
                             labels=[0, 1, 2, 3, 4]).astype(int)

    return data

df_processed = process_data(df)

all_features = ['Pclass', 'Sex_encoded', 'Age', 'SibSp', 'Parch', 'FamilySize', 'IsAlone', 'AgeGroup']
available_features = [col for col in all_features if col in df_processed.columns]

In [None]:
X_full = df_processed[available_features]
y = df_processed['Survived']
X_train_full, X_test_full, y_train, y_test = train_test_split(X_full, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_full_scaled = scaler.transform(X_test_full)

In [None]:
class LogReg:
    def __init__(self, learning_rate=0.01, n_iterations=1000, tolerance=1e-6):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.tolerance = tolerance
        self.weights = None
        self.bias = None
        self.cost_history = []

    def _sigmoid(self, z):
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def cost(self, X, y):
        m = len(y)
        z = np.dot(X, self.weights) + self.bias
        h = self._sigmoid(z)

        e = 1e-15
        h = np.clip(h, e, 1 - e)

        cost = -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))
        return cost

    def fit(self, X, y):
        m, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        prev_cost = float('inf')
        for i in range(self.n_iterations):
            z = np.dot(X, self.weights) + self.bias
            h = self._sigmoid(z)

            dw = np.dot(X.T, (h - y)) / m
            db = np.mean(h - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            cost = self.cost(X, y)
            self.cost_history.append(cost)
            prev_cost = cost

    def predict_prob(self, X):
        z = np.dot(X, self.weights) + self.bias
        return self._sigmoid(z)

    def predict(self, X):
        return (self.predict_prob(X) >= 0.5).astype(int)


In [None]:
def fitting(train_score, test_score, limit=0.03, norm=0.7):
    diff = abs(train_score - test_score)
    if diff > limit:
        if train_score < norm and test_score < norm:
            fitting = "НЕДООБУЧЕНИЕ"
        else:
            fitting = "ПЕРЕОБУЧЕНИЕ"
    else:
        fitting = "НОРМАЛЬНОЕ ОБУЧЕНИЕ"

    return {
        'fitting': fitting,
        'train_test_diff': diff,
    }

In [None]:
print("МОДЕЛЬ 1:НЕДООБУЧЕНИЕ")
minimal_features = ['Pclass']
X_train_min = X_train_full[minimal_features]
X_test_min = X_test_full[minimal_features]

scaler_min = StandardScaler()
X_train_min_scaled = scaler_min.fit_transform(X_train_min)
X_test_min_scaled = scaler_min.transform(X_test_min)


model_underfit = LogReg(learning_rate=0.0001, n_iterations=20)
model_underfit.fit(X_train_min_scaled, y_train)

y_pred_underfit = model_underfit.predict(X_test_min_scaled)
test_accuracy_underfit = accuracy_score(y_test, y_pred_underfit)
y_train_pred_underfit = model_underfit.predict(X_train_min_scaled)
train_accuracy_underfit = accuracy_score(y_train, y_train_pred_underfit)

underfit_fitting = fitting(train_accuracy_underfit, test_accuracy_underfit)
print(f"Признаков: {len(minimal_features)}")
print("Параметры: learning_rate=0.0001, n_iterations=20")
print(f"Train Accuracy: {train_accuracy_underfit:.4f}")
print(f"Test Accuracy: {test_accuracy_underfit:.4f}")
print(f"Разница: {underfit_fitting['train_test_diff']:.4f}")
print(f"Корректно ли обучение: {underfit_fitting['fitting']}")

МОДЕЛЬ 1:НЕДООБУЧЕНИЕ
Признаков: 1
Параметры: learning_rate=0.0001, n_iterations=20
Train Accuracy: 0.6882
Test Accuracy: 0.6425
Разница: 0.0457
Корректно ли обучение: НЕДООБУЧЕНИЕ


In [None]:
print("МОДЕЛЬ 2:ПЕРЕОБУЧЕНИЕ")

X_train_over = X_train_full.copy()
X_test_over = X_test_full.copy()

numeric_features = ['Pclass', 'Sex_encoded', 'Age', 'SibSp', 'Parch', 'FamilySize', 'IsAlone', 'AgeGroup']

for i, col1 in enumerate(numeric_features):
    for col2 in numeric_features:
        if col1 != col2:
            X_train_over[f'{col1}_x_{col2}'] = X_train_over[col1] * X_train_over[col2]
            X_test_over[f'{col1}_x_{col2}'] = X_test_over[col1] * X_test_over[col2]

np.random.seed(42)
for i in range(100):
    X_train_over[f'noise_{i}'] = np.random.normal(0, 1, len(X_train_over))
    X_test_over[f'noise_{i}'] = np.random.normal(0, 1, len(X_test_over))

for i in range(50):
    X_train_over[f'noise_uniform_{i}'] = np.random.uniform(-1, 1, len(X_train_over))
    X_test_over[f'noise_uniform_{i}'] = np.random.uniform(-1, 1, len(X_test_over))

print(f"Исходных признаков: {len(numeric_features)}")
print(f"Полный набор признаков: {len(X_train_over.columns)}")

scaler_over = StandardScaler()
X_train_over_scaled = scaler_over.fit_transform(X_train_over)
X_test_over_scaled = scaler_over.transform(X_test_over)


model_overfit = LogReg(learning_rate=2.0, n_iterations=2000)
model_overfit.fit(X_train_over_scaled, y_train)

y_pred_overfit = model_overfit.predict(X_test_over_scaled)
test_accuracy_overfit = accuracy_score(y_test, y_pred_overfit)
y_train_pred_overfit = model_overfit.predict(X_train_over_scaled)
train_accuracy_overfit = accuracy_score(y_train, y_train_pred_overfit)

overfit_fitting = fitting(train_accuracy_overfit, test_accuracy_overfit)
print("Параметры: learning_rate=2.0, n_iterations=2000")
print(f"Train Accuracy: {train_accuracy_overfit:.4f}")
print(f"Test Accuracy: {test_accuracy_overfit:.4f}")
print(f"Разница: {overfit_fitting['train_test_diff']:.4f}")
print(f"Корректно ли обучение: {overfit_fitting['fitting']}")

МОДЕЛЬ 2:ПЕРЕОБУЧЕНИЕ
Исходных признаков: 8
Полный набор признаков: 214
Параметры: learning_rate=2.0, n_iterations=2000
Train Accuracy: 0.8624
Test Accuracy: 0.6927
Разница: 0.1696
Корректно ли обучение: ПЕРЕОБУЧЕНИЕ


In [None]:
print("МОДЕЛЬ 3: КОРРЕКТНОЕ ОБУЧЕНИЕ")
optimal_features = ['Pclass', 'Sex_encoded', 'Age', 'FamilySize']
X_train_opt = X_train_full[optimal_features]
X_test_opt = X_test_full[optimal_features]

scaler_opt = StandardScaler()
X_train_opt_scaled = scaler_opt.fit_transform(X_train_opt)
X_test_opt_scaled = scaler_opt.transform(X_test_opt)

model_correct = LogReg(learning_rate=0.1, n_iterations=1000)
model_correct.fit(X_train_opt_scaled, y_train)

y_pred_correct = model_correct.predict(X_test_opt_scaled)
test_accuracy_correct = accuracy_score(y_test, y_pred_correct)
y_train_pred_correct = model_correct.predict(X_train_opt_scaled)
train_accuracy_correct = accuracy_score(y_train, y_train_pred_correct)

correct_fitting = fitting(train_accuracy_correct, test_accuracy_correct)
print(f"Признаков: {len(optimal_features)}")
print("Параметры: learning_rate=0.1, n_iterations=1000")
print(f"Train Accuracy: {train_accuracy_correct:.4f}")
print(f"Test Accuracy: {test_accuracy_correct:.4f}")
print(f"Разница: {correct_fitting['train_test_diff']:.4f}")
print(f"Корректно ли обучение: {correct_fitting['fitting']}")

МОДЕЛЬ 3: КОРРЕКТНОЕ ОБУЧЕНИЕ
Признаков: 4
Параметры: learning_rate=0.1, n_iterations=1000
Train Accuracy: 0.7893
Test Accuracy: 0.7933
Разница: 0.0040
Корректно ли обучение: НОРМАЛЬНОЕ ОБУЧЕНИЕ


In [None]:
optimal_features = ['Pclass', 'Sex_encoded', 'Age', 'FamilySize']
X_train_opt = X_train_full[optimal_features]
X_test_opt = X_test_full[optimal_features]

scaler_opt = StandardScaler()
X_train_opt_scaled = scaler_opt.fit_transform(X_train_opt)
X_test_opt_scaled = scaler_opt.transform(X_test_opt)

print("1.СВОЯ РЕАЛИЗАЦИЯ:")
model_custom = LogReg(learning_rate=0.1, n_iterations=1000)
model_custom.fit(X_train_opt_scaled, y_train)

y_pred_custom_train = model_custom.predict(X_train_opt_scaled)
y_pred_custom_test = model_custom.predict(X_test_opt_scaled)

train_acc_custom = accuracy_score(y_train, y_pred_custom_train)
test_acc_custom = accuracy_score(y_test, y_pred_custom_test)

print(f"   Train Accuracy: {train_acc_custom:.4f}")
print(f"   Test Accuracy:  {test_acc_custom:.4f}")
print(f"   Веса: {model_custom.weights}")

print("2.SCIKIT-LEARN РЕАЛИЗАЦИЯ:")
model_sklearn = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
model_sklearn.fit(X_train_opt_scaled, y_train)

y_pred_sklearn_train = model_sklearn.predict(X_train_opt_scaled)
y_pred_sklearn_test = model_sklearn.predict(X_test_opt_scaled)

train_acc_sklearn = accuracy_score(y_train, y_pred_sklearn_train)
test_acc_sklearn = accuracy_score(y_test, y_pred_sklearn_test)

print(f"   Train Accuracy: {train_acc_sklearn:.4f}")
print(f"   Test Accuracy:  {test_acc_sklearn:.4f}")
print(f"   Веса: {model_sklearn.coef_[0]}")

print("3.СРАВНЕНИЕ МЕТРИК:")
print("   Метрика              Своя реализация  Scikit-learn    Разница")
print(f"   Train Accuracy    |     {train_acc_custom:.4f}      |    {train_acc_sklearn:.4f}    | {abs(train_acc_custom - train_acc_sklearn):.4f}")
print(f"   Test Accuracy     |     {test_acc_custom:.4f}      |    {test_acc_sklearn:.4f}    | {abs(test_acc_custom - test_acc_sklearn):.4f}")

1.СВОЯ РЕАЛИЗАЦИЯ:
   Train Accuracy: 0.7893
   Test Accuracy:  0.7933
   Веса: [-1.0118029  -1.31441958 -0.55556327 -0.30506512]
2.SCIKIT-LEARN РЕАЛИЗАЦИЯ:
   Train Accuracy: 0.7879
   Test Accuracy:  0.7933
   Веса: [-0.99246823 -1.2956085  -0.54204518 -0.29541199]
3.СРАВНЕНИЕ МЕТРИК:
   Метрика              Своя реализация  Scikit-learn    Разница
   Train Accuracy    |     0.7893      |    0.7879    | 0.0014
   Test Accuracy     |     0.7933      |    0.7933    | 0.0000
