In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the competition data.
train = pd.read_csv("/kaggle/input/mai-ml-lab-1-fiit-2025/train.csv")
test = pd.read_csv("/kaggle/input/mai-ml-lab-1-fiit-2025/test.csv")

print("Было строк в train:", len(train))

# Keep only rows whose target lies in [0, 100] (bounds inclusive). Rows with a
# missing RiskScore evaluate to False and are dropped as well.
good_mask = train['RiskScore'].between(0, 100)
# Take an explicit copy: later cells assign new columns to `train`, and
# assigning into a boolean-indexed slice raises SettingWithCopyWarning.
train = train[good_mask].copy()

print("Стало строк после очистки:", len(train))
print("RiskScore — min:", train['RiskScore'].min(), "max:", train['RiskScore'].max())

print("Размер датасета:", train.shape)
train.head()

In [None]:
# Column dtypes and non-null counts (printed by .info()).
train.info()
# Summary statistics for every column, numeric and non-numeric alike.
train.describe(include='all')

In [None]:
# Parse the application date column into datetime values.
train["ApplicationDate"] = pd.to_datetime(train["ApplicationDate"])

# Number of missing values per column, largest first.
missing_per_column = train.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Histogram (40 bins) with a KDE overlay for the target variable.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=train, x="RiskScore", bins=40, kde=True, ax=ax)
ax.set_title("Распределение RiskScore")
plt.show()

In [None]:
# Numeric features whose marginal distributions are inspected below.
num_features = [
    "Age",
    "AnnualIncome",
    "CreditScore",
    "LoanAmount",
    "LoanDuration",
    "MonthlyDebtPayments",
    "DebtToIncomeRatio",
    "NetWorth",
]

# One 30-bin histogram per selected feature.
train.loc[:, num_features].hist(bins=30, figsize=(14, 10))
plt.tight_layout()
plt.show()

In [None]:
# Features plotted against the target on a 2x3 grid of scatter plots.
features_to_plot = [
    "CreditScore",
    "AnnualIncome",
    "LoanAmount",
    "DebtToIncomeRatio",
    "MonthlyDebtPayments",
    "NetWorth",
]

fig, axes = plt.subplots(2, 3, figsize=(14, 10))
for ax, feat in zip(axes.flat, features_to_plot):
    sns.scatterplot(data=train, x=feat, y="RiskScore", alpha=0.5, ax=ax)
    ax.set_title(f"{feat} vs RiskScore")
fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the numeric columns.
corr = train.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# The 15 columns with the highest correlation to the target.
corr.loc[:, "RiskScore"].sort_values(ascending=False).iloc[:15]


Разведочный анализ данных показал, что признаки в датасете в большинстве своём слабо коррелируют друг с другом и практически не имеют линейной зависимости с целевой переменной RiskScore. В данных присутствует значительное количество пропусков, особенно в ключевых финансовых характеристиках, что снижает качество статистических связей и влияет на корректность корреляционного анализа. Распределение RiskScore выглядит почти равномерным и не демонстрирует выраженной зависимости от входных признаков, что может указывать на его синтетическую природу.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def _scaled_numeric_copy(frame, columns, scaler):
    """Return a copy of ``frame`` whose ``columns`` are replaced by ``scaler.fit_transform`` output."""
    scaled = frame.copy()
    scaled[columns] = scaler.fit_transform(frame[columns])
    return scaled


# Columns with float64 / int64 dtype are the ones being normalized.
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns

# Z-score normalization.
z_scaler = StandardScaler()
df_zscore = _scaled_numeric_copy(train, numeric_cols, z_scaler)

print("Z-score normalized data (первые 5 строк):")
display(df_zscore.head())

# Min-max normalization.
mm_scaler = MinMaxScaler()
df_minmax = _scaled_numeric_copy(train, numeric_cols, mm_scaler)

print("Min-Max normalized data (первые 5 строк):")
display(df_minmax.head())


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

class MyLinearRegression:
    """Linear regression with an intercept term and three interchangeable solvers.

    Solvers (selected with ``fit(method=...)``):

    * ``"normal"`` - closed-form normal equations with an L2 penalty of
      ``lambda_reg`` on every weight except the intercept;
    * ``"gd"``     - full-batch gradient descent on the mean squared error;
    * ``"sgd"``    - mini-batch gradient descent with a decaying step size.

    ``lambda_reg`` is only used by the ``"normal"`` solver. Inputs are converted
    to float arrays and passed through ``np.nan_to_num`` (NaN becomes 0, +/-inf
    become large finite numbers) before fitting and predicting.

    Attributes:
        weights: 1-D array ``[intercept, w_1, ..., w_n]``; ``None`` until ``fit`` is called.
        lambda_reg: L2 penalty strength used by the ``"normal"`` solver.
    """

    _METHODS = ("normal", "gd", "sgd")

    def __init__(self, lambda_reg=1e-5):
        self.weights = None
        self.lambda_reg = lambda_reg

    @staticmethod
    def _clean(values):
        """Convert ``values`` to a float ndarray with NaN/inf replaced by finite numbers."""
        return np.nan_to_num(np.asarray(values, dtype=float))

    def fit(self, X, y, method="normal", lr=0.01, iters=2000, batch_size=32):
        """Estimate the weights from the training data.

        Args:
            X: 2-D array-like of shape (m, n) with the features.
            y: 1-D array-like of length m with the targets.
            method: ``"normal"``, ``"gd"`` or ``"sgd"``.
            lr: learning rate (initial rate for ``"sgd"``); unused by ``"normal"``.
            iters: number of iterations (``"gd"``) or epochs (``"sgd"``).
            batch_size: mini-batch size for ``"sgd"``.

        Raises:
            ValueError: if ``method`` is not one of the supported solvers.
        """
        # Fail loudly instead of silently leaving the model unfitted.
        if method not in self._METHODS:
            raise ValueError(
                f"Unknown method {method!r}; expected one of {self._METHODS}"
            )

        X = self._clean(X)
        y = self._clean(y)
        m, n = X.shape
        # Prepend a column of ones so that weights[0] acts as the intercept.
        X_b = np.hstack([np.ones((m, 1)), X])

        if method == "normal":
            penalty = np.eye(n + 1)
            penalty[0, 0] = 0  # the intercept is not regularized
            reg_matrix = X_b.T @ X_b + self.lambda_reg * penalty
            rhs = X_b.T @ y
            try:
                self.weights = np.linalg.solve(reg_matrix, rhs)
            except np.linalg.LinAlgError:
                # Singular system: fall back to the pseudo-inverse solution.
                self.weights = np.linalg.pinv(reg_matrix) @ rhs

        elif method == "gd":
            self.weights = np.zeros(n + 1)
            for _ in range(iters):
                residual = X_b @ self.weights - y
                grad = (2 / m) * X_b.T @ residual
                self.weights -= lr * grad

        else:  # "sgd"
            self.weights = np.zeros(n + 1)
            for epoch in range(iters):
                # The step size decays with the epoch number.
                lr_epoch = lr / (1 + 0.001 * epoch)
                # Reshuffle the rows at the start of every epoch.
                indices = np.random.permutation(m)
                X_b_shuf = X_b[indices]
                y_shuf = y[indices]
                for start in range(0, m, batch_size):
                    X_batch = X_b_shuf[start:start + batch_size]
                    y_batch = y_shuf[start:start + batch_size]
                    grad = 2 * X_batch.T @ (X_batch @ self.weights - y_batch) / len(X_batch)
                    self.weights -= lr_epoch * grad

    def predict(self, X):
        """Return predictions for ``X`` (2-D array-like with the same columns used in ``fit``).

        Raises:
            ValueError: if the model has not been fitted yet.
        """
        if self.weights is None:
            raise ValueError("Model is not fitted yet; call fit() before predict()")
        X = self._clean(X)
        X_b = np.hstack([np.ones((X.shape[0], 1)), X])
        return X_b @ self.weights

# Work on a copy so the EDA frame `train` stays untouched.
df = train.copy()
if 'ApplicationDate' in df.columns:
    # Derive calendar features; unparseable dates become NaT and are filled with 0.
    date = pd.to_datetime(df['ApplicationDate'], errors='coerce')
    df['app_month'] = date.dt.month.fillna(0)
    df['app_weekday'] = date.dt.weekday.fillna(0)
    df['app_is_weekend'] = (date.dt.weekday >= 5).astype(int).fillna(0)
    df = df.drop('ApplicationDate', axis=1, errors='ignore')


# Added to the denominators below to avoid division by zero.
eps = 1
# NOTE(review): the EDA cells of this notebook use 'AnnualIncome' and
# 'MonthlyDebtPayments'; if 'Income' / 'Debt' are not real columns, every
# branch guarded by them is silently skipped - confirm the column names.
if all(c in df.columns for c in ['CreditScore', 'Income']):
    df['credit_income_ratio'] = df['CreditScore'] / (df['Income'] + eps)
if all(c in df.columns for c in ['Debt', 'Income']):
    df['dti'] = df['Debt'] / (df['Income'] + eps)
if all(c in df.columns for c in ['LoanAmount', 'Income']):
    df['pti'] = df['LoanAmount'] / (df['Income'] + eps)
if 'Income' in df.columns:
    df['log_income'] = np.log1p(df['Income'])
if 'Age' in df.columns:
    df['age_sq'] = df['Age'] ** 2
if 'CreditScore' in df.columns:
    df['credit_sq'] = df['CreditScore'] ** 2

# Target: coerced to numeric, missing values replaced by the median (0 if all are missing).
y_raw = pd.to_numeric(df['RiskScore'], errors='coerce')
y = y_raw.fillna(y_raw.median() if not y_raw.isna().all() else 0).values

X = df.drop(columns=['RiskScore'], errors='ignore')

# Keep the 15 most frequent values of each categorical column; everything else
# (including missing values) is mapped to 'Other'.
cat_cols = X.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    top = X[col].value_counts().head(15).index
    X[col] = X[col].where(X[col].isin(top), 'Other')

# One-hot encode; the first level of each categorical column is dropped.
X = pd.get_dummies(X, drop_first=True)

# Infinite values are treated as missing, then numeric gaps are filled with the
# column median. NOTE(review): the medians are computed on the full X, i.e.
# before the train/test split below.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median(numeric_only=True))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training part only and reused for the hold-out part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Replace any NaN / inf left in the scaled matrices with 0.
X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0, posinf=0.0, neginf=0.0)
X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0, posinf=0.0, neginf=0.0)

# Closed-form (normal equation) fit of the custom model.
model_my = MyLinearRegression(lambda_reg=1e-5)
model_my.fit(X_train_scaled, y_train, method="normal")
pred_my = model_my.predict(X_test_scaled)

print(f"Моя модель (нормальное ур.) MSE: {mean_squared_error(y_test, pred_my):.4f}")
print(f"Моя модель R²: {r2_score(y_test, pred_my):.6f}")

# Same custom model fitted with full-batch gradient descent.
model_gd = MyLinearRegression()
model_gd.fit(X_train_scaled, y_train, method="gd", lr=0.05, iters=15000)
print(f"Моя GD MSE: {mean_squared_error(y_test, model_gd.predict(X_test_scaled)):.4f}")

# scikit-learn reference fitted on the same scaled data.
sk = LinearRegression()
sk.fit(X_train_scaled, y_train)
pred_sk = sk.predict(X_test_scaled)
print(f"\nsklearn MSE: {mean_squared_error(y_test, pred_sk):.4f}")
print(f"sklearn R²:  {r2_score(y_test, pred_sk):.6f}")

In [None]:
from sklearn.model_selection import KFold
import numpy as np

def k_fold_cv(X, y, k=5):
    """Estimate the MSE of ``MyLinearRegression`` (normal equations) with k-fold CV.

    Args:
        X: 2-D array-like (ndarray or DataFrame) with the features.
        y: 1-D array-like with the targets.
        k: number of folds.

    Returns:
        Tuple ``(mean, std)`` of the per-fold MSE values.
    """
    # Convert to plain float arrays so the integer fold indices select ROWS;
    # on a DataFrame ``X[train_index]`` would be a column lookup and raise KeyError.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    mse_scores = []

    for train_index, test_index in kf.split(X):
        model = MyLinearRegression()
        model.fit(X[train_index], y[train_index], method="normal")
        y_pred_cv = model.predict(X[test_index])
        mse_scores.append(mean_squared_error(y[test_index], y_pred_cv))

    return np.mean(mse_scores), np.std(mse_scores)

# Cross-validate on the standardized training matrix: it is a NumPy array (so
# positional fold indexing works) and matches the features the final model uses.
mean_mse, std_mse = k_fold_cv(X_train_scaled, y_train, k=5)
print(f"K-Fold CV (k=5) MSE: {mean_mse:.4f} ± {std_mse:.4f}")

def loo_cv(X, y):
    """Estimate the MSE of ``MyLinearRegression`` (normal equations) with leave-one-out CV.

    The model is refitted once per sample, so the cost grows with the number
    of rows; use on small data sets only.

    Args:
        X: 2-D array-like (ndarray or DataFrame) with the features.
        y: 1-D array-like with the targets.

    Returns:
        Tuple ``(mean, std)`` of the per-sample squared errors.
    """
    # Plain float arrays: a mixed-dtype DataFrame would otherwise turn into an
    # object array that the linear solver cannot handle.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    n_samples = X.shape[0]

    squared_errors = []
    keep = np.ones(n_samples, dtype=bool)
    for i in range(n_samples):
        keep[i] = False  # hold out sample i
        model = MyLinearRegression()
        model.fit(X[keep], y[keep], method="normal")
        y_pred_loo = model.predict(X[i:i + 1])
        squared_errors.append((y[i:i + 1] - y_pred_loo) ** 2)
        keep[i] = True   # put it back for the next round

    mse_scores = np.array(squared_errors).flatten()
    return np.mean(mse_scores), np.std(mse_scores)

# Leave-one-out on the standardized training matrix (a NumPy array with the
# same features the final model was trained on). One refit per row: slow on large data.
mean_mse_loo, std_mse_loo = loo_cv(X_train_scaled, y_train)
print(f"LOO CV MSE: {mean_mse_loo:.4f} ± {std_mse_loo:.4f}")

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error as mse_sklearn

def mean_squared_error_manual(y_true, y_pred):
    """Return the mean of the squared differences between ``y_true`` and ``y_pred``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    actual, predicted = np.array(y_true), np.array(y_pred)

    if actual.shape != predicted.shape:
        raise ValueError("y_true и y_pred должны иметь одинаковую форму")

    residuals = actual - predicted
    return np.mean(residuals * residuals)

In [None]:
# Predictions of the closed-form model on the hold-out split. `model_my` was
# trained on standardized features, so the scaled test matrix must be used.
y_pred = model_my.predict(X_test_scaled)

mse_manual = mean_squared_error_manual(y_test, y_pred)
print("Manual MSE:", mse_manual)

mse_skl = mse_sklearn(y_test, y_pred)
print("Sklearn MSE:", mse_skl)

print("Разница:", abs(mse_manual - mse_skl))

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error as mae_sklearn

def mean_absolute_error_manual(y_true, y_pred):
    """Return the mean of the absolute differences between ``y_true`` and ``y_pred``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    actual, predicted = np.array(y_true), np.array(y_pred)

    if actual.shape != predicted.shape:
        raise ValueError("y_true и y_pred должны иметь одинаковую форму")

    return np.abs(actual - predicted).mean()

In [None]:
# Predictions of the closed-form model on the hold-out split. `model_my` was
# trained on standardized features, so the scaled test matrix must be used.
y_pred = model_my.predict(X_test_scaled)

mae_manual = mean_absolute_error_manual(y_test, y_pred)
print("Manual MAE:", mae_manual)

mae_skl = mae_sklearn(y_test, y_pred)
print("Sklearn MAE:", mae_skl)

print("Разница:", abs(mae_manual - mae_skl))

In [None]:
import numpy as np
from sklearn.metrics import r2_score as r2_sklearn

def r2_score_manual(y_true, y_pred):
    """Return the coefficient of determination R^2 = 1 - SS_res / SS_tot.

    When ``y_true`` is constant (SS_tot == 0) the ratio is undefined; in that
    case 1.0 is returned for a perfect prediction and 0.0 otherwise, matching
    the default behaviour of ``sklearn.metrics.r2_score``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true и y_pred должны иметь одинаковую форму")

    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_res = np.sum((y_true - y_pred) ** 2)

    # Avoid a 0/0 or x/0 division (NaN / -inf plus a RuntimeWarning).
    if ss_total == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - ss_res / ss_total

In [None]:
# Predictions of the closed-form model on the hold-out split. `model_my` was
# trained on standardized features, so the scaled test matrix must be used.
y_pred = model_my.predict(X_test_scaled)

r2_manual = r2_score_manual(y_test, y_pred)
print("Manual R²:", r2_manual)

r2_skl = r2_sklearn(y_test, y_pred)
print("Sklearn R²:", r2_skl)

print("Разница:", abs(r2_manual - r2_skl))

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as mape_sklearn

def mean_absolute_percentage_error_manual_safe(y_true, y_pred, eps=1e-2):
    """Return MAPE in percent, computed only where ``|y_true| > eps``.

    Samples whose true value is within ``eps`` of zero are excluded so that
    they cannot blow up the relative error.

    Raises:
        ValueError: if the shapes differ, or if no sample satisfies ``|y_true| > eps``.
    """
    actual, predicted = np.array(y_true), np.array(y_pred)

    if actual.shape != predicted.shape:
        raise ValueError("y_true и y_pred должны иметь одинаковую форму")

    usable = np.abs(actual) > eps
    if not usable.any():
        raise ValueError("Все значения y_true слишком малы для вычисления MAPE")

    relative_error = np.abs((actual[usable] - predicted[usable]) / actual[usable])
    return np.mean(relative_error) * 100

In [None]:
# NOTE(review): this cell repeats the earlier MAE comparison cell verbatim.
# Predictions of the closed-form model on the hold-out split. `model_my` was
# trained on standardized features, so the scaled test matrix must be used.
y_pred = model_my.predict(X_test_scaled)

mae_manual = mean_absolute_error_manual(y_test, y_pred)
print("Manual MAE:", mae_manual)

mae_skl = mae_sklearn(y_test, y_pred)
print("Sklearn MAE:", mae_skl)

print("Разница:", abs(mae_manual - mae_skl))

In [None]:
import numpy as np
from sklearn.metrics import r2_score as r2_sklearn

def r2_score_manual(y_true, y_pred):
    """Return the coefficient of determination R^2 = 1 - SS_res / SS_tot.

    When ``y_true`` is constant (SS_tot == 0) the ratio is undefined; in that
    case 1.0 is returned for a perfect prediction and 0.0 otherwise, matching
    the default behaviour of ``sklearn.metrics.r2_score``.

    NOTE(review): a function with this name is already defined in an earlier
    cell of this notebook; one of the two definitions can be removed.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true и y_pred должны иметь одинаковую форму")

    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_res = np.sum((y_true - y_pred) ** 2)

    # Avoid a 0/0 or x/0 division (NaN / -inf plus a RuntimeWarning).
    if ss_total == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - ss_res / ss_total

In [None]:
# NOTE(review): this cell repeats the earlier R² comparison cell verbatim.
# Predictions of the closed-form model on the hold-out split. `model_my` was
# trained on standardized features, so the scaled test matrix must be used.
y_pred = model_my.predict(X_test_scaled)

r2_manual = r2_score_manual(y_test, y_pred)
print("Manual R²:", r2_manual)

r2_skl = r2_sklearn(y_test, y_pred)
print("Sklearn R²:", r2_skl)

print("Разница:", abs(r2_manual - r2_skl))

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as mape_sklearn

def mean_absolute_percentage_error_manual_safe(y_true, y_pred, eps=1e-2):
    """Return MAPE in percent over the samples with ``|y_true| > eps``.

    Near-zero true values are skipped because dividing by them would dominate
    the average.

    Raises:
        ValueError: if the shapes differ, or if every ``|y_true|`` is at most ``eps``.
    """
    truth = np.array(y_true)
    guess = np.array(y_pred)

    if truth.shape != guess.shape:
        raise ValueError("y_true и y_pred должны иметь одинаковую форму")

    selected = np.abs(truth) > eps
    if not selected.any():
        raise ValueError("Все значения y_true слишком малы для вычисления MAPE")

    truth_sel, guess_sel = truth[selected], guess[selected]
    return np.mean(np.abs((truth_sel - guess_sel) / truth_sel)) * 100

In [None]:
# Hold-out predictions of the closed-form model (trained on standardized features).
y_pred = model_my.predict(X_test_scaled)

mape_manual_safe = mean_absolute_percentage_error_manual_safe(y_test, y_pred)
print("Manual safe MAPE (%):", mape_manual_safe)

# Apply the same near-zero filter before calling sklearn, which returns a
# fraction rather than a percentage.
safe_mask = np.abs(y_test) > 1e-2
y_test_safe = y_test[safe_mask]
y_pred_safe = y_pred[safe_mask]
mape_skl_safe = mape_sklearn(y_test_safe, y_pred_safe) * 100
print("Sklearn safe MAPE:", mape_skl_safe)

# Compare the two MAPE values (the original printed the R² difference here).
print("Разница:", abs(mape_manual_safe - mape_skl_safe))

In [None]:
# Build the submission: repeat the training-time feature pipeline on the test
# set, predict with the closed-form model and write submission.csv.
test = pd.read_csv("/kaggle/input/mai-ml-lab-1-fiit-2025/test.csv")
test_ids = test["ID"]

df_test = test.copy()

# Calendar features, identical to the ones derived for the training frame.
if 'ApplicationDate' in df_test.columns:
    date = pd.to_datetime(df_test['ApplicationDate'], errors='coerce')
    df_test['app_month'] = date.dt.month.fillna(0)
    df_test['app_weekday'] = date.dt.weekday.fillna(0)
    df_test['app_is_weekend'] = (date.dt.weekday >= 5).astype(int).fillna(0)
    df_test = df_test.drop('ApplicationDate', axis=1, errors='ignore')

# Ratio / polynomial features, identical to the training cell.
eps = 1
if all(c in df_test.columns for c in ['CreditScore', 'Income']):
    df_test['credit_income_ratio'] = df_test['CreditScore'] / (df_test['Income'] + eps)
if all(c in df_test.columns for c in ['Debt', 'Income']):
    df_test['dti'] = df_test['Debt'] / (df_test['Income'] + eps)
if all(c in df_test.columns for c in ['LoanAmount', 'Income']):
    df_test['pti'] = df_test['LoanAmount'] / (df_test['Income'] + eps)
if 'Income' in df_test.columns:
    df_test['log_income'] = np.log1p(df_test['Income'])
if 'Age' in df_test.columns:
    df_test['age_sq'] = df_test['Age'] ** 2
if 'CreditScore' in df_test.columns:
    df_test['credit_sq'] = df_test['CreditScore'] ** 2

# Collapse rare categories using the TRAINING frequencies. `X` is already
# one-hot encoded and no longer holds the raw categorical columns, so the
# top categories are taken from the filtered `train` frame instead.
cat_cols = df_test.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    if col in train.columns:
        top_cats = train[col].value_counts().head(15).index
        df_test[col] = df_test[col].where(df_test[col].isin(top_cats), 'Other')

X_submit = df_test.drop(columns=['ID'], errors='ignore')
# Encode every level and let `reindex` keep exactly the training columns.
# With drop_first=True the dropped baseline level would depend on which levels
# happen to occur in the test set and could differ from the training baseline.
X_submit = pd.get_dummies(X_submit)
X_submit = X_submit.reindex(columns=X.columns, fill_value=0)
X_submit = X_submit.fillna(X.median(numeric_only=True))
X_submit_scaled = scaler.transform(X_submit)
X_submit_scaled = np.nan_to_num(X_submit_scaled, nan=0.0, posinf=0.0, neginf=0.0)

y_pred_submit = model_my.predict(X_submit_scaled)

submission = pd.DataFrame({
    "ID": test_ids,
    "RiskScore": y_pred_submit.flatten()
})
submission.to_csv("submission.csv", index=False)
# The closing quote was missing here, which made the whole cell a SyntaxError.
print("submission.csv готов")
print(f"Предсказания: min={y_pred_submit.min():.2f}, max={y_pred_submit.max():.2f}, mean={y_pred_submit.mean():.2f}")