In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, RobustScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import numpy as np

In [2]:
# Read the raw training and test tables from the current working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
def extract_date_features(df):
    """Swap the ``ApplicationDate`` column for numeric calendar parts.

    The column is parsed with ``errors='coerce'``, so unparseable values
    become NaT and yield missing values in every derived column.

    Args:
        df: DataFrame containing an ``ApplicationDate`` column. It is
            modified in place.

    Returns:
        The same ``df`` object, now holding ``Year``, ``Month``, ``Day`` and
        ``DayOfWeek`` and no longer holding ``ApplicationDate``.
    """
    parsed = pd.to_datetime(df['ApplicationDate'], errors='coerce')
    calendar_parts = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('DayOfWeek', 'dayofweek'),
    )
    for column, accessor in calendar_parts:
        df[column] = getattr(parsed.dt, accessor)
    # The raw date is not used as a feature once its parts are extracted.
    del df['ApplicationDate']
    return df

In [4]:
df_train = extract_date_features(df_train)
df_test = extract_date_features(df_test)

# Income bin edges. The outer edges are open-ended rather than the training
# min/max: pd.cut intervals are left-open by default, so using the training
# minimum as the first edge put that very row into no bin (NaN), and any test
# income outside the training range was left unbinned as well.
income_bins = [float('-inf'), 30000, 50000, 80000, 120000, float('inf')]


def _add_binned_features(df, income_edges):
    """Add the categorical bin columns derived from numeric inputs, in place.

    Args:
        df: DataFrame with ``Age``, ``AnnualIncome``, ``CreditScore``,
            ``DebtToIncomeRatio`` and ``LengthOfCreditHistory`` columns.
        income_edges: Monotonically increasing bin edges for ``AnnualIncome``
            (six edges for the five income labels).

    Returns:
        The same ``df`` object with ``Age_bin``, ``Income_bin``,
        ``CreditScore_bin``, ``DTI_category`` and ``LongCreditHistory`` added.
    """
    df['Age_bin'] = pd.cut(
        df['Age'], bins=[0, 30, 50, 100],
        labels=['young', 'middle', 'senior'])
    df['Income_bin'] = pd.cut(
        df['AnnualIncome'], bins=income_edges,
        labels=['very_low', 'low', 'med', 'high', 'very_high'])
    df['CreditScore_bin'] = pd.cut(
        df['CreditScore'], bins=[0, 579, 669, 739, 799, 850],
        labels=['poor', 'fair', 'good', 'very_good', 'excellent'])
    # include_lowest=True keeps a ratio of exactly 0 in the first bin instead
    # of turning it into NaN.
    df['DTI_category'] = pd.cut(
        df['DebtToIncomeRatio'], bins=[0, 0.2, 0.35, 0.5, 0.7, float('inf')],
        labels=['excellent', 'good', 'fair', 'poor', 'very_poor'],
        include_lowest=True)
    # Stored as the strings 'True'/'False' so the column is handled by the
    # categorical (one-hot) branch of the preprocessing.
    df['LongCreditHistory'] = (df['LengthOfCreditHistory'] > 5).astype(str)
    return df


df_train = _add_binned_features(df_train, income_bins)
df_test = _add_binned_features(df_test, income_bins)

In [5]:
# Columns routed to the categorical branch: the raw categorical inputs
# followed by the engineered bin/flag columns.
categorical_cols = [
    'MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus',
    'EducationLevel', 'BankruptcyHistory', 'PreviousLoanDefaults',
    'UtilityBillsPaymentHistory', 'PaymentHistory',
    'Age_bin', 'Income_bin', 'CreditScore_bin', 'DTI_category', 'LongCreditHistory',
]

# Every remaining training column except the target and the row identifier is
# treated as numeric.
non_numeric = set(categorical_cols) | {'RiskScore', 'ID'}
numerical_cols = [col for col in df_train.columns if col not in non_numeric]

# Force numeric dtype in both tables; values that cannot be parsed become NaN.
for frame in (df_train, df_test):
    for col in numerical_cols:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

In [6]:
# Clean the target: coerce to numeric, drop rows without a usable target, then
# trim the extreme tails of the target distribution.
print(f"Строк до очистки: {len(df_train)}")
df_train['RiskScore'] = pd.to_numeric(df_train['RiskScore'], errors='coerce')
df_train = df_train.dropna(subset=['RiskScore'])
print(f"Строк после удаления NaN в RiskScore: {len(df_train)}")

# Despite the names these are the 4th and 96th percentiles, not quartiles;
# rows outside [q1, q3] (bounds inclusive) are discarded.
q1 = df_train['RiskScore'].quantile(0.04)
q3 = df_train['RiskScore'].quantile(0.96)
# .copy() detaches the filtered rows from the parent frame, so adding columns
# to df_train afterwards does not raise SettingWithCopyWarning.
df_train = df_train[df_train['RiskScore'].between(q1, q3)].copy()
print(f"Строк после удаления выбросов: {len(df_train)}")

# Keep the test identifiers for the submission file; ID itself is not a feature.
test_ids = df_test['ID'].copy()
df_test = df_test.drop(columns='ID')

Строк до очистки: 11017
Строк после удаления NaN в RiskScore: 10487
Строк после удаления выбросов: 9647


In [7]:
new_numerical_cols = ['Income_to_Debt', 'Credit_to_Income', 'Loan_to_Income', 'Loan_to_NetWorth', 'RateSpread']


def _add_ratio_features(df):
    """Return a new frame with the engineered ratio and spread columns.

    Denominators are shifted by 1, as in the original formulas. A shifted
    denominator can still be exactly zero (for example ``NetWorth == -1``),
    which produces +/-inf; those values are replaced with NaN so they are
    treated as missing by the imputer (scikit-learn estimators are expected
    to reject infinite inputs).

    Args:
        df: DataFrame with ``AnnualIncome``, ``MonthlyDebtPayments``,
            ``CreditScore``, ``LoanAmount``, ``NetWorth``, ``InterestRate``
            and ``BaseInterestRate`` columns. It is not modified.

    Returns:
        A new DataFrame containing all columns of ``df`` plus the columns
        listed in ``new_numerical_cols``.
    """
    engineered = df.assign(
        Income_to_Debt=df['AnnualIncome'] / (df['MonthlyDebtPayments'] + 1),
        Credit_to_Income=df['CreditScore'] / (df['AnnualIncome'] + 1),
        Loan_to_Income=df['LoanAmount'] / (df['AnnualIncome'] + 1),
        Loan_to_NetWorth=df['LoanAmount'] / (df['NetWorth'] + 1),
        RateSpread=df['InterestRate'] - df['BaseInterestRate'],
    )
    engineered[new_numerical_cols] = engineered[new_numerical_cols].replace(
        [np.inf, -np.inf], np.nan)
    return engineered


# Building new frames (instead of assigning columns onto a filtered frame)
# avoids SettingWithCopyWarning on the trimmed training data.
df_train = _add_ratio_features(df_train)
df_test = _add_ratio_features(df_test)

# Append only names that are not registered yet, so re-running this cell (or
# the column-grouping cell before it) never feeds duplicate columns to the model.
numerical_cols = numerical_cols + [col for col in new_numerical_cols if col not in numerical_cols]

In [8]:
# Numeric branch: impute -> Yeo-Johnson power transform -> degree-2 polynomial
# expansion (no bias column) -> robust scaling.
# NOTE(review): strategy='constant' is used without an explicit fill_value;
# per the scikit-learn docs this presumably fills numeric gaps with 0 —
# confirm that is the intended imputation.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('transform', PowerTransformer(method='yeo-johnson')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', RobustScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; handle_unknown='ignore' is set for levels not seen during fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [9]:
# Full estimator: preprocessing followed by ordinary least squares.
regression_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=regression_steps)

# Separate the target from the features, then hold out 20% of the rows for
# validation with a fixed seed.
y = df_train['RiskScore']
X = df_train.drop(columns='RiskScore')
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

model.fit(X_train, y_train)

In [10]:
# Score the fitted model on the held-out validation rows.
y_pred_val = model.predict(X_val)

holdout_mse, mae, r2, mape = (
    metric(y_val, y_pred_val)
    for metric in (
        mean_squared_error,
        mean_absolute_error,
        r2_score,
        mean_absolute_percentage_error,
    )
)

# One line per metric, two decimals each.
for label, value in (
    ('Holdout MSE', holdout_mse),
    ('MAE', mae),
    ('R²', r2),
    ('MAPE', mape),
):
    print(f'{label}: {value:.2f}')

Holdout MSE: 25.10
MAE: 3.61
R²: 0.91
MAPE: 0.08


In [None]:
# 5-fold cross-validation on the training split only; the validation rows are
# not passed to the scorer.
print("\nCross-Validation Results:")
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
# The scorer reports negated MSE, so flip the sign to get plain MSE per fold.
cv_mse_scores = np.negative(cv_scores)
cv_mse_mean, cv_mse_std = cv_mse_scores.mean(), cv_mse_scores.std()

per_fold = [format(score, '.2f') for score in cv_mse_scores]
print(f'CV MSE: {cv_mse_mean:.2f} (±{cv_mse_std:.2f})')
print(f'CV MSE scores per fold: {per_fold}')


Cross-Validation Results:


In [None]:
# Refit on every cleaned training row (train + validation splits), then
# predict the test table and write the submission file.
y_test_pred = model.fit(X, y).predict(df_test)

submission = pd.DataFrame({
    'ID': test_ids,
    'RiskScore': y_test_pred,
})
submission.to_csv('submission.csv', index=False)
print('Submission file created')