In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

ОБРАБОТКА ДАННЫХ

In [22]:
# Load the labelled training data from the Kaggle competition input directory.
train_csv_path = '/kaggle/input/mai-ml-contest-1/train.csv'
train = pd.read_csv(train_csv_path)

In [3]:
# The histograms below reveal outliers in RiskScore (uncomment to plot them).
#train.hist(figsize=(20, 20), bins=30)

In [23]:
# The application date is removed: credit scoring is assumed not to depend on time.
# The nominal attributes listed below are one-hot encoded; drop_first=True removes
# one dummy column per attribute so the dummies are not linearly dependent.
nominal_columns = ['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus']
train = pd.get_dummies(
    train.drop(columns=['ApplicationDate']),
    columns=nominal_columns,
    drop_first=True,
)

In [24]:
# Remove the detected outliers: keep only rows whose RiskScore lies within
# [0, 100] (both bounds inclusive). Rows with a missing RiskScore are removed
# too, because the range check evaluates to False for NaN.
# .copy() turns the filtered result into an independent frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
train = train[train['RiskScore'].between(0, 100)].copy()

In [25]:
# Education has a natural order, so every level is encoded by its rank.
_EDUCATION_RANKS = (
    ('High School', 1),
    ('Associate', 2),
    ('Bachelor', 3),
    ('Master', 4),
    ('Doctorate', 5),
)


def educgen(state):
    """Translate an education level label into its ordinal rank.

    Args:
        state: Education level label, e.g. 'High School' or 'Master'.

    Returns:
        An int from 1 ('High School') to 5 ('Doctorate') when the label is
        known; any other value (missing values included) is returned as is.
    """
    for label, rank in _EDUCATION_RANKS:
        if state == label:
            return rank
    # Unknown labels and missing values pass through untouched.
    return state

# Replace the education labels in the training frame with their ordinal ranks.
train['EducationLevel'] = train['EducationLevel'].map(educgen)

In [26]:
# Look at the rows where CreditScore is missing to see what the gaps look like.
missing_credit_score = train['CreditScore'].isna()
train.loc[missing_credit_score]

Unnamed: 0,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,NumberOfOpenCreditLines,NumberOfCreditInquiries,...,MaritalStatus_Widowed,HomeOwnershipStatus_Other,HomeOwnershipStatus_Own,HomeOwnershipStatus_Rent,LoanPurpose_Debt Consolidation,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,EmploymentStatus_Self-Employed,EmploymentStatus_Unemployed
13,27.0,15000.0,,,48.0,5.0,,0.130650,4.0,0.0,...,False,True,False,False,False,False,False,False,False,False
17,43.0,43876.0,,,24.0,3.0,,0.339279,7.0,0.0,...,False,False,False,False,False,False,False,False,False,False
25,56.0,15000.0,,,12.0,1.0,,0.261838,5.0,2.0,...,False,False,True,False,False,False,False,False,False,False
80,44.0,136297.0,,,36.0,0.0,,0.377793,5.0,1.0,...,False,False,False,True,False,False,False,False,False,False
95,23.0,203619.0,,,24.0,0.0,,0.429417,2.0,1.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10925,51.0,15000.0,,,36.0,0.0,,0.448984,1.0,4.0,...,False,False,True,False,False,False,False,False,False,False
10949,27.0,300000.0,,,72.0,2.0,,0.479158,3.0,1.0,...,False,True,False,False,False,False,False,False,False,False
10986,36.0,15000.0,,,60.0,0.0,,0.557669,3.0,1.0,...,False,False,False,False,False,False,False,False,False,False
11008,37.0,300000.0,,,60.0,3.0,,0.320043,0.0,3.0,...,False,False,False,True,False,False,False,False,True,False


In [8]:
# Rows with gaps are dropped outright: the missing values are not isolated,
# several fields are empty in the same row.
train = train.dropna()

In [9]:
# Split the cleaned frame into the target vector y and the feature matrix X.
target_column = "RiskScore"
y = train[target_column]
X = train.drop(columns=[target_column])

In [10]:
# List the feature pairs whose pairwise correlation is at least 0.9; entries
# exactly equal to 1 (such as the diagonal) are excluded.
corr_matrix = X.corr()
strong_mask = corr_matrix.ge(0.9) & corr_matrix.ne(1.)
high_corr = corr_matrix.where(strong_mask).stack()
print(high_corr)

Age               Experience          0.982958
AnnualIncome      MonthlyIncome       0.984978
TotalAssets       NetWorth            0.993996
MonthlyIncome     AnnualIncome        0.984978
Experience        Age                 0.982958
NetWorth          TotalAssets         0.993996
BaseInterestRate  InterestRate        0.975344
InterestRate      BaseInterestRate    0.975344
dtype: float64


In [11]:
# Drop one column from each highly correlated pair so that the model can do
# without regularization.
corr_columns = ['Experience', 'AnnualIncome', 'TotalAssets', 'BaseInterestRate']
X = X.drop(columns=corr_columns)

FEATURE ENGINEERING

In [12]:
# Standardize the float/int columns (columns of other dtypes are not selected
# and are only cast to float64), then fit a plain linear regression on the
# standardized features.
nums = X.select_dtypes(include=['float64', 'int32', 'int64']).columns
scaler = StandardScaler()
X[nums] = scaler.fit_transform(X[nums])
X = X.astype('float64')

model = LinearRegression().fit(X, y)

# Find the most influential features: column positions ordered by decreasing
# absolute value of the fitted coefficient.
coeffs = model.coef_
features = np.abs(coeffs)
ranked = np.flip(np.argsort(features))
print(ranked)

[25  1 37 10 36  9 17 21 24  6 13 11 22 23  2 30 31 33  5 16 26 20 28  3
 27 29 34  8 18 19  7  0 15 35 32 12 14  4]


In [13]:
# Rebuild X and y from the cleaned training frame (the ranking cell overwrote X
# with standardized values) and leave out the target and the correlated columns.
y = train["RiskScore"]
corr_columns = ['Experience', 'AnnualIncome', 'TotalAssets', 'BaseInterestRate']
X = train.drop(columns=['RiskScore'] + corr_columns)

In [14]:
# Hand-made polynomial features built from the most influential columns; a
# linear regression cannot construct such terms by itself.
# The new columns are appended in the order new1 .. new21.
age = X['Age']
credit_score = X['CreditScore']
monthly_income = X['MonthlyIncome']
loan_amount = X['LoanAmount']
credit_history = X['LengthOfCreditHistory']
net_worth = X['NetWorth']
debt_to_income = X['DebtToIncomeRatio']

X = X.assign(
    # Interactions: age x credit score, monthly income x loan amount,
    # credit history length x age.
    new1=age * credit_score,
    new2=monthly_income * loan_amount,
    new3=credit_history * age,
    # Ratios: savings relative to the loan, loan relative to net worth.
    new4=X['SavingsAccountBalance'] / loan_amount,
    new5=loan_amount / net_worth,
    # Flag: debt takes more than half of the income.
    new6=debt_to_income > 0.5,
    # Squares of the most influential features.
    new7=credit_score * credit_score,
    new8=monthly_income * monthly_income,
    new9=debt_to_income * debt_to_income,
    new10=credit_history * credit_history,
    new11=X['TotalLiabilities'] * X['TotalLiabilities'],
    new12=net_worth * net_worth,
    new13=X['TotalDebtToIncomeRatio'] * X['TotalDebtToIncomeRatio'],
    new14=X['CheckingAccountBalance'] * X['CheckingAccountBalance'],
    new15=X['NumberOfDependents'] * X['NumberOfDependents'],
    # Higher powers of the very strongest features. The original note calls
    # these cubes, but new18 is NetWorth to the power 1.5, not 3.
    new16=credit_score * credit_score * credit_score,
    new17=debt_to_income * debt_to_income * debt_to_income,
    new18=net_worth * np.sqrt(net_worth),
    new19=monthly_income * monthly_income * monthly_income,
    # Logarithms of important long-tailed features.
    # NOTE(review): presumably CreditScore and NetWorth are strictly positive;
    # otherwise log/sqrt yield NaN or -inf here - confirm against the data.
    new20=np.log(credit_score),
    new21=np.log(net_worth),
)

НОРМАЛИЗАЦИЯ

In [15]:
# Standardize only the numeric (float/int) columns; columns of other dtypes
# are not selected and are merely cast to float64 afterwards.
nums = X.select_dtypes(include=['float64', 'int32', 'int64']).columns
scaler = StandardScaler().fit(X[nums])
X[nums] = scaler.transform(X[nums])
X = X.astype('float64')

ОБУЧЕНИЕ

In [16]:
# Fit a linear regression on the engineered, standardized training features.
model = LinearRegression()
model.fit(X=X, y=y)

СОЗДАНИЕ ФАЙЛА

In [17]:
# Build the submission: apply to the test set the same preprocessing that the
# training data went through, predict with the fitted model and write the CSV.
# This cell relies on objects created by earlier cells: `educgen`, the final
# training matrix `X`, the `scaler` fitted in the normalization cell together
# with its column list `nums`, and the trained `model`.
test_raw = pd.read_csv('/kaggle/input/mai-ml-contest-1/test.csv')

# Keep the real identifiers for the submission instead of assuming 0..n-1.
test_ids = test_raw['ID'].to_numpy()

X_test = test_raw.drop(columns=['ApplicationDate', 'ID'])
X_test = pd.get_dummies(
    X_test,
    columns=['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus'],
    drop_first=True,
)
X_test['EducationLevel'] = X_test['EducationLevel'].apply(educgen)

# Remove the correlated columns before scaling, exactly as for the training
# data, so the scaler sees the same set of columns it was fitted on.
corr_columns = ['Experience', 'AnnualIncome', 'TotalAssets', 'BaseInterestRate']
X_test = X_test.drop(columns=corr_columns)

# Engineered features; the formulas must mirror the training feature cell.
X_test['new1'] = X_test['Age'] * X_test['CreditScore']
X_test['new2'] = X_test['MonthlyIncome'] * X_test['LoanAmount']
X_test['new3'] = X_test['LengthOfCreditHistory'] * X_test['Age']
X_test['new4'] = X_test['SavingsAccountBalance'] / X_test['LoanAmount']
X_test['new5'] = X_test['LoanAmount'] / X_test['NetWorth']
X_test['new6'] = X_test['DebtToIncomeRatio'] > 0.5

squared_sources = {
    'new7': 'CreditScore',
    'new8': 'MonthlyIncome',
    'new9': 'DebtToIncomeRatio',
    'new10': 'LengthOfCreditHistory',
    'new11': 'TotalLiabilities',
    'new12': 'NetWorth',
    'new13': 'TotalDebtToIncomeRatio',
    'new14': 'CheckingAccountBalance',
    'new15': 'NumberOfDependents',
}
for new_name, source in squared_sources.items():
    X_test[new_name] = X_test[source] * X_test[source]

cubed_sources = {
    'new16': 'CreditScore',
    'new17': 'DebtToIncomeRatio',
    'new19': 'MonthlyIncome',
}
for new_name, source in cubed_sources.items():
    X_test[new_name] = X_test[source] * X_test[source] * X_test[source]

X_test['new18'] = X_test['NetWorth'] * np.sqrt(X_test['NetWorth'])
X_test['new20'] = np.log(X_test['CreditScore'])
X_test['new21'] = np.log(X_test['NetWorth'])

# Give the test matrix exactly the training columns, in the training order.
# A dummy column absent from the test data (category not present there) is
# filled with 0; a column the model never saw is discarded.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Standardize with the statistics learned on the TRAINING data. Fitting a new
# scaler on the test set would put the test features on a different scale
# than the one the model was trained on.
X_test[nums] = scaler.transform(X_test[nums])
X_test = X_test.astype('float64')

y_pred = model.predict(X_test)
df = pd.DataFrame({
    'ID': test_ids,
    'RiskScore': y_pred
})

df.to_csv('submission.csv', index=False)
