In [1]:
import os

import pandas as pd

# Project root. The original hard-coded location stays as the default, but it
# can be overridden through the GIVEMESOMECREDIT_DIR environment variable so
# the notebook also runs on other machines. os.path.join(..., '') guarantees a
# trailing separator, which the string concatenations in this notebook rely on.
FOLDER_PATH = os.path.join(
    os.environ.get('GIVEMESOMECREDIT_DIR') or '/Users/artemzmailov/Desktop/GiveMeSomeCredit/',
    '')

# Labelled data: the first CSV column is used as the index, and the target
# column 'SeriousDlqin2yrs' is kept inside train_data_full as well.
train_data_full = pd.read_csv(FOLDER_PATH + 'data/cs-training.csv', index_col = 0)
train_label_full = train_data_full['SeriousDlqin2yrs']

# Second data file, loaded with its 'SeriousDlqin2yrs' column removed.
test_data_full = pd.read_csv(FOLDER_PATH + 'data/cs-test.csv', index_col = 0).drop(columns = ['SeriousDlqin2yrs'])

In [2]:
from sklearn.model_selection import train_test_split

# Stratified, shuffled 80/20 hold-out split with a fixed seed.
# The frame passed as "features" is train_data_full as-is, i.e. it still
# contains the 'SeriousDlqin2yrs' column, so both resulting frames carry it.
(train_data, test_data,
 train_label, test_label) = train_test_split(
    train_data_full, train_label_full,
    random_state = 42,
    shuffle = True,
    stratify = train_label_full,
    test_size = 0.2,
)

In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin



class CreditDataPreprocessor(BaseEstimator, TransformerMixin):
    """Complete feature preprocessing for the credit data frame.

    The transformer learns nothing from the data: ``fit`` is a no-op and
    ``transform`` depends only on the constructor parameters and on constants
    written in the method body. Row filtering is kept separate in
    ``clean_train`` so that it can be applied to training data only.

    Parameters
    ----------
    NumberOfDependents_fill_value : scalar, default 0
        Replacement for missing ``NumberOfDependents``.
    NumberOfDependents_up_threshold : scalar, default 10
        Upper clipping bound for ``NumberOfDependents``.
    MonthlyIncome_fill_value : scalar, default 0
        Replacement for missing ``MonthlyIncome``.
    RevolvingUtilizationOfUnsecuredLines_drop_threshold : scalar, default 2
        ``clean_train`` keeps rows whose utilisation is <= this value.
    age_low_drop_threshold, age_up_drop_threshold : scalar, default 18 / 80
        ``clean_train`` keeps rows whose ``age`` lies in this closed interval.
    DebtRatio_up_threshold : scalar, default 5
        Upper clipping bound for ``DebtRatio``.
    PastDueRiskScore_weights : sequence of 3 numbers, default (1.0, 1.2, 1.3)
        Weights of the 30-59, 60-89 and 90+ days-late counters in
        ``PastDueRiskScore`` (in that order).
    NumberRealEstateLoansOrLines_drop_threshold : scalar, default 20
        ``clean_train`` keeps rows whose real-estate loan count is <= this.
    """

    def __init__(self,
                 NumberOfDependents_fill_value = 0,
                 NumberOfDependents_up_threshold = 10,
                 MonthlyIncome_fill_value = 0,
                 RevolvingUtilizationOfUnsecuredLines_drop_threshold = 2,
                 age_low_drop_threshold = 18,
                 age_up_drop_threshold = 80,
                 DebtRatio_up_threshold = 5,
                 # Immutable default: a list default would be one object shared
                 # by every instance created without an explicit value.
                 PastDueRiskScore_weights = (1.0, 1.2, 1.3),
                 NumberRealEstateLoansOrLines_drop_threshold = 20):

        # Parameters are stored unmodified under their own names so that
        # BaseEstimator.get_params / set_params / clone keep working.
        self.NumberOfDependents_fill_value = NumberOfDependents_fill_value
        self.NumberOfDependents_up_threshold = NumberOfDependents_up_threshold

        self.MonthlyIncome_fill_value = MonthlyIncome_fill_value

        self.RevolvingUtilizationOfUnsecuredLines_drop_threshold = RevolvingUtilizationOfUnsecuredLines_drop_threshold

        self.age_low_drop_threshold = age_low_drop_threshold
        self.age_up_drop_threshold = age_up_drop_threshold

        self.DebtRatio_up_threshold = DebtRatio_up_threshold

        self.PastDueRiskScore_weights = PastDueRiskScore_weights

        self.NumberRealEstateLoansOrLines_drop_threshold = NumberRealEstateLoansOrLines_drop_threshold

    def fit(self, X, y=None):
        """No-op; present for scikit-learn API compatibility. Returns ``self``."""
        return self

    def transform(self, X):
        """Return an engineered copy of ``X``; ``X`` itself is not modified.

        ``X`` must contain the columns read below (``NumberOfDependents``,
        ``MonthlyIncome``, ``RevolvingUtilizationOfUnsecuredLines``,
        ``DebtRatio``, the three past-due counters,
        ``NumberOfOpenCreditLinesAndLoans`` and
        ``NumberRealEstateLoansOrLines``). Any other column is passed through
        unchanged.
        """
        X_copy = X.copy()

        # --- NumberOfDependents: impute, then clip to [0, threshold].
        dependents = X_copy['NumberOfDependents'].fillna(value = self.NumberOfDependents_fill_value)
        X_copy['NumberOfDependents'] = dependents.clip(0, self.NumberOfDependents_up_threshold)

        # --- MonthlyIncome: record missingness before imputing.
        X_copy['MonthlyIncomeIsMissing'] = X_copy['MonthlyIncome'].isna().astype(int)
        X_copy['MonthlyIncome'] = X_copy['MonthlyIncome'].fillna(value = self.MonthlyIncome_fill_value)

        # --- Revolving utilisation: flag values above 1, then clip to [0, 1].
        utilization = X_copy['RevolvingUtilizationOfUnsecuredLines']
        X_copy['RevolvingUtilizationOverOne'] = (utilization > 1).astype(float)
        X_copy['RevolvingUtilizationOfUnsecuredLines'] = utilization.clip(0, 1)

        # --- DebtPayments: DebtRatio * MonthlyIncome, except where income is 0
        # (which includes imputed rows when the fill value is 0); there
        # DebtRatio is taken as-is.
        # NOTE(review): presumably DebtRatio already holds an absolute amount
        # when income is unknown - confirm against the data dictionary.
        income = X_copy['MonthlyIncome']
        debt_ratio = X_copy['DebtRatio']
        X_copy['DebtPayments'] = (debt_ratio * income).where(income != 0, debt_ratio).astype(float)
        X_copy['DebtRatio'] = debt_ratio.clip(0, self.DebtRatio_up_threshold)

        X_copy['DebtPayments_over_10k'] = (X_copy['DebtPayments'] > 10000).astype(float)
        X_copy['DebtPayments'] = X_copy['DebtPayments'].clip(0, 10000)

        X_copy['MonthlyIncome_over_20k'] = (X_copy['MonthlyIncome'] >= 20000).astype(float)
        X_copy['MonthlyIncome'] = X_copy['MonthlyIncome'].clip(0, 20000)

        # --- Past-due counters. The values 96 and 98 in the 30-59 column get
        # their own indicator and override the weighted score.
        # NOTE(review): 96/98 look like special codes rather than counts - confirm.
        past_due_30_59 = X_copy['NumberOfTime30-59DaysPastDueNotWorse']
        is_code_96 = past_due_30_59 == 96
        is_code_98 = past_due_30_59 == 98
        X_copy['Code96'] = is_code_96.astype(float)
        X_copy['Code98'] = is_code_98.astype(float)

        X_copy['PastDueRiskScore'] = (
            self.PastDueRiskScore_weights[0] * past_due_30_59 +
            self.PastDueRiskScore_weights[1] * X_copy['NumberOfTime60-89DaysPastDueNotWorse'] +
            self.PastDueRiskScore_weights[2] * X_copy['NumberOfTimes90DaysLate'])
        X_copy.loc[is_code_96, 'PastDueRiskScore'] = 96
        X_copy.loc[is_code_98, 'PastDueRiskScore'] = 98
        X_copy = X_copy.drop(columns = ['NumberOfTime30-59DaysPastDueNotWorse',
                                        'NumberOfTime60-89DaysPastDueNotWorse',
                                        'NumberOfTimes90DaysLate'])

        # --- Loan counters: flag large values, then clip.
        open_lines = X_copy['NumberOfOpenCreditLinesAndLoans']
        X_copy['NumberOfOpenCreditLinesAndLoans_over_30'] = (open_lines > 30).astype(float)
        X_copy['NumberOfOpenCreditLinesAndLoans'] = open_lines.clip(0, 30)

        estate_loans = X_copy['NumberRealEstateLoansOrLines']
        X_copy['NumberRealEstateLoansOrLines_over_5'] = (estate_loans > 5).astype(float)
        X_copy['NumberRealEstateLoansOrLines'] = estate_loans.clip(0, 5)

        # --- Bucket the (clipped) counters and one-hot encode the buckets.
        # right=False makes the bins left-closed: [0,1), [1,2), [2,6), [6,15),
        # [15,31). With pandas' default right-closed bins every label was
        # shifted by one (a count of 1 was labelled '0_loans', 2 was
        # '1_loans', ...) and a count of 0 fell outside all bins (NaN).
        # The last label is kept verbatim because other code refers to the
        # resulting column name; its bucket covers 15..30.
        consumer_group = pd.cut(X_copy['NumberOfOpenCreditLinesAndLoans'],
                                bins = [0, 1, 2, 6, 15, 31],
                                right = False,
                                labels = [
                                    '0_loans',
                                    '1_loans',
                                    '2-5_loans',
                                    '6-14_loans',
                                    '16-30_loans'
                                ])
        consumer_dummy = pd.get_dummies(consumer_group, prefix='Consumer', drop_first = False).astype('float')

        # Right-closed bins are correct here: (-1,0] -> 0, (0,3] -> 1..3, (3,100] -> 4+.
        estate_group = pd.cut(X_copy['NumberRealEstateLoansOrLines'],
                              bins = [-1, 0, 3, 100],
                              labels = [
                                  '0_loans',
                                  '1-3_loans',
                                  '4+_loans',
                              ])
        estate_dummy = pd.get_dummies(estate_group, prefix='RealEstateLoans', drop_first = False).astype('float')

        X_copy = pd.concat([X_copy, consumer_dummy, estate_dummy], axis = 1)

        # One dummy per group is removed (the category whose rows then have
        # all remaining dummies of that group equal to 0).
        X_copy = X_copy.drop(columns = ['Consumer_6-14_loans',
                                        'RealEstateLoans_0_loans'])

        # Columns computed above but excluded from the final feature set.
        X_copy = X_copy.drop(columns = ['NumberOfOpenCreditLinesAndLoans',
                                        'NumberRealEstateLoansOrLines',
                                        'MonthlyIncomeIsMissing',
                                        'MonthlyIncome_over_20k',
                                        'Consumer_0_loans',
                                        'NumberOfOpenCreditLinesAndLoans_over_30'])

        return X_copy

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)

    def clean_train(self, X, y=None):
        """Drop rows that fall outside the configured thresholds.

        A row is kept when its revolving utilisation and real-estate loan
        count are at or below their drop thresholds and its ``age`` lies in
        ``[age_low_drop_threshold, age_up_drop_threshold]``. Rows where any of
        these comparisons is False (including comparisons against NaN) are
        removed.

        Returns a filtered copy of ``X``; when ``y`` is given, returns the
        tuple ``(X_clean, y_clean)`` filtered with the same boolean mask.
        """
        mask = (
            (X['RevolvingUtilizationOfUnsecuredLines'] <= self.RevolvingUtilizationOfUnsecuredLines_drop_threshold) &
            (X['age'] >= self.age_low_drop_threshold) &
            (X['age'] <= self.age_up_drop_threshold) &
            (X['NumberRealEstateLoansOrLines'] <= self.NumberRealEstateLoansOrLines_drop_threshold)
        )

        X_clean = X[mask].copy()

        if y is not None:
            y_clean = y[mask].copy()
            return X_clean, y_clean

        return X_clean

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler

class CreditScaler(BaseEstimator, TransformerMixin):
    """Scale only the non-boolean columns of a DataFrame.

    Columns listed in ``boolean_columns`` are passed through untouched; every
    other column seen during ``fit`` is scaled with the scaler selected by
    ``scaler_type``.

    Parameters
    ----------
    scaler_type : str, default 'standard'
        Which scikit-learn scaler to use:

        - 'standard': StandardScaler
        - 'robust': RobustScaler
        - 'minmax': MinMaxScaler
        - 'maxabs': MaxAbsScaler

    Attributes set by ``fit``
    -------------------------
    feature_names_in_ : list of column names of the fitted frame.
    n_features_in_ : number of those columns.
    columns_to_scale_ : the subset of columns that gets scaled.
    """

    def __init__(self, scaler_type='standard'):
        # Indicator / dummy columns that must not be scaled.
        self.boolean_columns = [
            'RevolvingUtilizationOverOne',
            'DebtPayments_over_10k',
            'Code96',
            'Code98',
            'NumberRealEstateLoansOrLines_over_5',
            'Consumer_1_loans',
            'Consumer_2-5_loans',
            'Consumer_16-30_loans',
            'RealEstateLoans_1-3_loans',
            'RealEstateLoans_4+_loans'
        ]

        self.scaler_type = scaler_type
        # Built eagerly so that an unknown scaler_type fails at construction.
        self._create_scaler()

        # Fitted attributes (names ending in "_") are deliberately NOT created
        # here: scikit-learn treats the presence of such attributes as the
        # sign that an estimator has been fitted.

    def _create_scaler(self):
        """Instantiate ``self.scaler`` according to ``self.scaler_type``.

        Raises ValueError for an unknown ``scaler_type``.
        """
        if self.scaler_type == 'standard':
            self.scaler = StandardScaler()
        elif self.scaler_type == 'robust':
            self.scaler = RobustScaler()
        elif self.scaler_type == 'minmax':
            self.scaler = MinMaxScaler()
        elif self.scaler_type == 'maxabs':
            self.scaler = MaxAbsScaler()
        else:
            raise ValueError(
                f"Unknown scaler_type: {self.scaler_type}. "
                f"Available: standard, robust, minmax, maxabs"
            )

    def fit(self, X, y=None):
        """Pick the columns to scale (all except the boolean ones) and fit the scaler.

        ``X`` must be a DataFrame. Returns ``self``.
        """
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = len(self.feature_names_in_)

        self.columns_to_scale_ = [
            col for col in self.feature_names_in_
            if col not in self.boolean_columns
        ]

        # Start from a fresh scaler so it always matches the current
        # scaler_type, even if the attribute was changed without set_params.
        self._create_scaler()

        # With nothing to scale the inner scaler is left unfitted; fitting it
        # on a frame with zero columns would raise.
        if self.columns_to_scale_:
            self.scaler.fit(X[self.columns_to_scale_])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the non-boolean columns scaled.

        Raises NotFittedError when called before ``fit``.
        """
        columns = getattr(self, 'columns_to_scale_', None)
        if columns is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CreditScaler instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )

        X_copy = X.copy()

        if columns:
            X_copy[columns] = self.scaler.transform(X_copy[columns])

        return X_copy

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X, y)``."""
        return self.fit(X, y).transform(X, y)

    def get_feature_names_out(self, input_features=None):
        """Return output column names (scikit-learn compatibility).

        ``input_features`` is returned unchanged when given; otherwise the
        column names recorded by ``fit``, or an empty list before fitting.
        """
        if input_features is not None:
            return input_features
        fitted_names = getattr(self, 'feature_names_in_', None)
        return fitted_names if fitted_names is not None else []

    def set_params(self, **params):
        """Set parameters, rebuilding the inner scaler when ``scaler_type`` changes.

        Needed so that tools such as GridSearchCV, which configure estimators
        through ``set_params``, end up with a scaler of the requested type.
        """
        if 'scaler_type' in params and params['scaler_type'] != self.scaler_type:
            self.scaler_type = params['scaler_type']
            self._create_scaler()
        return super().set_params(**params)

In [5]:
# Separate the target from the training split. The guard keeps this cell
# re-runnable: after the first run train_data no longer has the column, and
# the train_label already in the namespace is reused instead of hitting a
# KeyError.
if 'SeriousDlqin2yrs' in train_data.columns:
    train_label = train_data['SeriousDlqin2yrs']
    train_data = train_data.drop(columns = ['SeriousDlqin2yrs']).copy()

credit_transformer = CreditDataPreprocessor()

# Row filtering is applied to the training part only.
train_data_cleaned, train_label_cleaned = credit_transformer.clean_train(train_data, train_label)

credit_transformer.fit(train_data_cleaned)
train_data_transformed = credit_transformer.transform(train_data_cleaned)
# NOTE: test_data is not stripped of 'SeriousDlqin2yrs' in this cell. If the
# column is present it passes through the transformer and the scaler
# unchanged (the scaler only touches columns seen in the training frame) and
# is written to test_data_120.csv together with the features.
test_data_transformed = credit_transformer.transform(test_data)

# The scaler is fitted on the training part and applied to both parts.
credit_scaler_standard = CreditScaler(scaler_type='standard').fit(train_data_transformed)

train_data_scaled = credit_scaler_standard.transform(train_data_transformed)
test_data_scaled = credit_scaler_standard.transform(test_data_transformed)

# Persist the fitted objects and the processed data ("120" artefacts).
import joblib
joblib.dump(credit_transformer, FOLDER_PATH + 'processing/credit_transformer_120.pkl')
joblib.dump(credit_scaler_standard, FOLDER_PATH + 'processing/credit_scaler_standard_120.pkl')

train_data_scaled.to_csv(FOLDER_PATH + 'datasets/train_data_120.csv')
train_label_cleaned.to_csv(FOLDER_PATH + 'datasets/train_label_120.csv')
test_data_scaled.to_csv(FOLDER_PATH + 'datasets/test_data_120.csv')

In [6]:
# Same chain as for the split, but fitted on the complete labelled frame
# ("150" artefacts). Inputs are left untouched; every step gets its own name.
train_label_full = train_data_full.loc[:, 'SeriousDlqin2yrs']
train_full = train_data_full.drop(columns = ['SeriousDlqin2yrs'])

credit_transformer_full = CreditDataPreprocessor()
train_full_cleaned, train_label_full_cleaned = credit_transformer_full.clean_train(
    train_full,
    train_label_full,
)
train_full_transformed = credit_transformer_full.fit_transform(train_full_cleaned)

credit_scaler_standard_full = CreditScaler(scaler_type='standard')
train_full_scaled = credit_scaler_standard_full.fit_transform(train_full_transformed)

# Persist the fitted objects first, then the processed data.
import joblib
for fitted_object, relative_path in (
        (credit_transformer_full, 'processing/credit_transformer_150.pkl'),
        (credit_scaler_standard_full, 'processing/credit_scaler_standard_150.pkl')):
    joblib.dump(fitted_object, FOLDER_PATH + relative_path)

train_full_scaled.to_csv(FOLDER_PATH + 'datasets/train_data_150.csv')
train_label_full_cleaned.to_csv(FOLDER_PATH + 'datasets/train_label_150.csv')

In [7]:
# train_transformer = CreditDataPreprocessor(remove_outliers = False).fit(train_data)
# train_data_transformed = train_transformer.transform(train_data)
# test_data_transformed = train_transformer.transform(test_data)

# joblib.dump(train_transformer, FOLDER_PATH + 'models/credit_preprocessor.pkl')

# y_train = train_data_transformed['SeriousDlqin2yrs']
# y_val = test_data_transformed['SeriousDlqin2yrs']

# X_train = train_data_transformed.drop(columns = ['SeriousDlqin2yrs'])
# X_val = test_data_transformed.drop(columns = ['SeriousDlqin2yrs'])

# X_train_scaled_df, X_val_scaled_df = scaling_function(
#     X_train, 
#     X_val)


In [8]:
# full_transformed = CreditDataPreprocessor(remove_outliers = False).fit(train_data_full)
# train_full_transformed = full_transformed.transform(train_data_full)
# Kaggle_test_full_transformed = full_transformed.transform(test_data_full)

# full_transformed = CreditDataPreprocessor(remove_outliers = True).fit(train_data_full)
# train_full_transformed = full_transformed.transform(train_data_full)

In [9]:
# from sklearn.linear_model import LogisticRegression
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.inspection import permutation_importance



# y_train = train_data_transformed['SeriousDlqin2yrs']
# y_val = test_data_transformed['SeriousDlqin2yrs']

# X_train = train_data_transformed.drop(columns = ['SeriousDlqin2yrs'])
# X_val = test_data_transformed.drop(columns = ['SeriousDlqin2yrs'])

# X_train_scaled_df, X_val_scaled_df = scaling_function(
#     X_train, 
#     X_val)


# scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
# results = {}

# models = {
#     'LogisticRegression': LogisticRegression(
#         max_iter=1000,
#         class_weight = 'balanced',
#         random_state = 42   
#     ),
    
#     'RandomForest': RandomForestClassifier(
#         n_estimators=100, 
#         max_depth=5,
#         class_weight = 'balanced',
#         random_state = 42
#     ),
    
#     'XGBoost': XGBClassifier(
#         n_estimators=100, 
#         max_depth=3,
#         scale_pos_weight = scale_pos_weight,
#         eval_metric = 'logloss',
#         random_state = 42
#     )
# }


# print("ROC-AUC:")
# for model_name, model in models.items():
#     if model_name == 'LogisticRegression':
#         X_train_use = X_train_scaled_df
#         X_val_use = X_val_scaled_df
        
#     else:
#         X_train_use = X_train
#         X_val_use = X_val
        
#     model.fit(X_train_use, y_train)
#     y_pred = model.predict_proba(X_val_use)[:,1]
#     roc_auc = roc_auc_score(y_val, y_pred)
#     results[model_name] = roc_auc
#     print(f'{model_name}: {roc_auc:.4f}')

#  # Permutation Importance

# importance_xgb = XGBClassifier(
#         n_estimators=100, 
#         max_depth=3,
#         scale_pos_weight = scale_pos_weight,
#         eval_metric = 'logloss',
#         random_state = 42)

# importance_xgb.fit(X_train, y_train)
    

# perm_result = permutation_importance(
#     importance_xgb,
#     X_val, y_val,
#     n_repeats = 10,
#     scoring = 'roc_auc',
#     random_state = 42)

# perm_importance_df = pd.DataFrame({
#     'feature': X_val.columns,
#     'perm_importance': perm_result.importances_mean,
#     'perm_std': perm_result.importances_std
# }).sort_values('perm_importance', ascending = False)

# print('Permutation Importance')
# print(perm_importance_df)

In [10]:
# train_label = train_full_transformed['SeriousDlqin2yrs']

# train_full_scaled, Kaggle_test_full_scaled = scaling_function(
#     train_full_transformed.drop(columns = 'SeriousDlqin2yrs'), 
#     Kaggle_test_full_transformed)

# train_full_scaled.to_csv(FOLDER_PATH + 'data/train_full_scaled_pipeline_v1.csv')
# Kaggle_test_full_scaled.to_csv(FOLDER_PATH + 'data/Kaggle_test_full_scaled_pipeline_v1.csv')
# train_label.to_csv(FOLDER_PATH + '/data/train_label_pipeline_v1.csv')