In [None]:
# ! pip install catboost==1.0.4

In [None]:
import numpy as np
import pandas as pd
import catboost
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Load the three competition CSVs: the labelled training rows, the rows to
# score, and the submission template that is filled in further down.
train, test, submission = map(pd.read_csv, (
    '../input/mlda-2022-classification-competition/train.csv',
    '../input/mlda-2022-classification-competition/test.csv',
    '../input/mlda-2022-classification-competition/submission.csv',
))

In [None]:
# Raw categorical columns of the input data. Missing values in every one of
# these are replaced by the string 'other' in make_features.
all_cat_features = [
    'loan_limit',
    'Gender',
    'approv_in_adv',
    'loan_type',
    'loan_purpose',
    'Credit_Worthiness',
    'open_credit',
    'business_or_commercial',
    'Neg_ammortization',
    'interest_only',
    'lump_sum_payment',
    'occupancy_type',
    'total_units',
    'credit_type',
    'co-applicant_credit_type',
    'age',
    'submission_of_application',
    'Region',
    'construction_type',
    'Secured_by',
    'Security_Type',
]

# Categorical inputs of the model: the raw categorical columns minus the four
# that are not passed to the model, followed by the 0/1 flag columns created
# in make_features. The order matters because the model is told which
# columns are categorical by position.
cat_features = [
    column
    for column in all_cat_features
    if column not in {
        'business_or_commercial',
        'construction_type',
        'Secured_by',
        'Security_Type',
    }
] + [
    'income_unk',
    'poor_score',
    'fair_score',
    'good_score',
    'very_good_score',
    'excellent_score',
    'unknown_region',
]

# Numeric inputs of the model (log-scale columns created in make_features,
# plus the raw credit score).
num_features = [
    'loan_amount_ln',
    'term_ln',
    'income_ln',
    'Credit_Score',
    'loan_income_ratio_ln',
    'loan_amount_per_month_ln',
    'monthly_loan_income_ratio_ln',
]

In [None]:
def make_features(df):
    """Add the engineered columns to ``df`` in place.

    Steps, in order:

    * fill missing values in every ``all_cat_features`` column with
      ``'other'``;
    * build the boolean ``heuristic`` column, true for rows that match any
      of the hand-written rules below;
    * add ``income_unk`` (1 where income was missing), then fill missing
      ``income`` with 0 and missing ``term`` with 360;
    * add ``log1p`` versions of income, term and loan amount, and their
      differences (log-scale ratios of the ``1 + x`` values);
    * add one 0/1 flag per ``Credit_Score`` band and the ``unknown_region``
      flag.

    The function may be called more than once on the same frame (for
    example when a notebook cell is re-run) and yields the same columns each
    time: rows whose income was missing on the first call are remembered
    through ``income_unk`` so that the 0 written into ``income`` is not
    later mistaken for a genuine zero income.

    Args:
        df: DataFrame holding the ``all_cat_features`` columns plus
            ``income``, ``term``, ``loan_amount`` and ``Credit_Score``.
            It is modified in place.

    Returns:
        None.
    """
    df.loc[:, all_cat_features] = df[all_cat_features].fillna('other')

    # Rows with no income in the raw data. On a repeated call the NaNs have
    # already been replaced by 0, so fall back on the flag stored earlier.
    income_missing = df['income'].isna()
    if 'income_unk' in df.columns:
        income_missing = income_missing | (df['income_unk'] == 1)

    df['heuristic'] = (
        # A reported income of exactly 0 (a missing income does not count).
        ((df['income'] == 0) & ~income_missing)
        | (df['construction_type'] == 'mh')
        | (df['Secured_by'] == 'land')
        | (df['credit_type'] == 'EQUI')
        # NOTE(review): 'Indriect' is presumably the spelling used in the
        # data itself -- confirm before "correcting" it.
        | (df['Security_Type'] == 'Indriect')
        # 'other' marks a value that was missing before the fill above.
        | (df['age'] == 'other')
        | (df['submission_of_application'] == 'other')
        | ((df['loan_limit'] == 'other') & (df['Neg_ammortization'] == 'neg_amm'))
        | ((df['Credit_Worthiness'] == 'l2') & (df['Neg_ammortization'] == 'neg_amm') & (df['open_credit'] == 'nopc'))
    )

    df['income_unk'] = income_missing.astype(int)
    df['income'] = df['income'].fillna(0.0)
    df['income_ln'] = np.log1p(df['income'])
    df['term'] = df['term'].fillna(360.0)
    df['term_ln'] = np.log1p(df['term'])
    df['loan_amount_ln'] = np.log1p(df['loan_amount'])

    # Differences of log1p values, i.e. log((1 + a) / (1 + b)).
    df['loan_amount_per_month_ln'] = df['loan_amount_ln'] - df['term_ln']
    df['loan_income_ratio_ln'] = df['loan_amount_ln'] - df['income_ln']
    df['monthly_loan_income_ratio_ln'] = df['loan_amount_per_month_ln'] - df['income_ln']

    # Credit-score bands. ``between`` includes both bounds, so the bands
    # tile the range without gaps only for whole-number scores.
    df['poor_score'] = (df['Credit_Score'] < 580).astype(int)
    df['fair_score'] = (df['Credit_Score'].between(580, 669)).astype(int)
    df['good_score'] = (df['Credit_Score'].between(670, 739)).astype(int)
    df['very_good_score'] = (df['Credit_Score'].between(740, 799)).astype(int)
    df['excellent_score'] = (df['Credit_Score'] >= 800).astype(int)

    df['unknown_region'] = ((df['Gender'] == 'Sex Not Available') & (df['Region'] == 'south')).astype(int)

In [None]:
# Engineer the same columns on both frames; make_features works in place.
for frame in (train, test):
    make_features(frame)

In [None]:
# Test rows caught by the heuristic are given Status = 1 directly. They are
# selected by index label rather than by a boolean Series, so this cell can
# be re-run after `test` has been filtered below (a boolean Series must have
# the same index as `submission`, which no longer holds once rows are dropped).
submission.loc[test.index[test['heuristic'].to_numpy()], 'Status'] = 1

# Keep only the rows the heuristic did not catch; the model is fitted on,
# and later scores, these remaining rows.
train = train[~train['heuristic']]
test = test[~test['heuristic']]

### Model

In [None]:
# Gradient-boosted classifier with hand-set hyperparameters and a pinned seed.
model = catboost.CatBoostClassifier(
    random_seed=42,
    iterations=1300,
    learning_rate=0.02,
    depth=7,
    one_hot_max_size=5,
)

# Column layout handed to the model, in this order:
#   1. the categorical columns, unchanged;
#   2. each numeric column cut into 3 quantile bins, one-hot encoded;
#   3. the numeric columns again, unchanged.
# The categorical block must stay first: fit_params below identifies the
# categorical columns by position.
preprocessing_cb = ColumnTransformer(transformers=[
    ('cat', 'passthrough', cat_features),
    (
        'num_bin_q',
        KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile'),
        num_features,
    ),
    ('num', 'passthrough', num_features),
])

pipeline_cb = Pipeline(steps=[
    ('preprocessing', preprocessing_cb),
    ('model', model),
])

# Extra fit argument routed to the 'model' step: positions 0..len(cat_features)-1
# of the transformed matrix are the categorical columns.
fit_params = dict(model__cat_features=np.arange(len(cat_features)))

In [None]:
# Fit on the remaining training rows. The whole frame is passed as X; the
# preprocessing step picks out only the cat_features and num_features
# columns. The categorical-column positions come from the fit_params dict
# defined with the pipeline instead of being spelled out a second time.
pipeline_cb.fit(train, train['Status'], **fit_params)

In [None]:
# Score the test rows that were not handled by the heuristic: take the second
# probability column returned by the pipeline and store it under the matching
# index labels of the submission frame.
status_proba = pipeline_cb.predict_proba(test)[:, 1]
submission.loc[test.index, 'Status'] = status_proba

# Write the completed submission without the index column.
submission.to_csv('submission.csv', index=False)