In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb


from tqdm import tqdm


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
train_path = '/content/drive/MyDrive/Data/AlphaHac/train.parquet'
test_path = '/content/drive/MyDrive/Data/AlphaHac/test.parquet'
sample_submission_path = '/content/drive/MyDrive/Data/AlphaHac/sample_submission.csv'

train = pd.read_parquet(train_path).drop('id', axis=1).drop_duplicates()
test = pd.read_parquet(test_path).drop('id', axis=1)
sample_submission = pd.read_csv(sample_submission_path)

In [14]:
drop_list = ['city', 'index_city_code', 'branch_code', 'cnt_days_cred_f_oper_1m', 'cnt_a_oper_1m', 'cnt_a_oper_3m', 'cnt_days_cred_g_oper_1m', 'cnt_deb_d_oper_1m', 'cnt_days_cred_g_oper_3m']

fill_na_list = ['max_end_plan_non_fin_deals', 'max_start_fin_deals', 'max_start_non_fin_deals', 'min_end_fact_fin_deals', 'min_start_fin_deals', 'min_start_non_fin_deals', 'max_founderpres', 'min_founderpres', 'max_end_fact_fin_deals', 'min_end_plan_non_fin_deals']

In [15]:
train = train.drop(drop_list, axis=1)
test = test.drop(drop_list, axis=1)

for ft in fill_na_list:
  train.loc[train[ft].isna(), ft] = 0
  test.loc[train[ft].isna(), ft] = 0


In [16]:
train = train[(train['rko_start_months'] > train['rko_start_months'].min() + 1) & (train['balance_amt_min'] > train['balance_amt_min'].min() + 1)]

In [17]:
cities_types = ['3597', '1252', '727', '5418', '3844']

train.loc[~train['city_type'].isin(cities_types), 'city_type'] = 0
test.loc[~test['city_type'].isin(cities_types), 'city_type'] = 0


In [18]:
channel_code_types = ['7', '4', '30', '26', '32', '40', '34', '33', '10', '37', '31',
       '48', '29', '2', '27', '11', '46', '22', '18', '20']

train.loc[~train['channel_code'].isin(channel_code_types), 'channel_code'] = 0
test.loc[~test['channel_code'].isin(channel_code_types), 'channel_code'] = 0

In [19]:
test['cnt_days_deb_g_oper_1m'] = test['cnt_days_deb_g_oper_1m'] + 1

test['cnt_days_deb_f_oper_3m'] = test['cnt_days_deb_f_oper_3m'] + 1
test['cnt_days_cred_h_oper_1m'] = test['cnt_days_cred_h_oper_1m'] + 2.57

test['cnt_days_deb_f_oper_1m'] = test['cnt_days_deb_f_oper_1m'] + 1
test['cnt_days_cred_f_oper_3m'] = test['cnt_days_cred_f_oper_3m'] + 1

In [20]:
cat_cols = ['channel_code', 'city_type', 'ogrn_month', 'ogrn_year', 'okved', 'segment']

train[cat_cols] = train[cat_cols].astype("category")
test[cat_cols] = test[cat_cols].astype("category")


In [21]:
params1 = {'n_estimators': 1000, 'reg_alpha': 0.7739759075026694, 'reg_lambda': 0.08210382093539982, 'learning_rate': 0.01921917403776698, 'max_depth': 8}
params2 = {'n_estimators': 700, 'reg_alpha': 0.0962, 'reg_lambda': 0.0984, 'learning_rate': 0.0157, 'max_depth': 8}

In [22]:
model1 = lgb.LGBMClassifier(verbose=-1, random_state=42, **params1)
model1.fit(train.drop(['target_1', 'target_2', 'total_target'], axis=1), train['target_1'])


In [23]:
model2 = lgb.LGBMClassifier(verbose=-1, random_state=42, **params2)

model2.fit(train.drop(['target_1', 'target_2', 'total_target'], axis=1), train['target_2'])


In [24]:
y_test_1 = model1.predict_proba(test)[:, 1]
y_test_2 = model2.predict_proba(test)[:, 1]

y_res = y_test_1 + y_test_2 - y_test_1 * y_test_2

In [25]:
sample_submission["score"] = y_res
sample_submission.head()
sample_submission.to_csv("my_submission.csv", index=False)

In [26]:
sample_submission

Unnamed: 0,id,score
0,360000,0.035753
1,360001,0.077113
2,360002,0.137808
3,360003,0.098242
4,360004,0.065622
...,...,...
99995,459995,0.047669
99996,459996,0.035160
99997,459997,0.010140
99998,459998,0.030315
