# Предобработка данных для обучения

### Загрузка данных и первоначальная обработка

In [None]:
import pandas as pd
import pyarrow.parquet as pq
from collections import Counter

# Read the training parquet file into pandas and discard two feature columns
# right away.
df_train = (
    pq.read_table("train_ai_comp_final_dp.parquet")
    .to_pandas()
    .drop(columns=["feature642", "feature756"])
)

# Target vector and feature matrix; "id", "sample_ml_new" and the target
# itself are removed from the features.
y = df_train["target"]
X = df_train.drop(columns=["id", "sample_ml_new", "target"])

# Class counts before any resampling is applied.
print("До сэмплирования:", Counter(y))

### Сэмплирование данных

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Resampling pipeline: SMOTE over-sampling followed by random under-sampling.
# sampling_strategy=1 asks SMOTE for a 1:1 minority/majority ratio (the previous
# comment said 50%, which did not match the code). With the classes already
# balanced, the under-sampler with ratio 1.0 should have nothing to remove.
# NOTE(review): if the intent was "over-sample to 50%, then under-sample to 1:1",
# the SMOTE ratio would have to be 0.5 - confirm before changing it.
# Both samplers are seeded so the resampling is reproducible, like the split below.
pipeline = Pipeline([
    ('oversample', SMOTE(sampling_strategy=1, random_state=42)),
    ('undersample', RandomUnderSampler(sampling_strategy=1.0, random_state=42)),
])

# Hold out 20% of the rows as a validation set (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Resample the training split only; the validation split keeps its original
# class distribution.
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)

# Class counts after resampling.
print("После сэмплирования:", Counter(y_resampled))

# Обучение модели градиентного бустинга

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
import numpy as np

# --- Training on the full feature set ---------------------------------------

# Wrap the resampled training split and the untouched validation split.
dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dval = xgb.DMatrix(X_val, label=y_val)

# Booster hyper-parameters.
params = {
    'max_depth': 6,
    'eta': 0.01,
    'objective': 'binary:logistic',
    'eval_metric': ['logloss', 'auc'],
    'gamma': 1,
    'reg_alpha': 0.4,
    'reg_lambda': 0.4
}

# Upper bound on boosting rounds; early stopping may end training sooner.
num_rounds = 10000

# Train with every feature. `evals` is passed by keyword because newer XGBoost
# releases deprecate passing it positionally. Per the XGBoost docs, early
# stopping watches the last metric ('auc') on the last evals entry ('test').
# NOTE(review): xgb.train returns the model from the last round, not the best
# one - confirm whether predictions should be limited to bst.best_iteration.
bst = xgb.train(
    params,
    dtrain,
    num_rounds,
    evals=[(dtrain, 'train'), (dval, 'test')],
    early_stopping_rounds=10,
)

# Evaluate on the validation split: threshold the predicted probabilities at
# 0.5 in one vectorised step (no loop variable that reuses the name of the
# global target `y`).
val_proba = bst.predict(dval)
y_pred = (val_proba > 0.5).astype(int)
accuracy = accuracy_score(y_val, y_pred)
print("Метрики для обучения на всех данных:")
print(f"Точность: {accuracy}")
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Persist the full-feature model.
bst.save_model('xgboost_before_del_f.model')

# Feature importance measured as the number of splits that use each feature.
# Features that never appear in a split are absent from this dict.
importances = bst.get_score(importance_type='weight')
# Cut-off: the 25th percentile (first quartile) of the importance scores.
threshold = np.percentile(list(importances.values()), 25)

# Keep the features whose importance reaches the cut-off.
features_to_keep = [name for name, score in importances.items() if score >= threshold]

# Restrict both splits to the selected features. The training side uses the
# resampled data, exactly like the first model, so that the before/after
# comparison reflects only the feature filtering. (Previously the raw,
# un-resampled X_train/y_train were used here.)
X_train_filtered = X_resampled[features_to_keep]
X_test_filtered = X_val[features_to_keep]

# DMatrix objects for the reduced feature set.
dtrain_filtered = xgb.DMatrix(X_train_filtered, label=y_resampled)
dtest_filtered = xgb.DMatrix(X_test_filtered, label=y_val)

# Retrain with the same hyper-parameters and early-stopping setup; `evals` is
# passed by keyword because newer XGBoost releases deprecate the positional form.
bst_filtered = xgb.train(
    params,
    dtrain_filtered,
    num_rounds,
    evals=[(dtrain_filtered, 'train'), (dtest_filtered, 'test')],
    early_stopping_rounds=10,
)

# Predict on the validation split and threshold the probabilities at 0.5.
val_proba_filtered = bst_filtered.predict(dtest_filtered)
y_pred_filtered = (val_proba_filtered > 0.5).astype(int)

# Same metrics as for the full-feature model.
accuracy_filtered = accuracy_score(y_val, y_pred_filtered)
print("\nМетрики для обучения на отфильтрованных данных:")
print(f"Точность: {accuracy_filtered}")
print(confusion_matrix(y_val, y_pred_filtered))
print(classification_report(y_val, y_pred_filtered))

# Report which features were kept.
print("\nОтфильтрованные признаки:")
print(features_to_keep)
print(f"Количество отфильтрованных признаков: {len(features_to_keep)}")

# Persist the reduced-feature model.
bst_filtered.save_model('xgboost_after_del_f.model')

## Проверка модели

In [None]:
# Location of the test data.
file_path = "test_sber.parquet"

# Restore the reduced-feature booster straight from its saved file.
loaded_model = xgb.Booster(model_file='xgboost_after_del_f.model')

# Read the test table into pandas.
df = pq.read_table(file_path).to_pandas()

# Columns selected from the test table: the 'id' key followed by the feature
# columns that are passed to the loaded model.
# NOTE(review): presumably a pasted copy of the `features_to_keep` list printed
# by the training cell - it has to match the features the saved model was
# trained on; verify and regenerate after any retrain.
feature_fil = ['id', 'feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'feature8', 'feature9',
               'feature12', 'feature16', 'feature17', 'feature18', 'feature19', 'feature21', 'feature22', 'feature24',
               'feature25', 'feature30', 'feature31', 'feature32', 'feature33', 'feature34', 'feature35', 'feature37',
               'feature38', 'feature40', 'feature41', 'feature43', 'feature44', 'feature45', 'feature46', 'feature47',
               'feature48', 'feature49', 'feature50', 'feature51', 'feature52', 'feature53', 'feature54', 'feature55',
               'feature56', 'feature57', 'feature59', 'feature62', 'feature63', 'feature64', 'feature67', 'feature71',
               'feature72', 'feature73', 'feature74', 'feature75', 'feature76', 'feature77', 'feature79', 'feature81',
               'feature83', 'feature84', 'feature86', 'feature88', 'feature89', 'feature90', 'feature91', 'feature93',
               'feature94', 'feature95', 'feature96', 'feature97', 'feature98', 'feature99', 'feature100', 'feature101',
               'feature102', 'feature103', 'feature104', 'feature105', 'feature106', 'feature107', 'feature108',
               'feature109', 'feature110', 'feature111', 'feature112', 'feature113', 'feature114', 'feature115',
               'feature116', 'feature117', 'feature118', 'feature119', 'feature120', 'feature121', 'feature122',
               'feature123', 'feature124', 'feature125', 'feature126', 'feature127', 'feature128', 'feature129',
               'feature130', 'feature132', 'feature133', 'feature134', 'feature135', 'feature136', 'feature137',
               'feature138', 'feature139', 'feature141', 'feature142', 'feature143', 'feature145', 'feature147',
               'feature148', 'feature149', 'feature150', 'feature151', 'feature152', 'feature153', 'feature154',
               'feature155', 'feature156', 'feature157', 'feature158', 'feature159', 'feature161', 'feature162',
               'feature163', 'feature164', 'feature165', 'feature166', 'feature167', 'feature168', 'feature169',
               'feature170', 'feature171', 'feature172', 'feature173', 'feature174', 'feature175', 'feature176',
               'feature177', 'feature178', 'feature179', 'feature180', 'feature181', 'feature182', 'feature183',
               'feature184', 'feature185', 'feature186', 'feature187', 'feature188', 'feature189', 'feature190',
               'feature191', 'feature192', 'feature193', 'feature194', 'feature195', 'feature196', 'feature197',
               'feature198', 'feature199', 'feature200', 'feature201', 'feature204', 'feature206', 'feature207',
               'feature208', 'feature209', 'feature210', 'feature212', 'feature214', 'feature217', 'feature218',
               'feature219', 'feature220', 'feature222', 'feature226', 'feature231', 'feature234', 'feature238',
               'feature252', 'feature253', 'feature257', 'feature260', 'feature262', 'feature263', 'feature264',
               'feature265', 'feature268', 'feature269', 'feature270', 'feature275', 'feature277', 'feature280',
               'feature282', 'feature283', 'feature284', 'feature286', 'feature287', 'feature288', 'feature290',
               'feature291', 'feature296', 'feature299', 'feature300', 'feature303', 'feature304', 'feature305',
               'feature308', 'feature309', 'feature310', 'feature313', 'feature315', 'feature316', 'feature317',
               'feature318', 'feature319', 'feature320', 'feature322', 'feature328', 'feature330', 'feature331',
               'feature332', 'feature334', 'feature335', 'feature336', 'feature338', 'feature339', 'feature340',
               'feature341', 'feature342', 'feature343', 'feature344', 'feature345', 'feature346', 'feature347',
               'feature348', 'feature349', 'feature350', 'feature351', 'feature352', 'feature353', 'feature354',
               'feature355', 'feature356', 'feature357', 'feature358', 'feature359', 'feature360', 'feature361',
               'feature362', 'feature366', 'feature367', 'feature368', 'feature369', 'feature370', 'feature371',
               'feature373', 'feature374', 'feature375', 'feature376', 'feature377', 'feature378', 'feature379',
               'feature383', 'feature384', 'feature385', 'feature386', 'feature395', 'feature396', 'feature398',
               'feature401', 'feature402', 'feature405', 'feature409', 'feature411', 'feature412', 'feature414',
               'feature415', 'feature416', 'feature417', 'feature421', 'feature422', 'feature426', 'feature427',
               'feature428', 'feature429', 'feature432', 'feature433', 'feature434', 'feature435', 'feature436',
               'feature437', 'feature438', 'feature440', 'feature441', 'feature442', 'feature443', 'feature444',
               'feature445', 'feature446', 'feature447', 'feature448', 'feature449', 'feature450', 'feature451',
               'feature452', 'feature453', 'feature454', 'feature455', 'feature456', 'feature457', 'feature458',
               'feature459', 'feature460', 'feature461', 'feature462', 'feature464', 'feature465', 'feature467',
               'feature468', 'feature469', 'feature470', 'feature472', 'feature473', 'feature474', 'feature475',
               'feature476', 'feature477', 'feature479', 'feature482', 'feature485', 'feature486', 'feature487',
               'feature488', 'feature489', 'feature491', 'feature493', 'feature494', 'feature495', 'feature497',
               'feature499', 'feature500', 'feature501', 'feature502', 'feature503', 'feature504', 'feature505',
               'feature506', 'feature507', 'feature508', 'feature509', 'feature510', 'feature512', 'feature513',
               'feature514', 'feature515', 'feature516', 'feature517', 'feature518', 'feature520', 'feature523',
               'feature524', 'feature525', 'feature526', 'feature527', 'feature528', 'feature529', 'feature530',
               'feature531', 'feature532', 'feature533', 'feature534', 'feature535', 'feature537', 'feature538',
               'feature539', 'feature540', 'feature541', 'feature542', 'feature543', 'feature544', 'feature545',
               'feature546', 'feature547', 'feature548', 'feature550', 'feature551', 'feature553', 'feature554',
               'feature557', 'feature559', 'feature560', 'feature561', 'feature564', 'feature569', 'feature572',
               'feature588', 'feature591', 'feature597', 'feature603', 'feature605', 'feature609', 'feature615',
               'feature620', 'feature624', 'feature626', 'feature632', 'feature646', 'feature647', 'feature651',
               'feature652', 'feature653', 'feature654', 'feature655', 'feature656', 'feature664', 'feature665',
               'feature668', 'feature675', 'feature676', 'feature677', 'feature680', 'feature687', 'feature688',
               'feature689', 'feature690', 'feature691', 'feature693', 'feature695', 'feature698', 'feature703',
               'feature712', 'feature713', 'feature714', 'feature715', 'feature716', 'feature721', 'feature726',
               'feature727', 'feature732', 'feature733', 'feature735', 'feature736', 'feature741', 'feature742',
               'feature745', 'feature749', 'feature750', 'feature751', 'feature753', 'feature754', 'feature755',
               'feature757', 'feature758', 'feature759', 'feature760', 'feature762', 'feature763', 'feature776',
               'feature777', 'feature781', 'feature782', 'feature783', 'feature784', 'feature787', 'feature788',
               'feature790', 'feature791', 'feature792', 'feature793', 'feature794', 'feature795', 'feature799',
               'feature800', 'feature805', 'feature810', 'feature811', 'feature812', 'feature813', 'feature814',
               'feature815', 'feature817', 'feature820', 'feature826', 'feature829', 'feature830', 'feature831',
               'feature842', 'feature849', 'feature850', 'feature853', 'feature854', 'feature856', 'feature857',
               'feature858', 'feature859', 'feature860', 'feature861', 'feature862', 'feature863', 'feature864',
               'feature865', 'feature867', 'feature868', 'feature869', 'feature870', 'feature871', 'feature872',
               'feature873', 'feature874', 'feature875', 'feature876', 'feature877', 'feature878', 'feature879',
               'feature887', 'feature888', 'feature890', 'feature891', 'feature892', 'feature893', 'feature894',
               'feature896', 'feature897', 'feature898', 'feature899', 'feature900', 'feature901', 'feature907',
               'feature908', 'feature909', 'feature911', 'feature913', 'feature915', 'feature916', 'feature917',
               'feature918', 'feature919', 'feature920', 'feature921', 'feature922', 'feature923', 'feature924',
               'feature925', 'feature927', 'feature928', 'feature930', 'feature932', 'feature933', 'feature934',
               'feature935', 'feature936', 'feature937', 'feature938', 'feature939', 'feature940', 'feature941',
               'feature942', 'feature943', 'feature944', 'feature945', 'feature946', 'feature947', 'feature948',
               'feature949', 'feature950', 'feature951', 'feature952', 'feature953', 'feature954', 'feature985',
               'feature986', 'feature987', 'feature988', 'feature989', 'feature990', 'feature991', 'feature992',
               'feature993', 'feature994', 'feature995', 'feature996', 'feature997', 'feature998', 'feature999',
               'feature1000', 'feature1001', 'feature1002', 'feature1003', 'feature1004', 'feature1035', 'feature1036',
               'feature1038', 'feature1039', 'feature1043', 'feature1055', 'feature1056', 'feature1057', 'feature1059',
               'feature1063', 'feature1064', 'feature1065', 'feature1066', 'feature1067', 'feature1068', 'feature1069']

# Narrow the test table down to the id column plus the model's features.
print("Идет удаление ненужных признаков...")
df_test = df[feature_fil]
print("Удаление завершено\n")

# The id column is not a model input, so it is left out of the DMatrix.
X_test = df_test.drop(columns="id")
dtest = xgb.DMatrix(X_test)

# Predicted probabilities and their rounded (binary) counterparts.
predictions = loaded_model.predict(dtest)
rounded_predictions = list(map(round, predictions))

# Summary: how many rows were labelled 1, and how many rows in total.
count_of_1 = sum(rounded_predictions)
print("Количество 1: ", count_of_1)
print("Количество всего: ", len(predictions))

# Assemble id, binary label and probability into one table.
submission_columns = {
    'id': df_test['id'],
    'target_bin': rounded_predictions,
    'target_prob': predictions,
}
df_predictions = pd.DataFrame(submission_columns)

# Write the predictions to CSV without the index column.
df_predictions.to_csv('test.csv', index=False)
