In [5]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm
import lightgbm as lgb 
import optuna

In [6]:
X_train_selected = pd.read_csv('rfe_train.csv')
X_test_selected = pd.read_csv('rfe_test.csv')

In [7]:
X_train_selected.columns = np.arange(0, 26)
X_test_selected.columns = np.arange(0, 26)

X_train_selected = X_train_selected.drop(0, axis=1)
X_test_selected = X_test_selected.drop(0, axis=1)

In [9]:
train_data = pd.read_csv('C:\\Users\\akayo\\Hacks\\Alpha_data\\train_1.csv')

In [17]:
X = train_data.drop(columns=['id', 'smpl', 'target']).values
y = train_data['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
        'num_leaves': trial.suggest_int('num_leaves', 15, 50),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 15),
        'subsample': trial.suggest_uniform('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.8, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 0.05),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 0.05)
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train_selected, y_train)
    y_pred_proba = model.predict_proba(X_test_selected)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    return roc_auc

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
print("Best Parameters:", best_params)

In [19]:
best_model = lgb.LGBMClassifier()
best_model.fit(X_train, y_train)

y_pred_proba = best_model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC AUC with Best Parameters: {roc_auc}')

[LightGBM] [Info] Number of positive: 4125, number of negative: 355111
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.385684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 118911
[LightGBM] [Info] Number of data points in the train set: 359236, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011483 -> initscore=-4.455364
[LightGBM] [Info] Start training from score -4.455364
ROC AUC with Best Parameters: 0.8203087274686309


In [21]:
feature_importances = best_model.feature_importances_

In [49]:
df = pd.DataFrame(feature_importances)
df.columns = ['importance']
df = df.sort_values(by='importance', ascending=False)

df[df['importance'] > 10].shape

(45, 1)