In [None]:
import os
import numpy as np
import pandas as pd
import warnings
import pickle
from sklearn.preprocessing import StandardScaler

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

#graph, plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import learning_curve
#import scikitplot as skplt
from skopt.plots import plot_evaluations

#building models
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import time
from lightgbm.basic import LightGBMError
from sklearn.linear_model import LogisticRegression
#from statsmodels.graphics.gofplots import qqplot
#metrics 
from sklearn.metrics import roc_auc_score, roc_curve,accuracy_score,log_loss
from sklearn.metrics import f1_score,precision_score, recall_score
from sklearn.metrics import precision_recall_curve,average_precision_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay,balanced_accuracy_score
from sklearn import metrics
from sklearn.utils import shuffle
warnings.simplefilter(action='ignore', category=FutureWarning)



In [None]:
%%time
# Read the dataset CSV into `df`; later cells take the 'Toxicity' column as the target.
# NOTE(review): absolute, machine-specific path — the notebook cannot run on another
# machine until this is made configurable (e.g. a DATA_DIR constant in a config cell).
df=pd.read_csv("/home/anjali/Desktop/MolTox/statistical_correction/bonferroni/train_test_data_10449_bonferroni.csv")


In [None]:
# Replace missing values with 0, then replace +/-infinity with 0 as well.
df = df.fillna(0).replace([np.inf, -np.inf], 0)

In [None]:
# Table whose column names identify the descriptor columns; the next cell uses
# `desc.columns` to find the columns of `df` that are NOT descriptors.
# NOTE(review): absolute, machine-specific path. The commented-out line is an
# alternative descriptor file kept from an earlier run.
# desc=pd.read_csv("/media/anjali/Data/revised/new_padel_fingerprints_10449/selected/desc_58.csv",index_col=0)
desc=pd.read_csv("/media/anjali/Data/revised/new_padel_fingerprints_10449/selected/desc_pearson_0.9.csv",index_col=0)

In [None]:
# Columns present in `df` but absent from `desc` (Index.difference returns them sorted).
non_descriptor_columns = df.columns.difference(desc.columns)
categorical = df[non_descriptor_columns]

In [None]:
# The target column is not a feature: remove it from the non-descriptor columns.
categorical = categorical.drop(columns='Toxicity')
categorical

In [None]:
# Re-select the non-descriptor columns from `df` and cast them to pandas 'category' dtype.
# NOTE(review): the cast result lives only in `categorical`; in the cells visible here it is
# never written back into `df`/`X`, so LightGBM's categorical_feature='auto' will not see
# category dtypes in the training frame — confirm whether that is intended.
categorical=df[categorical.columns].astype('category')

In [None]:
# Display the names of the columns that were cast to 'category'.
categorical.columns

In [None]:
# Separate the target from the features.
target_column = 'Toxicity'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Seed NumPy's global RNG; the split itself is pinned separately through random_state.
np.random.seed(42)
# Hold out 20% of the rows as the test set (no `stratify` argument is passed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to scale: every float64 feature except 'IPC'.
float64_mask = X.dtypes == 'float64'
not_ipc_mask = X.columns != 'IPC'
numerical_columns = X.columns[float64_mask & not_ipc_mask]

# Frame holding just those columns; both transforms below read from it.
numerical_X = X[numerical_columns]

# Fit on the training rows only, then apply the fitted scaler to the test rows,
# so the scaling statistics come exclusively from the training set.
scaler = StandardScaler()
scaled_numerical_X_train = scaler.fit_transform(numerical_X.loc[X_train.index])
scaled_numerical_X_test = scaler.transform(numerical_X.loc[X_test.index])

# Write the scaled values back into X, row group by row group.
X.loc[X_train.index, numerical_columns] = scaled_numerical_X_train
X.loc[X_test.index, numerical_columns] = scaled_numerical_X_test



In [None]:
# Rebuild `df` from the (now scaled) features plus the target, side by side.
df = pd.concat([X, y], axis='columns')

In [None]:
# Re-split after scaling so the train/test frames hold the scaled values.
# Both calls share the same test fraction and seed.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
train, test = train_test_split(df, **split_options)

## Hyperparameter Tuning

In [None]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def bayes_parameter_opt_lgb(X_train, y_train, init_round=5, opt_round=10, n_folds=5, random_seed=6, n_estimators=100, output_process=False):
    """Tune LightGBM (DART, binary objective) hyperparameters by Bayesian optimisation.

    Each candidate is scored by the best mean AUC reached during a stratified
    ``lgb.cv`` run; the optimiser maximises that score.

    Parameters
    ----------
    X_train, y_train : features and binary labels used to build the ``lgb.Dataset``.
    init_round : number of random exploration points (``init_points``).
    opt_round : number of guided optimisation iterations (``n_iter``).
    n_folds : number of cross-validation folds.
    random_seed : seed passed to ``lgb.cv`` (the optimiser itself uses a fixed
        ``random_state=200``).
    n_estimators : number of boosting rounds per cross-validation run.
    output_process : accepted for interface compatibility; currently unused.

    Returns
    -------
    tuple
        ``(best_auc, best_params)`` where ``best_params`` holds the raw (float)
        values proposed by the optimiser; integer parameters still need rounding.
    """
    # Prepare data
    train_data = lgb.Dataset(data=X_train.copy(), label=y_train.copy(), categorical_feature='auto', params={'verbose': -1})

    def lgb_eval(learning_rate, num_leaves, max_depth, min_child_samples, min_child_weight, subsample, colsample_bytree, lambda_l2):
        """Objective for the optimiser: best mean CV AUC for one parameter set."""
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'dart',
            'verbose': -1,
            'learning_rate': max(min(learning_rate, 1), 0),
            # The optimiser proposes floats; tree-shape parameters must be integers.
            'num_leaves': int(round(num_leaves)),
            'max_depth': int(round(max_depth)),
            'min_child_samples': int(round(min_child_samples)),
            'min_child_weight': min_child_weight,
            # NOTE(review): LightGBM only applies row subsampling when a bagging
            # frequency (subsample_freq / bagging_freq) > 0 is also set — confirm
            # whether this parameter is meant to have an effect here.
            'subsample': max(min(subsample, 1), 0),
            'colsample_bytree': max(min(colsample_bytree, 1), 0),
            'lambda_l2': max(min(lambda_l2, 1), 0),
            # The same Dataset is reused while min_child_samples varies between calls.
            'feature_pre_filter': False
        }

        # `verbose_eval` is intentionally not passed: it is deprecated in LightGBM 3.3
        # and removed in 4.0, and the default already prints nothing.
        cv_result = lgb.cv(
            params,
            train_data,
            num_boost_round=n_estimators,
            nfold=n_folds,
            seed=random_seed,
            stratified=True,
            metrics=['auc'],
        )
        # LightGBM < 4 names the entry 'auc-mean'; LightGBM >= 4 names it 'valid auc-mean'.
        for auc_key in ('auc-mean', 'valid auc-mean'):
            if auc_key in cv_result:
                return max(cv_result[auc_key])
        raise KeyError("no mean AUC entry in lgb.cv result; available keys: %s" % sorted(cv_result))

    lgbBO = BayesianOptimization(lgb_eval, {
        'learning_rate': (0.001, 0.01),
        'num_leaves': (31, 40),
        'max_depth': (3, 10),
        'min_child_samples': (20, 50),
        'min_child_weight': (0.001, 0.1),
        'subsample': (0.5, 0.9),
        'colsample_bytree': (0.5, 0.9),
        'lambda_l2': (0.05, 0.8),  # Set bounds for lambda_l2
    }, random_state=200)

    # Bayesian Optimization: Maximize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # Pick the evaluation with the highest target (first one on ties).
    best_result = max(lgbBO.res, key=lambda result: result['target'])

    # Return best score and the parameters that produced it
    return best_result['target'], best_result['params']

# Split the data into train and test sets (same fraction and seed as the earlier splits)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Perform Bayesian optimization: 5 random starting points, then 10 guided iterations,
# each scored with 5-fold cross-validation
opt_auc, opt_params = bayes_parameter_opt_lgb(
    X_train,
    y_train,
    init_round=5,
    opt_round=10,
    n_folds=5,
    random_seed=13,
    n_estimators=100,
)

In [None]:
# The optimiser returns floats; these parameters must be integers for LightGBM
for int_param in ('num_leaves', 'max_depth', 'min_child_samples'):
    opt_params[int_param] = int(round(opt_params[int_param]))

# Fixed settings added on top of the tuned values
# NOTE(review): tuning ran with boosting_type='dart' and without is_unbalance /
# boost_from_average; boosting_type is not set here — confirm that is intended.
opt_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'boost_from_average': False,
})

# Show the final parameter set
opt_params