In [1]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import optuna 


In [2]:
# Read the CSV file into a DataFrame and preview its first rows.
csv_path = "Silver nanomaterils antibacterial  .csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Process,Steps,External_energy,Temp_Celcius,Stirring,Synthesis_Duration_h,Scale_synthesis_ml,Precurcor_conc_mM,NEW_Capping agent,NEW_capping agent_Class,...,treatment,UVVIs PEAKS nm,shape,core size (nm),Method of determination_size,exposure dose concentration mg/mL,Exposure duration (h),bacterial Culture medium,bacterial Species,Bacteria reduction mm
0,phytosynthesis,single,No,25,No,4,?,7.0,plant,organic,...,no,430,spherical,25,SEM,0.01125,?,MuellerHinton,Enterococcus,6.0
1,phytosynthesis,single,No,25,No,4,?,7.0,plant,organic,...,no,430,spherical,25,SEM,0.0225,?,MuellerHinton,Enterococcus,6.0
2,phytosynthesis,single,No,25,No,4,?,7.0,plant,organic,...,no,430,spherical,25,SEM,0.01125,?,MuellerHinton,Enterococcus,6.0
3,phytosynthesis,single,Heating,80,Stirring,2,20,1.5,plant,organic,...,no,420,spherical,10,TEM,0.08,24,MuellerHinton,Salmonella,0.0
4,phytosynthesis,single,Heating,80,Stirring,2,20,1.5,plant,organic,...,no,420,spherical,10,TEM,0.04,24,MuellerHinton,Salmonella,0.0


In [3]:
# Per-column share of cells equal to the '?' placeholder, as a percentage
# of the total number of rows.
placeholder_counts = data.eq('?').sum()
missing_values = placeholder_counts / len(data) * 100
print(missing_values)

Process                               0.000000
Steps                                 0.000000
External_energy                       0.000000
Temp_Celcius                          3.756994
Stirring                              0.000000
Synthesis_Duration_h                 11.350919
Scale_synthesis_ml                   40.607514
Precurcor_conc_mM                     2.398082
NEW_Capping agent                     0.000000
NEW_capping agent_Class               0.959233
NEW_reducing agent                    1.438849
NEW_reducing agent_class              1.438849
Capping agent concentration_mg/mL    34.052758
Reducing agent quantity mL           41.646683
Order of reagent_CODE                 7.114309
treatment                             0.159872
UVVIs PEAKS nm                        6.075140
shape                                 9.192646
core size (nm)                        9.352518
Method of determination_size         10.311751
exposure dose concentration mg/mL    16.227018
Exposure dura

In [4]:

# Columns to store as pandas categoricals. The labels must match the
# DataFrame's column labels exactly (note the doubled spaces in the two
# 'bacterial  ...' names). Each column is listed once; the original list
# repeated 'NEW_Capping agent'.
categorical_cols = [
    'Process',
    "NEW_Capping agent",
    "NEW_reducing agent_class",
    "Order of reagent_CODE",
    'Steps',
    'External_energy',
    'Stirring',
    'NEW_capping agent_Class',
    'treatment',
    'shape',
    'Method of determination_size',
    'bacterial  Culture medium',
    'bacterial  Species',
]

# Turn the '?' placeholder into NaN so the affected columns can be parsed as
# numbers below. Rebinding `data` (rather than inplace=True) keeps the cell
# idempotent and avoids mutating a frame in place.
data = data.replace('?', np.nan)

for col in categorical_cols:
    data[col] = data[col].astype('category')

# Whatever is still `object` dtype after the categorical cast is a candidate
# for numeric conversion.
object_cols = data.select_dtypes(['object']).columns

for col in object_cols:
    try:
        data[col] = pd.to_numeric(data[col])
    except (ValueError, TypeError):
        # The column holds values that cannot be parsed as numbers; leave it
        # unchanged. Only conversion errors are caught here, so unrelated
        # failures (e.g. KeyboardInterrupt) are no longer silently swallowed.
        continue

# Cast after the numeric pass, as in the original cell; this column is not
# part of `categorical_cols`.
data["NEW_reducing agent"] = data["NEW_reducing agent"].astype("category")
print(data.dtypes)

Process                              category
Steps                                category
External_energy                      category
Temp_Celcius                          float64
Stirring                             category
Synthesis_Duration_h                  float64
Scale_synthesis_ml                    float64
Precurcor_conc_mM                     float64
NEW_Capping agent                    category
NEW_capping agent_Class              category
NEW_reducing agent                   category
NEW_reducing agent_class             category
Capping agent concentration_mg/mL     float64
Reducing agent quantity mL            float64
Order of reagent_CODE                category
treatment                            category
UVVIs PEAKS nm                        float64
shape                                category
core size (nm)                        float64
Method of determination_size         category
exposure dose concentration mg/mL     float64
Exposure duration  (h)            

In [5]:
# Split the frame into the target column (Y) and all remaining columns (X),
# then hold out 10% of the rows as a test set with a fixed seed.
target_name = "Bacteria reduction mm"
Y = data[target_name]
X = data.drop(target_name, axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=666,
    test_size=0.1,
)

In [16]:
# One-hot encode the categorical feature columns, fitting on the training
# split only and applying the fitted encoder to the test split.

# Take the categorical columns from the feature frame itself, so the list can
# only ever contain columns that exist in X_train / X_test.
categoricical_COLS = X_train.select_dtypes(include='category').columns.tolist()

# handle_unknown="ignore": a level that occurs only in the test split is
# encoded as all zeros for that feature instead of making transform() raise
# ValueError (the default behaviour). Levels seen during fit are encoded
# exactly as before.
label_encoder = OneHotEncoder(handle_unknown="ignore")

X_train_encoded = label_encoder.fit_transform(X_train[categoricical_COLS])
X_test_encoded = label_encoder.transform(X_test[categoricical_COLS])

# The encoder returns sparse matrices; densify them and attach the generated
# "<column>_<level>" feature names.
encoded_feature_names = label_encoder.get_feature_names_out()
# NOTE(review): these frames get a fresh 0..n-1 index, whereas X_train/X_test
# come from train_test_split and presumably still carry the shuffled row
# labels of `data` - reset or realign the indices before concatenating the
# encoded columns with other columns of X_train/X_test.
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=encoded_feature_names)
X_test_encoded = pd.DataFrame(X_test_encoded.toarray(), columns=encoded_feature_names)
X_train_encoded.head()

Unnamed: 0,Process_biosynthesis,Process_green,Process_micosynthesis,Process_phytochemical,Process_phytosynthesis,Process_wet chemical,Steps_multi,Steps_single,External_energy_Autoclave,External_energy_Heating,...,bacterial Species_Rhizopus oligosporus,bacterial Species_Saccharomyces,bacterial Species_Salmonella,bacterial Species_Setosphaeria turcica,bacterial Species_Shigella flexneri,bacterial Species_Staphylococcus,bacterial Species_Streptococcus,bacterial Species_Trichoderma,bacterial Species_Vibrio,bacterial Species_Xanthomonas phaseoli pv. phaseoli
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
