In [17]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

np.random.seed(31415)

In [2]:
# getting data
PATH = 'Database_'
POSTFIX = '.xlsx'

df1 = pd.read_excel(PATH + '1' + POSTFIX)
df2 = pd.read_excel(PATH + '2' + POSTFIX)
df3 = pd.read_excel(PATH + '3' + POSTFIX)
df4 = pd.read_excel(PATH + '4' + POSTFIX)
df5 = pd.read_excel(PATH + '5' + POSTFIX)

In [3]:
# rename similar columns
dict1 = {
    'Material type' : 'material_type',
    'Exposure dose (ug/mL)' : 'exposure_dose',
    'Elements' : 'elements',
    'Cell type' : 'cell_type',
    'Cell line (L)/primary cells (P)' : 'cell_line',
    'Viability (%)' : 'cell_viability',
    'Hydro size (nm)' : 'hydro_size',
    'Surface charge (mV)' : 'surf_charge'
    }

dict2 = {
    'Nanoparticle' : 'material_type',
    'Concentration μM' : 'exposure_dose',
    'Cells' : 'cell_line',
    'coat' : 'coat',
    'Human(H)/Animal(A) cells' : 'human_animal_cells',
    'Exposure time (h)' : 'exposure_time',
    '% Cell viability' : 'cell_viability',
    'Diameter (nm)' : 'hydro_size',
    'Zeta potential (mV)' : 'surf_charge'
    }

dict3 = {
    'Material' : 'material_type',
    'Concentration (ug/ml)' : 'exposure_dose',
    'Cell_Type' : 'cell_line',
    'Cell Line_Primary Cell' : 'cell_is_line',
    'Coat/Functional Group' : 'coat',
    'Human_Animal' : 'human_animal_cells',
    'Time (hr)' : 'exposure_time',
    'Cell_Viability (%)' : 'cell_viability',
    'Diameter (nm)' : 'hydro_size',
    'Surface_Charge' : 'surf_charge'
    }

dict4 = {
    'Material type' : 'material_type',
    'Exposure dose (ug/mL)' : 'exposure_dose',
    'Elements' : 'elements',
    'Cell type' : 'cell_type',
    'Viability (%)' : 'cell_viability',
    'Hydro size (nm)' : 'hydro_size',
    'Surface charge (mV)' : 'surf_charge'
    }

dict5 = {
    'material' : 'material_type',
    'dose' : 'exposure_dose',
    'cell_type' : 'cell_type',
    'cell_line' : 'cell_line',
    'cell_species' : 'human_animal_cells',
    'time' : 'exposure_time',
    'viability' : 'cell_viability',
    'hydro_size' : 'hydro_size',
    'surf_charge' : 'surf_charge'
    }

new_df1 = df1.rename(columns=dict1)
new_df2 = df2.rename(columns=dict2)
new_df3 = df3.rename(columns=dict3)
new_df4 = df4.rename(columns=dict4)
new_df5 = df5.rename(columns=dict5)

In [4]:
# add db flags

new_df1['data_base'] = 1
new_df2['data_base'] = 2
new_df3['data_base'] = 3
new_df4['data_base'] = 4
new_df5['data_base'] = 5

new_df1.loc[new_df1['hydro_size'].isnull(), 'hydro_size'] = new_df1['hydro_size'].mean()
new_df2.loc[new_df2['hydro_size'].isnull(), 'hydro_size']  = new_df2['hydro_size'].mean()
new_df3.loc[new_df3['hydro_size'].isnull(), 'hydro_size']  = new_df3['hydro_size'].mean()
new_df4.loc[new_df4['hydro_size'].isnull(), 'hydro_size']  = new_df4['hydro_size'].mean()
new_df5.loc[new_df5['hydro_size'].isnull(), 'hydro_size']  = new_df5['hydro_size'].mean()

new_df1.loc[new_df1['surf_charge'].isnull(), 'surf_charge'] = new_df1['surf_charge'].mean()
new_df2.loc[new_df2['surf_charge'].isnull(), 'surf_charge'] = new_df2['surf_charge'].mean()

new_df4.loc[new_df4['surf_charge'].isnull(), 'surf_charge'] = new_df4['surf_charge'].mean()
new_df5.loc[new_df5['surf_charge'].isnull(), 'surf_charge'] = new_df5['surf_charge'].mean()

In [6]:
# union data
full_df = pd.concat([new_df1, new_df2, new_df3, new_df4, new_df5])[['material_type', 'exposure_dose', 'cell_type', #'cells', 
                                                         'cell_line', 'coat', 'human_animal_cells', 'exposure_time', 'data_base', 
                                                         'hydro_size', 
                                                        'surf_charge',
                                                         'cell_viability']]

In [7]:
# make the data similar
full_df.loc[(full_df['human_animal_cells'] == 'Mouse') | 
            (full_df['human_animal_cells'] == 'Hamster'), 'human_animal_cells'] = 'A'
full_df.loc[full_df['human_animal_cells'] == 'Human', 'human_animal_cells'] = 'H'

In [8]:
full_df.fillna(0, inplace=True)

### trying to  predict with db 3

In [19]:
cut_df3 = new_df3[['material_type', 
                   'exposure_dose', 
                   'cell_line', 
                   'coat', 
                   'human_animal_cells', 
                   'exposure_time', 
                   'hydro_size',
                   'surf_charge',
                   'cell_viability']]

# drop null exp_dose, 99 quantille exp_dose и viability
clear_cut_df3 = cut_df3[(cut_df3['exposure_dose'].notna()) &
                     (cut_df3['exposure_dose'] < cut_df3['exposure_dose'].quantile(0.99)) &
                     (cut_df3['cell_viability'] < cut_df3['cell_viability'].quantile(0.99))].copy()


# OHE
cut_df3_gd = pd.get_dummies(clear_cut_df3, sparse=True)

# train test split
splitted_data = train_test_split(cut_df3_gd.drop(['cell_viability'], axis=1), 
                                 cut_df3_gd['cell_viability'], 
                                 test_size=0.3, 
                                 random_state=(31415))
X_train, X_test, y_train, y_test = splitted_data


# training linear regression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# predict
y_pred = regressor.predict(X_test)

# evaluate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(mae, rmse)



17.729306107905874 23.537288090809717




### trying to predict with all databases

In [20]:
cut_df3 = full_df[(full_df['data_base'] == 1) | (full_df['data_base'] == 4)][['material_type', 
                   'exposure_dose', 
                   'cell_line', 
                   'coat', 
                   'human_animal_cells', 
                   'exposure_time', 
                   'hydro_size',
                   'surf_charge',
                   'cell_viability', 
                  ]]

# drop null exp_dose, 99 quantille exp_dose и viability
clear_cut_df3 = cut_df3[(cut_df3['exposure_dose'].notna()) &
                     (cut_df3['exposure_dose'] < cut_df3['exposure_dose'].quantile(0.95)) &
                     (cut_df3['cell_viability'] < cut_df3['cell_viability'].quantile(0.95))].copy()


# OHE
cut_df3_gd = pd.get_dummies(clear_cut_df3, sparse=True)

# train test split
splitted_data = train_test_split(cut_df3_gd.drop(['cell_viability'], axis=1), 
                                 cut_df3_gd['cell_viability'], 
                                 test_size=0.3, 
                                 random_state=(31415))
X_train, X_test, y_train, y_test = splitted_data



# training XGBRegressor
regressor = XGBRegressor()
regressor.fit(X_train, y_train)

# predict
y_pred = regressor.predict(X_test)

# evaluate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(mae, rmse)

  cut_df3_gd = pd.get_dummies(clear_cut_df3, sparse=True)


9.027883480242604 13.221811700528198
