In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Lipinski, Descriptors

from mining_cleaning import *

## Выгружаем датасеты

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/dataconHack/hackathon/main/data.csv')
drug = pd.read_csv('https://raw.githubusercontent.com/dataconHack/hackathon/main/drug_descriptors.csv')
bacterial = pd.read_csv('https://raw.githubusercontent.com/dataconHack/hackathon/main/bacterial_descriptors.csv')

## Очистка датасетов

In [3]:
bacterial.columns

Index(['Tax_id', 'Bacteria', 'kingdom', 'subkingdom', 'clade', 'phylum',
       'class', 'order', 'family', 'genus', 'species', 'gram',
       'min_Incub_period, h', 'avg_Incub_period, h', 'max_Incub_period, h',
       'growth_temp, C', 'biosafety_level', 'isolated_from'],
      dtype='object')

In [4]:
# Список колонок, которые нужно дропнуть
data_drop_lst = ['Unnamed: 0.1', 'Unnamed: 0', 'Drug_class_drug_bank',
                 'method', 'MDR_check', 'NP_Synthesis', 'ZOI_NP', 'NP_concentration']
drug_drop_lst = ['Unnamed: 0', 'chemID', 'prefered_name']
bacterial_drop_lst = ['Tax_id', 'subkingdom', 'clade', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'isolated_from']

data, drug, bacterial = drop_columns([data, drug, bacterial], [data_drop_lst, drug_drop_lst, bacterial_drop_lst])

In [5]:
data

Unnamed: 0,Bacteria,Drug,Drug_dose,NP size_min,NP size_max,NP size_avg,shape,ZOI_drug,ZOI_drug_NP,fold_increase_in_antibacterial_activity (%)
0,Escherichia coli,Amoxicillin,,4.0,10.0,8.00,spherical,32+,32+,
1,Escherichia coli,Penicillin,,4.0,10.0,8.00,spherical,32+,32+,
2,Escherichia coli,Amoxicillin,,15.0,30.0,28.00,spherical,32+,32+,
3,Escherichia coli,Penicillin,,15.0,30.0,28.00,spherical,32+,32+,
4,Micrococcus luteus,Vancomycin,30.0,5.0,30.0,21.00,spherical,0,17+2,7.02
...,...,...,...,...,...,...,...,...,...,...
878,Staphylococcus aureus,,,10.0,78.9,44.45,spherical,,,
879,Acinetobacter baumanii,,,10.0,78.9,44.45,spherical,,,
880,Enterococcus faecalis,,,20.0,20.0,20.00,spherical,,,
881,Enterococcus faecalis,,,20.0,20.0,20.00,spherical,,,


## Заполнение дескрипторами

In [6]:
drug = fill_df_descriptors(drug)
drug

Unnamed: 0,drug,smiles,NumHDonors,NumHAcceptors,NumHeteroatoms,ExactMolWt,MaxAbsPartialCharge,MaxPartialCharge,MinAbsPartialCharge,MinPartialCharge,NumRotatableBonds,NumAromaticRings,NumAromaticHeterocycles
0,Amoxicillin,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,4,6,9,365.104542,0.507967,0.327399,0.327399,-0.507967,4,1,0
1,Penicillin,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@...,1,5,8,372.05461,1.0,1.0,0.547832,-0.547832,4,1,0
2,Vancomycin,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...,19,25,35,1447.4302,0.50781,0.330441,0.330441,-0.50781,13,5,0
3,Gentamicin,CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...,8,12,12,477.316249,0.387562,0.185232,0.185232,-0.387562,7,0,0
4,Ceftazidime,CC(C)(O/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=...,3,11,15,546.099139,0.543191,0.349825,0.349825,-0.543191,9,2,2
5,Ampicillin,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccccc3)C(=...,3,5,8,349.109627,0.479673,0.327399,0.327399,-0.479673,4,1,0
6,Faropenem,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C([C@H]3CCCO3...,2,5,7,285.067094,0.476538,0.353083,0.353083,-0.476538,3,0,0
7,Ceftriaxone,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nc...,4,15,18,554.046058,0.488304,0.352159,0.352159,-0.488304,8,2,2
8,Rifampicin,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(O)c(c(/C...,6,15,16,822.405123,0.506736,0.312109,0.312109,-0.506736,4,2,0
9,Azithromycin,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,5,14,14,748.508526,0.458936,0.311188,0.311188,-0.458936,7,0,0


In [7]:
bacterial = fix_string(bacterial, 'Bacteria')
drug = fix_string(drug, 'drug')

data = fix_string(data, 'Drug')
data = fix_string(data, 'Bacteria')

  df[col] = df[col].str.replace('.','')


In [8]:
data

Unnamed: 0,Bacteria,Drug,Drug_dose,NP size_min,NP size_max,NP size_avg,shape,ZOI_drug,ZOI_drug_NP,fold_increase_in_antibacterial_activity (%)
0,escherichiacoli,amoxicilin,,4.0,10.0,8.00,spherical,32+,32+,
1,escherichiacoli,penicilin,,4.0,10.0,8.00,spherical,32+,32+,
2,escherichiacoli,amoxicilin,,15.0,30.0,28.00,spherical,32+,32+,
3,escherichiacoli,penicilin,,15.0,30.0,28.00,spherical,32+,32+,
4,micrococusluteus,vancomycin,30.0,5.0,30.0,21.00,spherical,0,17+2,7.02
...,...,...,...,...,...,...,...,...,...,...
878,staphylococusaureus,,,10.0,78.9,44.45,spherical,,,
879,acinetobacterbaumani,,,10.0,78.9,44.45,spherical,,,
880,enterococusfaecalis,,,20.0,20.0,20.00,spherical,,,
881,enterococusfaecalis,,,20.0,20.0,20.00,spherical,,,


## Объединение датасетов

In [9]:
df_main = zip_data(data, drug, bacterial)

In [10]:
# Удаляем в ZOI все символы, стоящие после + (включительно)

df_main['ZOI_drug_NP'].replace(['32+'], 32, inplace=True)
df_main['ZOI_drug_NP'].replace(['17+2'], 17, inplace=True)

df_main['ZOI_drug'].replace(['32+'], 17, inplace=True)

In [11]:
df_main = type_converting(df_main, ['NumHeteroatoms', 'NumHAcceptors', 'NumHDonors', 'growth_temp, C',
                                    'biosafety_level', 'ZOI_drug_NP', 'ZOI_drug', 'NumRotatableBonds',
                                    'NumAromaticRings', 'NumAromaticHeterocycles'])

In [12]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893 entries, 0 to 892
Data columns (total 29 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Bacteria                                     883 non-null    object 
 1   Drug                                         642 non-null    object 
 2   Drug_dose                                    577 non-null    float64
 3   NP size_min                                  883 non-null    float64
 4   NP size_max                                  883 non-null    float64
 5   NP size_avg                                  883 non-null    float64
 6   shape                                        883 non-null    object 
 7   ZOI_drug                                     627 non-null    float64
 8   ZOI_drug_NP                                  584 non-null    float64
 9   fold_increase_in_antibacterial_activity (%)  421 non-null    float64
 10  sm

In [13]:
df_main = df_main[~df_main['Bacteria'].isna()]
df_main = df_main[~df_main['max_Incub_period, h'].isna()]

df_main = df_main[~((df_main['ZOI_drug_NP'].isna()))]
df_main = df_main[~((df_main['ZOI_drug_NP'].isna()) & (df_main['ZOI_drug'].isna()))]
df_main = df_main[~((df_main['ZOI_drug'].isna()))]

df_main = df_main[~df_main['ExactMolWt'].isna()]

df_main = df_main[~df_main['ZOI_drug_NP'].isna()]

In [14]:
df_main

Unnamed: 0,Bacteria,Drug,Drug_dose,NP size_min,NP size_max,NP size_avg,shape,ZOI_drug,ZOI_drug_NP,fold_increase_in_antibacterial_activity (%),...,NumRotatableBonds,NumAromaticRings,NumAromaticHeterocycles,kingdom,gram,"min_Incub_period, h","avg_Incub_period, h","max_Incub_period, h","growth_temp, C",biosafety_level
0,escherichiacoli,amoxicilin,,4.0,10.0,8.0,spherical,17.0000,32.000,,...,4.0,1.0,0.0,Bacteria,n,48.0,84.0,120.0,37.0,2.0
1,escherichiacoli,penicilin,,4.0,10.0,8.0,spherical,17.0000,32.000,,...,4.0,1.0,0.0,Bacteria,n,48.0,84.0,120.0,37.0,2.0
2,escherichiacoli,amoxicilin,,15.0,30.0,28.0,spherical,17.0000,32.000,,...,4.0,1.0,0.0,Bacteria,n,48.0,84.0,120.0,37.0,2.0
3,escherichiacoli,penicilin,,15.0,30.0,28.0,spherical,17.0000,32.000,,...,4.0,1.0,0.0,Bacteria,n,48.0,84.0,120.0,37.0,2.0
4,micrococusluteus,vancomycin,30.0,5.0,30.0,21.0,spherical,0.0000,17.000,7.02,...,13.0,5.0,0.0,Bacteria,p,240.0,288.0,336.0,30.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,staphylococusaureus,penicilin,,4.0,10.0,8.0,spherical,1.0000,0.125,,...,4.0,1.0,0.0,Bacteria,p,24.0,36.0,48.0,37.0,2.0
580,pseudomonasaeruginosa,mupirocin,,5.0,80.0,60.0,spherical,0.3125,0.100,,...,4.0,1.0,0.0,Bacteria,n,12.0,18.0,24.0,37.0,2.0
581,pasteurelamultocida,penicilin,,4.0,10.0,8.0,spherical,0.0300,0.030,,...,7.0,0.0,0.0,Bacteria,p,24.0,36.0,48.0,37.0,2.0
582,pasteurelamultocida,penicilin,,15.0,30.0,28.0,spherical,0.0300,0.030,,...,7.0,0.0,0.0,Bacteria,n,24.0,48.0,72.0,37.0,2.0


In [15]:
df_main.columns

Index(['Bacteria', 'Drug', 'Drug_dose', 'NP size_min', 'NP size_max',
       'NP size_avg', 'shape', 'ZOI_drug', 'ZOI_drug_NP',
       'fold_increase_in_antibacterial_activity (%)', 'smiles', 'NumHDonors',
       'NumHAcceptors', 'NumHeteroatoms', 'ExactMolWt', 'MaxAbsPartialCharge',
       'MaxPartialCharge', 'MinAbsPartialCharge', 'MinPartialCharge',
       'NumRotatableBonds', 'NumAromaticRings', 'NumAromaticHeterocycles',
       'kingdom', 'gram', 'min_Incub_period, h', 'avg_Incub_period, h',
       'max_Incub_period, h', 'growth_temp, C', 'biosafety_level'],
      dtype='object')

In [16]:
#ONE-HOT ENCODING ПРЕОБРАЗОВАНИЕ


#Столбец ['shape']

df_main['shape_spherical'] = df_main['shape']	
df_main['shape_nanorods_and_triangles'] = df_main['shape']
df_main['shape_triangular'] = df_main['shape']
df_main = df_main.drop('shape', axis=1)

df_main['shape_spherical'].replace(['nanorods and triangles', 'triangular'], 0, inplace=True)
df_main['shape_spherical'].replace(['spherical'], 1, inplace=True)

df_main['shape_nanorods_and_triangles'].replace(['spherical', 'triangular'], 0, inplace=True)
df_main['shape_nanorods_and_triangles'].replace(['nanorods and triangles'], 1, inplace=True)

df_main['shape_triangular'].replace(['nanorods and triangles', 'spherical'], 0, inplace=True)
df_main['shape_triangular'].replace(['triangular'], 1, inplace=True)

#Столбец ['gram']

df_main['gram_p'] = df_main['gram']	
df_main = df_main.drop('gram', axis=1)

df_main['gram_p'].replace(['n', np.nan], 0, inplace=True)
df_main['gram_p'].replace(['p'], 1, inplace=True)


In [17]:
df_main

Unnamed: 0,Bacteria,Drug,Drug_dose,NP size_min,NP size_max,NP size_avg,ZOI_drug,ZOI_drug_NP,fold_increase_in_antibacterial_activity (%),smiles,...,kingdom,"min_Incub_period, h","avg_Incub_period, h","max_Incub_period, h","growth_temp, C",biosafety_level,shape_spherical,shape_nanorods_and_triangles,shape_triangular,gram_p
0,escherichiacoli,amoxicilin,,4.0,10.0,8.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
1,escherichiacoli,penicilin,,4.0,10.0,8.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
2,escherichiacoli,amoxicilin,,15.0,30.0,28.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
3,escherichiacoli,penicilin,,15.0,30.0,28.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
4,micrococusluteus,vancomycin,30.0,5.0,30.0,21.0,0.0000,17.000,7.02,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...,...,Bacteria,240.0,288.0,336.0,30.0,1.0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,staphylococusaureus,penicilin,,4.0,10.0,8.0,1.0000,0.125,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,24.0,36.0,48.0,37.0,2.0,1,0,0,1
580,pseudomonasaeruginosa,mupirocin,,5.0,80.0,60.0,0.3125,0.100,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,12.0,18.0,24.0,37.0,2.0,1,0,0,0
581,pasteurelamultocida,penicilin,,4.0,10.0,8.0,0.0300,0.030,,CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...,...,Bacteria,24.0,36.0,48.0,37.0,2.0,1,0,0,1
582,pasteurelamultocida,penicilin,,15.0,30.0,28.0,0.0300,0.030,,CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...,...,Bacteria,24.0,48.0,72.0,37.0,2.0,1,0,0,0


In [18]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 537 entries, 0 to 583
Data columns (total 31 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Bacteria                                     537 non-null    object 
 1   Drug                                         537 non-null    object 
 2   Drug_dose                                    423 non-null    float64
 3   NP size_min                                  537 non-null    float64
 4   NP size_max                                  537 non-null    float64
 5   NP size_avg                                  537 non-null    float64
 6   ZOI_drug                                     537 non-null    float64
 7   ZOI_drug_NP                                  537 non-null    float64
 8   fold_increase_in_antibacterial_activity (%)  394 non-null    float64
 9   smiles                                       537 non-null    object 
 10  Nu

In [19]:
df_main = df_main[df_main['kingdom']=='Bacteria']

In [20]:
df_main

Unnamed: 0,Bacteria,Drug,Drug_dose,NP size_min,NP size_max,NP size_avg,ZOI_drug,ZOI_drug_NP,fold_increase_in_antibacterial_activity (%),smiles,...,kingdom,"min_Incub_period, h","avg_Incub_period, h","max_Incub_period, h","growth_temp, C",biosafety_level,shape_spherical,shape_nanorods_and_triangles,shape_triangular,gram_p
0,escherichiacoli,amoxicilin,,4.0,10.0,8.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
1,escherichiacoli,penicilin,,4.0,10.0,8.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
2,escherichiacoli,amoxicilin,,15.0,30.0,28.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
3,escherichiacoli,penicilin,,15.0,30.0,28.0,17.0000,32.000,,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@...,...,Bacteria,48.0,84.0,120.0,37.0,2.0,1,0,0,0
4,micrococusluteus,vancomycin,30.0,5.0,30.0,21.0,0.0000,17.000,7.02,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...,...,Bacteria,240.0,288.0,336.0,30.0,1.0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,staphylococusaureus,penicilin,,4.0,10.0,8.0,1.0000,0.125,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,24.0,36.0,48.0,37.0,2.0,1,0,0,1
580,pseudomonasaeruginosa,mupirocin,,5.0,80.0,60.0,0.3125,0.100,,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,...,Bacteria,12.0,18.0,24.0,37.0,2.0,1,0,0,0
581,pasteurelamultocida,penicilin,,4.0,10.0,8.0,0.0300,0.030,,CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...,...,Bacteria,24.0,36.0,48.0,37.0,2.0,1,0,0,1
582,pasteurelamultocida,penicilin,,15.0,30.0,28.0,0.0300,0.030,,CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...,...,Bacteria,24.0,48.0,72.0,37.0,2.0,1,0,0,0


In [21]:
df_main = df_main.drop('fold_increase_in_antibacterial_activity (%)',axis = 1)

In [22]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 535 entries, 0 to 583
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Bacteria                      535 non-null    object 
 1   Drug                          535 non-null    object 
 2   Drug_dose                     421 non-null    float64
 3   NP size_min                   535 non-null    float64
 4   NP size_max                   535 non-null    float64
 5   NP size_avg                   535 non-null    float64
 6   ZOI_drug                      535 non-null    float64
 7   ZOI_drug_NP                   535 non-null    float64
 8   smiles                        535 non-null    object 
 9   NumHDonors                    535 non-null    float64
 10  NumHAcceptors                 535 non-null    float64
 11  NumHeteroatoms                535 non-null    float64
 12  ExactMolWt                    535 non-null    float64
 13  MaxAb

In [23]:
df_main = df_main.drop(['Bacteria', 'Drug', 'smiles', 'kingdom', 'Drug_dose'], axis=1)

In [24]:
df_main

Unnamed: 0,NP size_min,NP size_max,NP size_avg,ZOI_drug,ZOI_drug_NP,NumHDonors,NumHAcceptors,NumHeteroatoms,ExactMolWt,MaxAbsPartialCharge,...,NumAromaticHeterocycles,"min_Incub_period, h","avg_Incub_period, h","max_Incub_period, h","growth_temp, C",biosafety_level,shape_spherical,shape_nanorods_and_triangles,shape_triangular,gram_p
0,4.0,10.0,8.0,17.0000,32.000,4.0,6.0,9.0,365.104542,0.507967,...,0.0,48.0,84.0,120.0,37.0,2.0,1,0,0,0
1,4.0,10.0,8.0,17.0000,32.000,1.0,5.0,8.0,372.054610,1.000000,...,0.0,48.0,84.0,120.0,37.0,2.0,1,0,0,0
2,15.0,30.0,28.0,17.0000,32.000,4.0,6.0,9.0,365.104542,0.507967,...,0.0,48.0,84.0,120.0,37.0,2.0,1,0,0,0
3,15.0,30.0,28.0,17.0000,32.000,1.0,5.0,8.0,372.054610,1.000000,...,0.0,48.0,84.0,120.0,37.0,2.0,1,0,0,0
4,5.0,30.0,21.0,0.0000,17.000,19.0,25.0,35.0,1447.430200,0.507810,...,0.0,240.0,288.0,336.0,30.0,1.0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,4.0,10.0,8.0,1.0000,0.125,4.0,6.0,9.0,365.104542,0.507967,...,0.0,24.0,36.0,48.0,37.0,2.0,1,0,0,1
580,5.0,80.0,60.0,0.3125,0.100,4.0,6.0,9.0,365.104542,0.507967,...,0.0,12.0,18.0,24.0,37.0,2.0,1,0,0,0
581,4.0,10.0,8.0,0.0300,0.030,8.0,12.0,12.0,477.316249,0.387562,...,0.0,24.0,36.0,48.0,37.0,2.0,1,0,0,1
582,15.0,30.0,28.0,0.0300,0.030,8.0,12.0,12.0,477.316249,0.387562,...,0.0,24.0,48.0,72.0,37.0,2.0,1,0,0,0


In [25]:
df_main.to_csv('df_result_2.csv', index=False)