### **Data Extraction and Cleaning**

In [None]:
# Clean the zipped ATLAS export and save it as a long-format CSV
# (give save_cleaned_file_path a .csv extension, e.g. dir/cleaned.csv).
def Clean_ATLAS(zipped_file_path, save_cleaned_file_path):
  """Unpack the ATLAS zip, reshape the antibiotic columns to long format, save as CSV.

  The source table is expected to hold the isolate metadata in the columns up
  to and including 'Phenotype', followed by the antibiotic columns from
  'Amikacin' to 'Meropenem vaborbactam_I'. For every antibiotic there is a
  concentration column ('<name>') and an interpretation column ('<name>_I').

  The saved table has one row per (isolate, tested antibiotic) with the
  metadata columns plus:
    * 'Antibiotics'   - antibiotic name ('<name>' without the '_I' suffix)
    * 'Status'        - value of the '<name>_I' column
    * 'COncentration' - value of the '<name>' column (NaN when it is missing)
  Isolates without any interpretation are kept as a single row whose three
  antibiotic columns are empty.

  Args:
    zipped_file_path: Path of the zip archive containing the ATLAS CSV.
    save_cleaned_file_path: Destination path of the cleaned CSV file.

  Raises:
    ValueError: If the archive contains no CSV file.
    KeyError: If the expected boundary columns are absent from the CSV.
  """
  # Imports stay local so the function can run before the notebook's import cell.
  from zipfile import ZipFile

  import pandas as pd

  default_csv_name = '2023_06_15 atlas_antibiotics.csv'
  status_suffix = '_I'

  # The context manager closes the archive even when extraction or parsing fails.
  with ZipFile(zipped_file_path, 'r') as zf:
    # Same side effect as before: unpack everything into the working directory.
    zf.extractall()

    # Locate the CSV inside the archive instead of assuming a fixed
    # '/content/...' location; hidden files (e.g. '._x.csv') are ignored.
    csv_members = [
      member for member in zf.namelist()
      if member.lower().endswith('.csv')
      and not member.rsplit('/', 1)[-1].startswith('.')
    ]
    if not csv_members:
      raise ValueError(f'No CSV file found in archive {zipped_file_path!r}')
    # Prefer the historical file name when present, otherwise the first CSV.
    csv_member = next(
      (m for m in csv_members if m.rsplit('/', 1)[-1] == default_csv_name),
      csv_members[0],
    )
    with zf.open(csv_member) as csv_file:
      raw = pd.read_csv(csv_file)

  meta = raw.loc[:, :'Phenotype']
  wide = raw.loc[:, 'Amikacin':'Meropenem vaborbactam_I']

  # Pair every '<name>_I' column with the position of its '<name>' column.
  # Only the trailing suffix is stripped, so names containing '_' stay intact.
  position = {column: idx for idx, column in enumerate(wide.columns)}
  pairs = [
    (column[:-len(status_suffix)], idx, position.get(column[:-len(status_suffix)]))
    for column, idx in position.items()
    if column.endswith(status_suffix)
  ]

  # One record per non-missing interpretation. read_csv yields a 0..n-1 index,
  # so the row position doubles as the merge key against `meta`.
  records = []
  for row_idx, row in enumerate(wide.itertuples(index=False, name=None)):
    for drug, status_idx, conc_idx in pairs:
      status = row[status_idx]
      if pd.isna(status):
        continue
      # A missing concentration no longer aborts the run; it is stored as NaN.
      concentration = float('nan') if conc_idx is None else row[conc_idx]
      records.append((row_idx, drug, status, concentration))

  # Explicit column names keep the merge valid when there are no records.
  long_df = pd.DataFrame(
    records, columns=['_row', 'Antibiotics', 'Status', 'COncentration']
  )
  long_df['_row'] = long_df['_row'].astype('int64')

  # Left join keeps isolates that have no antibiotic result at all.
  clean_df = (
    meta
    .merge(long_df, left_index=True, right_on='_row', how='left')
    .drop(columns='_row')
  )

  clean_df.to_csv(save_cleaned_file_path, sep=',', index=False)

In [None]:
# Run the cleaning step: unpack the zipped ATLAS export and write the cleaned CSV
Clean_ATLAS(
  zipped_file_path='/content/drive/MyDrive/2023_06_15 atlas_antibiotics.zip',
  save_cleaned_file_path='/content/file.csv',
)

### **Machine Learning**

In [None]:
# Load libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm

In [None]:
# Load the cleaned dataset into a DataFrame
# NOTE(review): the cleaning cell saves to '/content/file.csv', while this relative
# path resolves against the current working directory; the two match only when the
# notebook runs from /content. Confirm before running elsewhere.
df=pd.read_csv('file.csv')

In [None]:
# Check dataset shape as (rows, columns)
df.shape

(9752774, 16)

In [None]:
# Drop the columns that are not kept as model inputs
unused_columns = ['Isolate Id', 'Study', 'State', 'Phenotype', 'COncentration']
df = df.drop(columns=unused_columns)

In [None]:
# Fill missing values with fixed categories: 'Male' for Gender, 'Inpatient' for
# In / Out Patient; the literal label 'None Given' is recoded to 'Other'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df[...] selection: that chained in-place form
# warns in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Gender'] = df['Gender'].fillna('Male')
df['In / Out Patient'] = (
  df['In / Out Patient']
  .fillna('Inpatient')
  .str.replace('None Given', 'Other', regex=False)  # plain text match, not a regex
)

In [None]:
# Discard every row that still has a missing value in any column
df = df.dropna()

In [None]:
# Give the 'In / Out Patient' column the shorter name 'Patient'
df = df.rename(columns={'In / Out Patient': 'Patient'})

In [None]:
# Label-encode the categorical columns and save each fitted encoder

cols=['Species', 'Family', 'Country', 'Gender', 'Age Group', 'Speciality',
       'Source', 'Patient', 'Antibiotics']

from sklearn import preprocessing
import pickle
for column in cols:
  # A separate encoder per column, so each column gets its own label mapping.
  le = preprocessing.LabelEncoder()
  df[column] = le.fit_transform(df[column])
  # Persist the fitted encoder as '<column>.pkl' in the working directory.
  # The with-block closes the file even if pickling fails.
  name = column + '.pkl'
  with open(name, 'wb') as output:
    pickle.dump(le, output)

In [None]:
# Encode target column: Susceptible -> 0, Resistant -> 1, Intermediate -> 2
# The mapping is applied in one pass and assigned back to the column.
# Calling replace(..., inplace=True) on the df['Status'] selection is chained
# in-place mutation: it warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
status_codes = {'Susceptible': 0, 'Resistant': 1, 'Intermediate': 2}
df['Status'] = df['Status'].replace(status_codes)

In [None]:
# LightGBM model: hold out 30% of the rows, fit a classifier with default
# settings on the rest, then display its score on the held-out rows.
import lightgbm
from sklearn.model_selection import train_test_split

Y = df['Status']
X = df.drop(columns='Status')

X_train, X_test, y_train, y_test = train_test_split(
  X, Y, test_size=0.3, random_state=42
)

r = lightgbm.LGBMClassifier()
r.fit(X_train, y_train)
r.score(X_test, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.487945 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 6824657, number of used features: 10
[LightGBM] [Info] Start training from score -0.281570
[LightGBM] [Info] Start training from score -1.682200
[LightGBM] [Info] Start training from score -2.822834


0.835948392637718

In [None]:
# Save the booster of the fitted classifier `r` to the file 'lgbr_base.txt'
r.booster_.save_model('lgbr_base.txt')