# Antibiotic Resistance Prediction

Let's start by loading the data, which lists bacterial genomes and the antibiotics each one was tested against (including whether it is resistant).

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lens
%matplotlib inline

# Read the tab-separated AMR table. Every column is loaded as a string
# (dtype=str), so no column is converted to a numeric type on load.
dat = pd.read_csv(
    "PATRIC_genomes_AMR.tsv",
    delimiter='\t',
    dtype=str,
)
dat

Unnamed: 0,genome_id,genome_name,taxon_id,antibiotic,resistant_phenotype,measurement,measurement_sign,measurement_value,measurement_unit,laboratory_typing_method,laboratory_typing_method_version,laboratory_typing_platform,vendor,testing_standard,testing_standard_year,source
0,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,ampicillin,,==16,==,16,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
1,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,ceftiofur,,>8,>,8,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
2,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,chlortetracycline,,==8,==,8,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
3,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,clindamycin,,>16,>,16,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
4,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,danofloxacin,,==1,==,1,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
5,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,enrofloxacin,,==1,==,1,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
6,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,florfenicol,,==8,==,8,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
7,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,gentamicin,,>16,>,16,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
8,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,neomycin,,==32,==,32,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,
9,32002.4,Achromobacter denitrificans strain USDA-ARS-US...,32002,oxytetracycline,,>8,>,8,mg/L,MIC,BOPO6F plate; cattle host,Sensititre,TREK Diagnostic Systems,CLSI,,


We only really care about the antibiotic/bacteria pairs and not how the measurements were taken, who took them, etc., so let's drop all the irrelevant columns. Also, let's drop any data points that don't list a resistant phenotype (i.e., whether the genome is susceptible or resistant to an antibiotic).

In [2]:
# Keep only the identifying columns plus the phenotype label, then discard
# every row that has a missing value in any of them, and report the loss.
keep_cols = ["genome_id", "genome_name", "taxon_id", "antibiotic", "resistant_phenotype"]
orig_rows = len(dat)
dat = dat.loc[:, keep_cols].dropna(axis=0, how="any")
dropped_rows = orig_rows - len(dat)
print("Dropped {} rows of the original {}".format(dropped_rows, orig_rows))
dat

Dropped 15788 rows of the original 125389


Unnamed: 0,genome_id,genome_name,taxon_id,antibiotic,resistant_phenotype
18,1310800.122,Acinetobacter baumannii 1000160,1310800,imipenem,Susceptible
19,1310784.3,Acinetobacter baumannii 1007214,1310784,carbapenem,Susceptible
20,1310784.3,Acinetobacter baumannii 1007214,1310784,imipenem,Susceptible
21,1310751.3,Acinetobacter baumannii 1022959,1310751,carbapenem,Resistant
22,1310751.3,Acinetobacter baumannii 1022959,1310751,imipenem,Resistant
23,1310586.3,Acinetobacter baumannii 1031433,1310586,carbapenem,Resistant
24,1310586.3,Acinetobacter baumannii 1031433,1310586,imipenem,Resistant
25,1310571.3,Acinetobacter baumannii 1032241,1310571,carbapenem,Resistant
26,1310571.3,Acinetobacter baumannii 1032241,1310571,imipenem,Resistant
27,1310572.3,Acinetobacter baumannii 1032359,1310572,carbapenem,Resistant


Let's do a summary of our data. Lens does a ton of upfront computation, so this might take a few minutes.

In [53]:
# Summarise `dat` with lens, wrap the summary in an explorer object, and
# display the explorer's per-column description table.
# NOTE(review): the summarise step is reportedly slow (minutes); consider
# caching its result so a full re-run stays feasible — confirm the cost.
ls = lens.summarise(dat)
le = lens.explore(ls)
le.describe()

0,1,2,3,4,5
,genome_id,genome_name,taxon_id,antibiotic,resistant_phenotype
desc,,,,categorical,categorical
dtype,object,object,object,object,object
notnulls,109601,109601,109601,109601,109601
nulls,0,0,0,0,0
unique,15471,15436,3027,106,6


In [55]:
# List the distinct phenotype labels present in the table.
dat["resistant_phenotype"].unique()

array(['Susceptible', 'Resistant', 'Intermediate', 'Non-susceptible',
       'Not defined', 'RS'], dtype=object)

Now let's download the protein coding sequences for all the genomes. This is about 55 GB and takes hours to download.

In [None]:
import ftplib
import os

# Download the .PATRIC.ffn file for every genome id in `dat` into ./sequences/.
# The loop is resumable: a genome is skipped when its final file already
# exists, and a file only appears under its final name once the transfer has
# completed, so an interrupted run never leaves a truncated file that a later
# run would mistake for a finished download.
SEQUENCE_DIR = './sequences/'

# open() below fails if the target directory is missing, so create it up front.
if not os.path.isdir(SEQUENCE_DIR):
    os.makedirs(SEQUENCE_DIR)

for genome_id in dat.genome_id.unique():
    file_nm = genome_id + '.PATRIC.ffn'
    final_path = SEQUENCE_DIR + file_nm
    if os.path.isfile(final_path):
        continue  # already fully downloaded on a previous run

    # Write to a temporary name first; it is overwritten on retry ('wb').
    partial_path = final_path + '.part'

    conn = ftplib.FTP('ftp.patricbrc.org')
    try:
        conn.login()
        conn.cwd('/patric2/genomes/' + genome_id + '/')
        # The with-block guarantees the file is flushed and closed before the
        # rename, even when the transfer raises.
        with open(partial_path, 'wb') as out_file:
            conn.retrbinary('RETR ' + file_nm, out_file.write)
        conn.quit()
    finally:
        # Release the socket even when login/cwd/retrbinary/quit raised;
        # close() is safe to call after a successful quit().
        conn.close()

    # Only reached after a complete transfer. The destination does not exist
    # (checked above), so a plain rename is sufficient.
    os.rename(partial_path, final_path)