In [1]:
import pandas as pd

import requests
import numpy as np
import time
import re

In [2]:
#read bacteria abundance data for all three diseases and create a single dataframe

supp_12_17_excel = "../Priya_et_al_Supplementary_Tables/Supplementary Tables S12-S17.xlsx"
supp_1_excel = "../Priya_et_al_Supplementary_Tables/Supplementary Table S1.xlsx"

crc_bacteria = pd.read_excel(supp_12_17_excel,engine='openpyxl',sheet_name='S13')
crc_bacteria.rename(columns={'Unnamed: 0': 'bacteria'}, inplace=True)

ibs_bacteria = pd.read_excel(supp_12_17_excel,engine='openpyxl',sheet_name='S17')
ibs_bacteria.rename(columns={'Unnamed: 0': 'bacteria'}, inplace=True)

ibd_bacteria = pd.read_excel(supp_12_17_excel,engine='openpyxl',sheet_name='S15')
ibd_bacteria.drop("Unnamed: 0", axis=1, inplace=True)

c1 = pd.DataFrame({'bacteria':crc_bacteria['bacteria'], 'disease': 'CRC'})
c2 = pd.DataFrame({'bacteria':ibd_bacteria['taxonomy'], 'disease': 'IBD'})
c2.drop_duplicates(subset=['bacteria'],inplace=True)
c3 = pd.DataFrame({'bacteria':ibs_bacteria['bacteria'], 'disease': 'IBS'})
bacteria = pd.concat([c1,c2,c3])
bacteria.head()

Unnamed: 0,bacteria,disease
0,Bacteria;Firmicutes;Clostridia;Clostridiales;R...,CRC
1,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,CRC
2,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,CRC
3,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,CRC
4,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales,CRC


In [3]:
#aggregate bacteria if it occurs across multiple diseases

bacteria = bacteria.groupby(['bacteria']).agg({'disease': ', '.join})
bacteria = bacteria.reset_index()

# add the number of levels in bacteria's taxonomy
bacteria['levels'] = bacteria['bacteria'].apply(lambda x: len(x.split(';')))
bacteria.head()

Unnamed: 0,bacteria,disease,levels
0,Bacteria,"CRC, IBS",1
1,Bacteria;AC1;B04R032,CRC,3
2,Bacteria;AC1;HDBW-WB69,CRC,3
3,Bacteria;AD3;ABS-6,CRC,3
4,Bacteria;AD3;JG37-AG-4,CRC,3


In [4]:
#sets last level of taxonomy if level > 6
def resetIndex(bacteria):
    taxonomy = bacteria.split(';')
    return bacteria if len(taxonomy) < 6 else taxonomy[-1]

In [5]:
#combine meta data and bacteria data
crc_metadata = pd.read_excel(supp_1_excel,engine='openpyxl',sheet_name='CRC_metadata',index_col=0)
crc_metadata.drop(columns=['MSI_status', 'SampleID', 'Patient_Blind_ID'], inplace=True)
crc_metadata.rename(columns={'Sex': 'Gender', 'Description': 'Diagnosis'}, inplace=True)
crc_metadata.index.name = 'Patient ID'

#commented this because CRC bacteria has same last level of taxonomy for multiple bacteria
# crc_bacteria['bacteria'] = crc_bacteria['bacteria'].apply(resetIndex)
crc_bacteria.set_index('bacteria', inplace=True)
crc_bacteria.index.name = ""
crc_bacteria = crc_bacteria.transpose()
crc_bacteria.index.name = "Patient ID"
crc_bacteria['Disease'] = 'CRC'

crc_combined = crc_metadata.join(crc_bacteria, on='Patient ID')
crc_combined.head()

Unnamed: 0_level_0,Diagnosis,Gender,Site,Stage,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;uniformis,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Rikenellaceae,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Porphyromonadaceae;Parabacteroides;distasonis,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;[Barnesiellaceae],...,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Dermabacteraceae,Bacteria;FCPU426,Bacteria;Chloroflexi;P2-11E,Bacteria;Proteobacteria;Betaproteobacteria;Neisseriales;Neisseriaceae;Kingella,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Mycobacteriaceae;Mycobacterium;arupense,Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Gallibacterium;genomosp.,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Mycobacteriaceae;Mycobacterium;celatum,Bacteria;Spirochaetes;GN05;LF030,Bacteria;Lentisphaerae;[Lentisphaeria];Z20;R4-45B,Disease
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B01,tumor,male,Sigmoid_colon,3,116,86,0,141,63,0,...,0,0,0,0,0,0,0,0,0,CRC
B02,normal,male,Sigmoid_colon,3,66,186,0,241,103,3,...,0,0,0,0,0,0,0,0,0,CRC
B03,tumor,female,Descending_colon,2,0,7,22,0,67,0,...,0,0,0,0,0,0,0,0,0,CRC
B04,normal,female,Descending_colon,2,14,37,0,32,1,1,...,0,0,0,0,0,0,0,0,0,CRC
B05,tumor,male,Right_colon,na,265,0,0,7,137,0,...,0,0,0,0,0,0,0,0,0,CRC


In [6]:
#combine meta data and bacteria data
ibs_metadata = pd.read_excel(supp_1_excel,engine='openpyxl',sheet_name='IBS_metadata',index_col=0)
ibs_metadata.drop_duplicates(subset=['Subject_ID'],inplace=True)
ibs_metadata.set_index('Subject_ID', inplace=True)
ibs_metadata.index.name = 'Patient ID'
ibs_metadata.rename(columns = {'Cohort': 'Diagnosis'}, inplace=True)
ibs_metadata.drop(columns=['Timepoint'], inplace=True)

ibs_bacteria['bacteria'] = ibs_bacteria['bacteria'].apply(resetIndex)
ibs_bacteria.set_index('bacteria', inplace=True)
ibs_bacteria.index.name = ""
ibs_bacteria = ibs_bacteria.transpose()
ibs_bacteria.index.name = 'Patient ID'
ibs_bacteria['Disease'] = 'IBS'

ibs_combined = ibs_metadata.join(ibs_bacteria, on='Patient ID')
ibs_combined.head()

Unnamed: 0_level_0,Diagnosis,Gender,Faecalibacterium,Blautia,Prevotella copri,Bacteroides coprocola DSM 17136,Bacteroides,Bacteria,Collinsella aerofaciens,Bifidobacterium,...,Treponema socranskii,Cloacibacillus evryensis,Cloacibacillus,Fretibacterium,Rarimicrobium hominis,Rarimicrobium,Mycoplasma canis,Defluviitoga tunisiensis,Chthoniobacter,Disease
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10007541,Healthy,F,84.0,6700.5,0.0,0.0,1935.5,5.5,0.5,269.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IBS
10007542,IBS-C,F,16.0,4956.0,1.0,0.0,45.0,2.0,1720.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IBS
10007544,IBS-D,M,240.0,4054.5,1599.5,0.0,280.0,2.0,0.5,724.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IBS
10007547,IBS-C,F,121.0,1247.0,2527.5,0.0,41.0,0.0,1814.0,1135.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IBS
10007548,IBS-C,F,2869.5,1466.0,0.5,0.0,857.0,37.5,714.5,1598.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IBS


In [7]:
#combine meta data and bacteria data
ibd_metadata = pd.read_excel(supp_1_excel,engine='openpyxl',sheet_name='IBD_metadata', index_col=0)
ibd_metadata.drop(columns=['External_ID'], inplace=True)
ibd_metadata.rename(columns = {'Biopsy_location': 'Site'}, inplace=True)
ibd_metadata.index.name = 'Patient ID'

ibd_bacteria['taxonomy'] = ibd_bacteria['taxonomy'].apply(resetIndex)
ibd_bacteria.set_index("taxonomy", inplace=True)
ibd_bacteria.index.name = ""
ibd_bacteria = ibd_bacteria.transpose()
ibd_bacteria = ibd_bacteria.groupby(by=ibd_bacteria.columns, axis=1).sum()
ibd_bacteria.index.name = "Patient ID"
ibd_bacteria['Disease'] = 'IBD'
ibd_combined = ibd_metadata.join(ibd_bacteria, on='Patient ID')
ibd_combined.head()

  ibd_bacteria['Disease'] = 'IBD'


Unnamed: 0_level_0,Site,Diagnosis,Gender,Abiotrophia,Acetanaerobacterium,Acetobacterium,Acidaminococcus,Acidovorax,Acinetobacter,Actinomyces,...,[Eubacterium] hallii group,[Eubacterium] nodatum group,[Eubacterium] rectale group,[Eubacterium] ventriosum group,[Eubacterium] xylanophilum group,[Eubacterium] yurii group,[Ruminococcus] gauvreauii group,[Ruminococcus] gnavus group,[Ruminococcus] torques group,Disease
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3002,Sigmoid Colon,CD,Female,0,0,0,4,0,2,1,...,0,0,871,42,26,0,54,0,0,IBD
C3003,Sigmoid Colon,UC,Female,0,0,0,0,0,3,4,...,46,0,1266,4,0,0,0,154,0,IBD
C3004,Sigmoid Colon,UC,Female,0,1,0,0,0,1,0,...,62,0,619,15,336,0,1,183,0,IBD
C3005,Rectum,UC,Female,2,0,0,0,0,3,4,...,408,0,673,0,0,0,0,87,0,IBD
C3006,Rectum,UC,Male,0,0,0,0,0,1,5,...,124,0,778,6,30,0,0,101,0,IBD


In [8]:
#create superset of data where columns are the list of entire bacterial data we have
combined_diseases = pd.concat([ibs_combined, crc_combined, ibd_combined])
combined_diseases.head()

Unnamed: 0_level_0,Diagnosis,Gender,Faecalibacterium,Blautia,Prevotella copri,Bacteroides coprocola DSM 17136,Bacteroides,Bacteria,Collinsella aerofaciens,Bifidobacterium,...,[Eubacterium] eligens group,[Eubacterium] hallii group,[Eubacterium] nodatum group,[Eubacterium] rectale group,[Eubacterium] ventriosum group,[Eubacterium] xylanophilum group,[Eubacterium] yurii group,[Ruminococcus] gauvreauii group,[Ruminococcus] gnavus group,[Ruminococcus] torques group
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10007541,Healthy,F,84.0,6700.5,0.0,0.0,1935.5,5.5,0.5,269.0,...,,,,,,,,,,
10007542,IBS-C,F,16.0,4956.0,1.0,0.0,45.0,2.0,1720.0,0.0,...,,,,,,,,,,
10007544,IBS-D,M,240.0,4054.5,1599.5,0.0,280.0,2.0,0.5,724.0,...,,,,,,,,,,
10007547,IBS-C,F,121.0,1247.0,2527.5,0.0,41.0,0.0,1814.0,1135.0,...,,,,,,,,,,
10007548,IBS-C,F,2869.5,1466.0,0.5,0.0,857.0,37.5,714.5,1598.0,...,,,,,,,,,,


In [9]:
#create superset of human genes for all three diseases

crc_genes = pd.read_excel(supp_12_17_excel,engine='openpyxl',sheet_name='S12', dtype=str)
crc_genes.rename(columns={'Unnamed: 0': 'genes'}, inplace=True)
crc_genes.drop_duplicates(subset=['genes'],inplace=True)
crc_genes.set_index('genes', inplace=True)
crc_genes.index.name = ""
crc_genes = crc_genes.transpose()
crc_genes['Disease'] = 'CRC'

ibd_genes = pd.read_excel(supp_12_17_excel,engine='openpyxl',sheet_name='S14')
ibd_genes.rename(columns={'Unnamed: 0': 'genes'}, inplace=True)
ibd_genes.drop_duplicates(subset=['genes'],inplace=True)
ibd_genes.set_index('genes', inplace=True)
ibd_genes.index.name = ""
ibd_genes = ibd_genes.transpose()
ibd_genes['Disease'] = 'IBD'

ibs_genes = pd.read_excel(supp_12_17_excel,engine='openpyxl',sheet_name='S16')
ibs_genes.rename(columns={'Unnamed: 0': 'genes'}, inplace=True)
ibs_genes.drop_duplicates(subset=['genes'],inplace=True)
ibs_genes.set_index('genes', inplace=True)
ibs_genes.index.name = ""
ibs_genes = ibs_genes.transpose()
ibs_genes['Disease'] = 'IBS'

crc_genes_combined = crc_metadata.join(crc_genes)
ibs_genes_combined = ibs_metadata.join(ibs_genes)
ibd_genes_combined = ibd_metadata.join(ibd_genes)

combined_genes = pd.concat([crc_genes_combined, ibs_genes_combined, ibd_genes_combined])
combined_genes.head()

Unnamed: 0_level_0,Diagnosis,Gender,Site,Stage,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,...,2022-09-12 00:00:00,2022-09-14 00:00:00,2022-09-02 00:00:00,2022-09-03 00:00:00,2022-09-04 00:00:00,2022-09-05 00:00:00,2022-09-06 00:00:00,2022-09-07 00:00:00,2022-09-08 00:00:00,2022-09-09 00:00:00
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B01,tumor,male,Sigmoid_colon,3,235,23,184,263,98,111,...,,,,,,,,,,
B02,normal,male,Sigmoid_colon,3,815,15,166,389,73,144,...,,,,,,,,,,
B03,tumor,female,Descending_colon,2,351,188,146,185,69,370,...,,,,,,,,,,
B04,normal,female,Descending_colon,2,629,12,124,231,70,53,...,,,,,,,,,,
B05,tumor,male,Right_colon,na,91,4,150,480,457,287,...,,,,,,,,,,


In [10]:
def getTaxonId(bacteria):
    global count;
    global error;
    
    count = count + 1
    taxonomy = bacteria.split(";")
    last = taxonomy[-1].split(" ");
    if (len(last) > 1):
        search_term = "+".join(last);
    else:
        search_term = "+".join([*taxonomy[-2].split(" "), *last])
    
    search_term = re.sub("[\[\]]", "", search_term)
    url = f'https://rest.uniprot.org/taxonomy/stream?fields=id&format=tsv&query=({search_term})'
    
    try: 
        response = requests.get(url)
        print(f'processing..{count} - search term: {search_term}')

        if (response.status_code == 200):
#             print(response.text.split("\n")[1:-1])
            return response.text.split("\n")[1:-1]
        else:
            print(f'{response.status_code} for {search_term}!')
            error.append(bacteria)
            return 'NA'
    except Exception as e:
        print(f'error occurred for {search_term}!, {e}')
        error.append(bacteria)

In [11]:
#bacteria for which we will be getting the genes. Currently the level is set to 6 
#NOTE: IBD bacteria are having levels <= 6

higher_level_bacteria = bacteria[bacteria['levels'] > 6]
count = 0;
error = []
start = time.time()

#first get the taxon IDs for each bacteria
higher_level_bacteria['taxon id'] = higher_level_bacteria['bacteria'].map(getTaxonId)

totalTime = time.time()-start
print(f'Time taken for {len(higher_level_bacteria)} bacteria: {totalTime} seconds')

processing..1 - search term: Actinomyces+bowdenii
processing..2 - search term: Actinomyces+canis
processing..3 - search term: Actinomyces+cardiffensis
processing..4 - search term: Actinomyces+dentalis
processing..5 - search term: Actinomyces+georgiae
processing..6 - search term: Actinomyces+graevenitzii
processing..7 - search term: Actinomyces+israelii
processing..8 - search term: Actinomyces+johnsonii
processing..9 - search term: Actinomyces+massiliensis+4401292
processing..10 - search term: Actinomyces+meyeri
processing..11 - search term: Actinomyces+odontolyticus
processing..12 - search term: Actinomyces+polynesiensis
processing..13 - search term: Actinomyces+turicensis
processing..14 - search term: Actinomyces+europaeus
processing..15 - search term: Actinomyces+hyovaginalis
processing..16 - search term: Mobiluncus+curtisii
processing..17 - search term: Mobiluncus+mulieris
processing..18 - search term: Corynebacterium+kroppenstedtii
processing..19 - search term: Corynebacterium+lubr

processing..149 - search term: Cryptobacterium+curtum
processing..150 - search term: Eggerthella+lenta
processing..151 - search term: Gordonibacter+urolithinfaciens
processing..152 - search term: Raoultibacter+massiliensis
processing..153 - search term: Slackia+exigua+ATCC+700122
processing..154 - search term: Slackia+faecicanis
processing..155 - search term: Slackia+isoflavoniconvertens
processing..156 - search term: Slackia+piriformis
processing..157 - search term: Slackia+piriformis+YIT+12062
processing..158 - search term: Hydrogenobacter+hydrogenophilus
processing..159 - search term: Thermocrinis+ruber
processing..160 - search term: Persephonella+hydrogeniphila
processing..161 - search term: Bacteroides+caccae
processing..162 - search term: Bacteroides+caecimuris
processing..163 - search term: Bacteroides+clarus
processing..164 - search term: Bacteroides+clarus+YIT+12056
processing..165 - search term: Bacteroides+coprocola
processing..166 - search term: Bacteroides+coprocola+DSM+17

processing..296 - search term: Capnocytophaga+granulosa+ATCC+51502
processing..297 - search term: Capnocytophaga+haemolytica
processing..298 - search term: Capnocytophaga+leadbetteri
processing..299 - search term: Capnocytophaga+ochracea
processing..300 - search term: Capnocytophaga+sputigena
processing..301 - search term: Capnocytophaga+sputigena+ATCC+33612
processing..302 - search term: Capnocytophaga+ochracea
processing..303 - search term: Chryseobacterium+jeonii
processing..304 - search term: Flavobacterium+caeni
processing..305 - search term: Elizabethkingia+meningoseptica
processing..306 - search term: Sphingobacterium+hotanense
processing..307 - search term: Sphingobacterium+multivorum
processing..308 - search term: Anaerolinea+thermophila
processing..309 - search term: Ardenscatena+maritimus
processing..310 - search term: Oscillochloris+trichoides
processing..311 - search term: Geovibrio+thiophilus
processing..312 - search term: Mucispirillum+schaedleri
processing..313 - search

processing..442 - search term: Catabacter+hongkongensis
processing..443 - search term: Christensenella+minuta
processing..444 - search term: Christensenella+timonensis
processing..445 - search term: Butyricicoccus+desmolans
processing..446 - search term: Butyricicoccus+pullicaecorum
processing..447 - search term: Clostridium+aurantibutyricum
processing..448 - search term: Clostridium+bornimense
processing..449 - search term: Clostridium+butyricum
processing..450 - search term: Clostridium+chartatabidum
processing..451 - search term: Clostridium+paraputrificum
processing..452 - search term: Clostridium+perfringens
processing..453 - search term: Clostridium+perfringens+ATCC+13124
processing..454 - search term: Clostridium+saccharobutylicum
processing..455 - search term: Clostridium+septicum
processing..456 - search term: Clostridium+tertium
processing..457 - search term: Clostridium+ventriculi
processing..458 - search term: Clostridium+acetobutylicum
processing..459 - search term: Clostr

processing..591 - search term: Clostridioides+difficile+ATCC+9689+=+DSM+1296
processing..592 - search term: Filifactor+alocis
processing..593 - search term: Intestinibacter+bartlettii
processing..594 - search term: Eubacterium+tenue
processing..595 - search term: Peptoanaerobacter+stomatis
processing..596 - search term: Eubacterium+yurii
processing..597 - search term: Peptostreptococcus+anaerobius
processing..598 - search term: Peptostreptococcus+stomatis
processing..599 - search term: Peptostreptococcus+anaerobius
processing..600 - search term: Romboutsia+sedimentorum
processing..601 - search term: Romboutsia+timonensis
processing..602 - search term: Terrisporobacter+petrolearius
processing..603 - search term: Clostridium+difficile
processing..604 - search term: Acutalibacter+muris
processing..605 - search term: Anaeromassilibacillus+senegalensis
processing..606 - search term: Anaerotruncus+colihominis
processing..607 - search term: Anaerotruncus+rubiinfantis
processing..608 - search 

processing..735 - search term: Leptotrichia+hofstadii
processing..736 - search term: Leptotrichia+trevisanii
processing..737 - search term: Leptotrichia+wadei
processing..738 - search term: Sebaldella+termitidis
processing..739 - search term: Victivallis+vadensis
processing..740 - search term: Victivallis+vadensis+ATCC+BAA-548
processing..741 - search term: Victivallis+vadensis
processing..742 - search term: Acanthamoeba+castellanii
processing..743 - search term: Acanthamoeba+healyi
processing..744 - search term: Acanthamoeba+polyphaga
processing..745 - search term: Angiopteris+lygodiifolia
processing..746 - search term: Calycanthus+floridus
processing..747 - search term: Carludovica+palmata
processing..748 - search term: Desmarestia+viridis
processing..749 - search term: Didymeles+perrieri
processing..750 - search term: Galbulimima+belgraveana
processing..751 - search term: Lepidoceras+chilense
processing..752 - search term: Lupinus+luteus
processing..753 - search term: Myristica+frag

processing..887 - search term: Acinetobacter+lwoffii
processing..888 - search term: Acinetobacter+rhizosphaerae
processing..889 - search term: Acinetobacter+schindleri
processing..890 - search term: Agitococcus+lubricus
processing..891 - search term: Enhydrobacter+aerosaccus
processing..892 - search term: Moraxella+cuniculi
processing..893 - search term: Moraxella+osloensis
processing..894 - search term: Psychrobacter+marincola
processing..895 - search term: Psychrobacter+pacificensis
processing..896 - search term: Psychrobacter+pulmonis
processing..897 - search term: Pseudomonas+aeruginosa
processing..898 - search term: Pseudomonas+alcaligenes
processing..899 - search term: Pseudomonas+balearica
processing..900 - search term: Pseudomonas+citronellolis
processing..901 - search term: Pseudomonas+fragi
processing..902 - search term: Pseudomonas+mendocina
processing..903 - search term: Pseudomonas+nitroreducens
processing..904 - search term: Pseudomonas+pseudoalcaligenes
processing..905 -

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  higher_level_bacteria['taxon id'] = higher_level_bacteria['bacteria'].map(getTaxonId)


In [14]:
# save to excel!
higher_level_bacteria.to_excel("../data/taxonId.xlsx")

In [21]:
#get genes based on taxon ID
def getGenes(taxonIds):
    global count;
    global error;
    count = count + 1
    
    # iterate on taxon ID until we get some data.
    # POSSIBLE IMPROVEMENT: iterate until we get enough data (more genes insetad of a few)
    for taxonId in taxonIds:
        url = f'https://rest.uniprot.org/uniprotkb/stream?fields=gene_names&format=tsv&query=((organism_id:{taxonId}))'
        response = requests.get(url)

        try:
            response = requests.get(url)
            print(f'processing..{count} - for taxon ID: {taxonId}')

            if (response.status_code == 200):
                # avoid buffer characters like quotes
                genes = response.text.split("\n")[1:-1]
                
                if (len(genes) > 0):
                    db1 = []
                    db2 = []
                    for x in genes:
                        sublist = x.split(" ")
                        for y in sublist:
                            if (y.strip() != ''):
                                if ('_' in y):
                                    db2.append(y)
                                else:
                                    db1.append(y)

                    return [db1, db2]
            else:
                print(f'{response.status_code} for {taxonId}!')
        except requests.exceptions.ChunkedEncodingError:
            print('Chunk encoding error!')
        except:
            print(f'error occurred for {taxonId}!')
    return [[], []]

In [None]:
# get genes for each bacteria based on taxon ID. The first taxon ID which yeilds results is used.
# As uniprot has different DBs from which genes are pulled in, 
# we create 2 columns - one for gene names having '_' and second for others

count = 0;
error = []
start = time.time()
higher_level_bacteria['DB1 genes'], higher_level_bacteria['DB2 genes'] = \
zip(*higher_level_bacteria['taxon id'].map(getGenes))
print(f'Time taken for {len(higher_level_bacteria)} bacteria: ', time.time()-start)

processing..1 - for taxon ID: 131109
processing..2 - for taxon ID: 100469
processing..3 - for taxon ID: 888050
processing..4 - for taxon ID: 272548
processing..4 - for taxon ID: 1120941
processing..5 - for taxon ID: 1125717
processing..6 - for taxon ID: 55565
processing..7 - for taxon ID: 1659
processing..8 - for taxon ID: 544581
processing..9 - for taxon ID: 1167628
processing..9 - for taxon ID: 461393
processing..10 - for taxon ID: 52773
processing..11 - for taxon ID: 562972
processing..11 - for taxon ID: 649742
processing..12 - for taxon ID: 1325934
processing..13 - for taxon ID: 883077
processing..14 - for taxon ID: 883069
processing..15 - for taxon ID: 29316
processing..16 - for taxon ID: 585198
processing..16 - for taxon ID: 2051
processing..17 - for taxon ID: 2052
processing..18 - for taxon ID: 645127
processing..19 - for taxon ID: 541095
processing..19 - for taxon ID: 1121363
processing..20 - for taxon ID: 47846
processing..21 - for taxon ID: 499555
processing..22 - for taxon I

processing..185 - for taxon ID: 449673
processing..186 - for taxon ID: 997889
processing..186 - for taxon ID: 997890
processing..186 - for taxon ID: 820
processing..187 - for taxon ID: 702446
processing..188 - for taxon ID: 435590
processing..189 - for taxon ID: 997892
processing..190 - for taxon ID: 657309
processing..191 - for taxon ID: 384636
processing..192 - for taxon ID: 85831
processing..193 - for taxon ID: 997873
processing..194 - for taxon ID: 483216
processing..195 - for taxon ID: 295405
processing..196 - for taxon ID: 702444
processing..196 - for taxon ID: 997885
processing..196 - for taxon ID: 997886
processing..197 - for taxon ID: 484018
processing..198 - for taxon ID: 997889
processing..198 - for taxon ID: 997890
processing..198 - for taxon ID: 820
processing..199 - for taxon ID: 742726
processing..200 - for taxon ID: 742726
processing..201 - for taxon ID: 880074
processing..202 - for taxon ID: 1349822
processing..203 - for taxon ID: 1349822
processing..204 - for taxon ID

In [None]:
#clean df to remove empty lists

def remove_empty(geneList):
    if '' in geneList: 
        geneList.remove('')
        
higher_level_bacteria['DB1 genes'].apply(remove_empty)
higher_level_bacteria['DB2 genes'].apply(remove_empty)
higher_level_bacteria.to_csv("../data/bacteria_genes.csv")

In [None]:
# find common genes between each pair of bacteria
# for each bacteria, common gene with another bacteria is stored as a dictionary with key as 'index' for 
# the other bacteria, and values having the common genes

length = len(higher_level_bacteria)
higher_level_bacteria['gene subset DB1'] = [{} for _ in range(len(higher_level_bacteria))]
higher_level_bacteria['gene subset DB2'] = [{} for _ in range(len(higher_level_bacteria))]

DB1_genes = higher_level_bacteria['DB1 genes']
DB2_genes = higher_level_bacteria['DB2 genes']
start = time.time()

for i in range(length):
    print('working on row ', i)
    for j in range(i+1, length):
        db1_common = set(DB1_genes.iloc[i]).intersection(DB1_genes.iloc[j])
        db2_common = set(DB2_genes.iloc[i]).intersection(DB2_genes.iloc[j])
        
        if (len(db1_common) != 0):
            current_dict_i = higher_level_bacteria['gene subset DB1'].iloc[i]
            current_dict_i[str(j)] = db1_common

            current_dict_j = higher_level_bacteria['gene subset DB1'].iloc[j]
            current_dict_j[str(i)] = db1_common
        
        if (len(db2_common) != 0):
            current_dict_i = higher_level_bacteria['gene subset DB2'].iloc[i]
            current_dict_i[str(j)] = db2_common

            current_dict_j = higher_level_bacteria['gene subset DB2'].iloc[j]
            current_dict_j[str(i)] = db2_common

print('time taken', time.time() - start)          

In [None]:
# Add length of common genes for each pair of bacteria
#IMPROVEMENT: This can also be viewed as an N*N matrix

higher_level_bacteria['DB1 length'] = higher_level_bacteria['DB1 genes'].apply(lambda x: len(x))
higher_level_bacteria['DB1 subset length'] = higher_level_bacteria['gene subset DB1'].apply(lambda x: len(x))

In [None]:
higher_level_bacteria.to_csv("../data/bacteria_genes.csv")

In [None]:
higher_level_bacteria.head()