In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### If the URL is unavailable, the data is also available from the HMP/T2D data portal, where it can be downloaded as a metadata file for each patient ID listed below.

In [3]:
# Fetch the subject table from the HMP2 data server. read_html returns a list
# of every table found on the page; only the first one is used here.
subject_tables = pd.read_html("http://hmp2-data.stanford.edu/script.php?table=subject")
subject_columns = ['SubjectID', 'Study', 'Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']
# Map positional column labels 0..7 onto descriptive names.
patient_data = pd.DataFrame(subject_tables[0]).rename(columns=dict(enumerate(subject_columns)))

In [4]:
# Display the loaded subject table for inspection.
patient_data

Unnamed: 0,SubjectID,Study,Race,Sex,Age,BMI,SSPG,IR_IS_classification
0,ZIS22OE,HMP,C,F,,,,Unknown
1,ZJBOZ2X,Exercise,C,F,65.30,19.82,,Unknown
2,ZJOSZHK,HMP,C,M,41.43,19.42,,Unknown
3,ZJTKAE3,HMP,C,F,58.65,31.24,162.00,IR
4,ZJXC41N,"HMP, Exercise",B,F,49.69,28.24,75.00,IS
5,ZK01UI6,HMP,A,M,57.78,,,Unknown
6,ZK112BX,"HMP, Exercise",A,F,57.72,28.89,234.00,IR
7,ZK4CK8Y,"HMP, Exercise",C,F,50.02,26.94,40.00,IS
8,ZKFV71L,"HMP, Exercise",B,F,66.08,32.54,65.00,IS
9,ZKVR426,HMP,C,F,61.95,30.59,174.00,IR


### This section adds the results of the individual patient clustering, which was performed with 5 clusters. These results were not included in the final analysis.

In [None]:
# Hard-coded clustering results: one 4x5 table per group. Each row is one
# subject and each column one of that subject's five clusters.
# NOTE(review): the values look like per-cluster counts, and the group names
# (female / obese_f / non_f / male / race) presumably describe how the subjects
# were selected -- confirm against the clustering notebook.
female = pd.DataFrame(np.array([
    [8221, 9301, 2300, 8004, 1553],
    [7541, 2688, 6529, 4827, 6744],
    [11689, 2994, 5769, 1680, 353],
    [2608, 1738, 3335, 3886, 2254],
]))

obese_f = pd.DataFrame(np.array([
    [5347, 5457, 2050, 2094, 1451],
    [4253, 3916, 7389, 5066, 7425],
    [8237, 7323, 4157, 5953, 4393],
    [6197, 6958, 8856, 2353, 8817],
]))

non_f = pd.DataFrame(np.array([
    [3465, 3393, 3364, 4029, 2042],
    [2910, 4990, 2221, 1812, 4974],
    [4220, 4022, 3111, 565, 3323],
    [7885, 1541, 7416, 3185, 1303],
]))

male = pd.DataFrame(np.array([
    [8037, 6233, 3503, 4034, 3193],
    [2346, 2077, 5607, 10518, 4452],
    [6370, 3116, 10466, 3103, 1945],
    [6047, 2258, 6191, 7480, 3024],
]))

race = pd.DataFrame(np.array([
    [4600, 2048, 1932, 5491, 4144],
    [6767, 6006, 1326, 3763, 1339],
    [6088, 7673, 2193, 553, 3493],
    [5419, 4044, 5818, 806, 3330],
]))

In [6]:
# Stack the five per-group tables into a single table, one row per subject.
# ignore_index=True gives every row a unique 0..N-1 label; without it each
# source frame's own 0-3 labels are repeated, which makes label-based access
# and alignment on full_data ambiguous.
full_data = pd.concat([female, obese_f, non_f, male, race], axis=0, ignore_index=True)
full_data = full_data.rename({0:'cluster 1', 1:'cluster 2', 2:'cluster 3', 3:'cluster 4', 4:'cluster 5'}, axis=1)
# Subject IDs are attached by position, so their order must match the row
# order produced by the concatenation above.
full_data['SubjectID'] = [
    'ZY7IW45', 'ZPEL6L3', 'ZWHMV5E', 'ZRLZ98T',
    'ZPXU188', 'ZYXQKWY', 'ZQNZSQF', 'ZY39SN0',
    'ZUF48YS', 'ZYHHR4Z', 'ZQMVU4Q', 'ZK4CK8Y',
    'ZOZOW1T', 'ZTTL3YH', 'ZUL3S6O', 'ZTXY83G',
    'ZN9YTFN', 'ZLZQMEV', 'ZLPZS0H', 'ZLYEM2C',
]

In [7]:
# Left-join the subject metadata onto the cluster table by SubjectID, then
# renumber the rows 0..N-1 and discard the previous row labels.
subject_lookup = patient_data.set_index('SubjectID')
viz_data = full_data.join(subject_lookup, on='SubjectID').reset_index(drop=True)

In [8]:
# Remove the SSPG column from the combined table.
viz_data = viz_data.drop(columns=['SSPG'])

In [9]:
# Species annotation for each cluster (c1..c5), one entry per subject in the
# same row order as the combined table. The string 'NA' marks entries with no
# annotation.
c1_sp = [
    "NA", "NA", "Bacteroides", "NA",
    "Dysosmobacter welbionis/Oscillospiraceae", "Bacteroides", "NA", "NA",
    "NA", "Ruminococcus", "Bacteroides", "NA",
    "Roseburia", "Bacteroides", "NA", "NA",
    "NA", "NA", "Lachnospiraceae", "Bacteroides",
]
c2_sp = [
    "Bacteroides", "NA", "Ruminococcus", "Ruminococcus",
    "NA", "Faecalibacterium prausnitzii", "NA", "NA",
    "Bacteroides", "NA", "Clostridium", "NA",
    "NA", "Bacteroides", "NA", "NA",
    "Bacteroides uniformis", "Faecalibacterium", "Ruminococcus faecis", "NA",
]
c3_sp = [
    "NA", "Bacteroides", "NA", "Firmicutes",
    "NA", "Bacteroides", "Bacteroides", "Bacteroides",
    "NA", "Alistipes putredinis", "Flavonifractor plautii", "NA",
    "NA", "Bacteroides", "NA", "Phascolarctobacterium faecium",
    "Firmicutes", "NA", "NA", "NA",
]
c4_sp = [
    "Eubacterium", "NA", "Roseburia intestinalis", "Alistipes/Rikenellaceae",
    "NA", "NA", "NA", "NA",
    "Bacteroides", "Ruthenibacterium lactatiformans", "Akkermansia muciniphila", "NA",
    "NA", "NA", "Bacteroides", "NA",
    "NA", "Ruminococcus", "Anaerostipes hadrus/Eubacterium", "Prevotellamassilia timonensis",
]
c5_sp = [
    "Alistipes putredinis", "NA", "Alistipes putredinis", "Alistipes shahii",
    "NA", "Bacteroides", "Blautia", "Clostridium/Lachnospiraceae",
    "Ruminococcus", "Bacteroides", "Bacteroides uniformis", "NA",
    "Bacteroides", "NA", "NA", "NA",
    "NA", "NA", "NA", "Eubacterium",
]

In [10]:
# Attach the species annotations to the combined table, one new column per
# cluster, added in C1..C5 order.
species_by_column = {
    'C1 species': c1_sp,
    'C2 species': c2_sp,
    'C3 species': c3_sp,
    'C4 species': c4_sp,
    'C5 species': c5_sp,
}
for column_name, species in species_by_column.items():
    viz_data[column_name] = species

In [11]:
# Show the combined table ordered by insulin classification (ascending).
viz_data.sort_values(by=['IR_IS_classification'])

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,SubjectID,Study,Race,Sex,Age,BMI,IR_IS_classification,C1 species,C2 species,C3 species,C4 species,C5 species
0,8221,9301,2300,8004,1553,ZY7IW45,"HMP, Exercise",C,F,69.0,36.36,IR,,Bacteroides,,Eubacterium,Alistipes putredinis
17,6767,6006,1326,3763,1339,ZLZQMEV,"HMP, Exercise",H,F,61.0,29.99,IR,,Faecalibacterium,,Ruminococcus,
2,11689,2994,5769,1680,353,ZWHMV5E,HMP,C,F,62.8,33.96,IR,Bacteroides,Ruminococcus,,Roseburia intestinalis,Alistipes putredinis
15,6047,2258,6191,7480,3024,ZTXY83G,HMP,C,M,54.37,33.46,IR,,,Phascolarctobacterium faecium,,
5,4253,3916,7389,5066,7425,ZYXQKWY,HMP,C,F,50.7,32.59,IR,Bacteroides,Faecalibacterium prausnitzii,Bacteroides,,Bacteroides
13,2346,2077,5607,10518,4452,ZTTL3YH,HMP,C,M,45.43,33.3,IR,Bacteroides,Bacteroides,Bacteroides,,
18,6088,7673,2193,553,3493,ZLPZS0H,HMP,A,M,51.13,29.31,IR,Lachnospiraceae,Ruminococcus faecis,,Anaerostipes hadrus/Eubacterium,
10,4220,4022,3111,565,3323,ZQMVU4Q,HMP,C,F,52.94,23.31,IR,Bacteroides,Clostridium,Flavonifractor plautii,Akkermansia muciniphila,Bacteroides uniformis
16,4600,2048,1932,5491,4144,ZN9YTFN,HMP,H,F,60.18,32.21,IS,,Bacteroides uniformis,Firmicutes,,
14,6370,3116,10466,3103,1945,ZUL3S6O,"HMP, Exercise",C,M,56.73,24.66,IS,,,,Bacteroides,


### Creates a metadata file that does not include the cluster information. It was intended for QIIME analysis, but it was not in the correct format.

In [16]:
# Build the metadata table by removing the cluster values, the Study column
# and the per-cluster species annotations from the combined table.
non_metadata_columns = [
    'cluster 1', 'cluster 2', 'cluster 3', 'cluster 4', 'cluster 5',
    'Study',
    'C5 species', 'C1 species', 'C2 species', 'C3 species', 'C4 species',
]
qiime_metadata = viz_data.drop(columns=non_metadata_columns)

In [18]:
# Write the metadata table as a tab-separated file, without the row index.
qiime_metadata.to_csv(
    'qiimemetadata.tsv',
    sep='\t',
    index=False,
)