In [1]:
import csv
import pandas as pd
import os
import numpy as np

### Set file paths

In [2]:
## directory that holds every file this notebook reads or writes
data_dir = '/global/cfs/cdirs/kbase/KE-Catboost/ziming/GO/data/go_aggregated_4.1'

## inputs: the aggregated GO table (TSV and pickle variants) and the
## worksheet used to relabel 'root:Mixed' samples
input_file_tsv, input_file_pickle, mixed_label_file = (
    os.path.join(data_dir, file_name)
    for file_name in (
        'go_aggregated_4.1.tsv',
        'go_aggregated_4.1.pkl',
        'MGnify_root_mixed_worksheet-Full.tsv',
    )
)

## outputs: the relabelled table, written in both formats
output_file_tsv, output_file_pickle = (
    os.path.join(data_dir, file_name)
    for file_name in (
        'go_aggregated_4.1_mixed_updated.tsv',
        'go_aggregated_4.1_mixed_updated.pkl',
    )
)

### Load original dataset

In [3]:
%%time
if os.path.exists(input_file_pickle):
    df = pd.read_pickle(input_file_pickle)
else:
    with open(input_file_tsv, 'r') as f:
        df = pd.read_csv(f, sep="\t")
        df.to_pickle(input_file_pickle)
df

CPU times: user 449 ms, sys: 1.27 s, total: 1.72 s
Wall time: 1.86 s


Unnamed: 0,id,study_id,sample_id,biome,exptype,version,GO:0043130,GO:0055074,GO:0055117,GO:0046933,...,GO:0019357,GO:0006527,GO:0004114,GO:0046423,GO:0034194,GO:0032183,GO:0007618,GO:0030097,GO:0004520,GO:0033739
19,ERZ650344,MGYS00003358,SRS3210273,root:Environmental:Aquatic:Marine:Brackish,assembly,4.1,3,0,0,342,...,2,79,90,0,20,0,0,0,87,38
27,ERZ747221,MGYS00004737,SRS1791943,root:Engineered:Biogas plant,assembly,4.1,49,0,0,450,...,2,110,0,0,15,0,0,0,204,34
29,ERZ761615,MGYS00003390,SRS1984850,root:Engineered:Bioreactor,assembly,4.1,2,0,0,270,...,33,102,2,0,43,0,0,0,93,49
30,ERR2193276,MGYS00005065,ERS2001073,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,26,...,0,12,1,0,15,0,0,0,8,5
31,ERR2193277,MGYS00005065,ERS2001074,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,31,...,0,13,1,0,20,0,0,0,10,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73785,ERZ807429,MGYS00005443,ERS1960449,root:Host-associated:Human:Digestive system:Oral,assembly,4.1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,1
74255,ERZ795020,MGYS00004901,SRS785549,root:Engineered:Wastewater,assembly,4.1,0,0,0,115,...,1,10,0,0,1,0,0,0,41,8
74256,ERZ505245,MGYS00003223,SRS1589217,root:Environmental:Aquatic:Marine,assembly,4.1,1,0,0,336,...,3,54,1,0,7,0,0,0,88,56
74261,ERZ505291,MGYS00003237,SRS1589396,root:Environmental:Aquatic:Marine,assembly,4.1,2,0,0,299,...,2,58,0,0,11,0,0,0,74,53


### Load biome mapping file

In [4]:
## read the worksheet, then replace every missing cell with the literal
## string 'Null' so later steps can filter on a plain string comparison
with open(mixed_label_file, 'r') as worksheet_handle:
    df_ = pd.read_csv(worksheet_handle, sep="\t")
df_mixed_label = df_.where(df_.notna(), 'Null')
df_mixed_label

Unnamed: 0,id,top-level,environment-biome,environment-feature,environment-material
0,ERS3792294,root:Mixed,Sediment,Sediment,Sediment
1,SRS589645,root:Mixed,Soil,Null,Null
2,SRS589650,root:Mixed,Soil,Null,Null
3,SRS589658,root:Mixed,Soil,Null,Null
4,SRS589669,root:Mixed,plant,Null,Null
...,...,...,...,...,...
17629,SRS3293561,root:Mixed,Null,Null,Null
17630,ERS2584481,root:Mixed,temperate grassland,soil,Caterpillar
17631,ERS2468507,root:Mixed,Null,Null,Null
17632,SRS3293559,root:Mixed,Null,Null,Null


### Update biomes

In [5]:
## build a dictionary: sample id (first worksheet column) -> biome label
## the label is every remaining column joined with ':', skipping the 'Null'
## placeholders that stand in for empty worksheet cells
## itertuples(index=False, name=None) yields one plain tuple of column values
## per row, so the first element is unpacked positionally without relying on
## integer keys into a string-labelled Series (deprecated in pandas 2.x)
mixed_label_dic = {}
for sample_id, *labels in df_mixed_label.itertuples(index=False, name=None):
    mixed_label_dic[sample_id] = ':'.join(
        label for label in labels if label != 'Null'
    )
mixed_label_dic

{'ERS3792294': 'root:Mixed:Sediment:Sediment:Sediment',
 'SRS589645': 'root:Mixed:Soil',
 'SRS589650': 'root:Mixed:Soil',
 'SRS589658': 'root:Mixed:Soil',
 'SRS589669': 'root:Mixed:plant',
 'SRS589678': 'root:Mixed:plant',
 'SRS589838': 'root:Mixed:Soil',
 'SRS589840': 'root:Mixed:Soil',
 'SRS589864': 'root:Mixed:Soil',
 'ERS2304136': 'root:Mixed:urban biome:building:dust',
 'ERS2301850': 'root:Mixed:village biome:human-associated habitat:sebum',
 'ERS2301922': 'root:Mixed:village biome:animal-associated habitat:saliva',
 'ERS2301672': 'root:Mixed:village biome:human-associated habitat:sebum',
 'ERS2309271': 'root:Mixed:urban biome:building:dust',
 'ERS2301533': 'root:Mixed:village biome:human-associated habitat:sebum',
 'ERS2301555': 'root:Mixed:village biome:human-associated habitat:mucus',
 'ERS2301556': 'root:Mixed:village biome:human-associated habitat:saliva',
 'ERS2301557': 'root:Mixed:village biome:human-associated habitat:sebum',
 'ERS2301558': 'root:Mixed:village biome:human-

In [6]:
## keep only the rows whose biome is exactly 'root:Mixed'
df_mixed_biome = df.loc[df['biome'].eq('root:Mixed')]
df_mixed_biome

Unnamed: 0,id,study_id,sample_id,biome,exptype,version,GO:0043130,GO:0055074,GO:0055117,GO:0046933,...,GO:0019357,GO:0006527,GO:0004114,GO:0046423,GO:0034194,GO:0032183,GO:0007618,GO:0030097,GO:0004520,GO:0033739
23065,ERR3572850,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,49,0,0,3678,...,0,820,24,0,223,0,0,0,1275,454
23066,ERR3572860,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,38,0,0,2647,...,0,615,39,0,135,0,0,0,964,359
23067,ERR3572870,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,63,0,0,3199,...,0,726,80,0,243,0,0,0,1097,424
23068,ERR3572880,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,46,0,0,2676,...,0,570,144,0,183,0,0,0,894,347
23069,ERR3572900,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,38,0,0,3206,...,0,712,30,0,266,0,0,0,1097,424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25054,ERR3574889,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,0,0,0,1815,...,0,321,10,0,3,0,0,0,695,326
25055,ERR3574899,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,0,0,0,2080,...,0,363,9,0,5,0,0,0,729,382
25056,ERR3574909,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,0,0,0,1357,...,0,170,5,0,30,0,0,0,1074,62
25057,ERR3574919,MGYS00005495,ERS3792294,root:Mixed,metagenomic,4.1,1,0,0,1411,...,0,147,4,0,22,0,0,0,1076,45


In [7]:
## update targeted biomes in the original dataset
## Only rows that are currently 'root:Mixed' AND whose sample_id has an entry
## in mixed_label_dic are overwritten; every other row keeps its biome.
## Done with a boolean mask instead of a row loop: the loop built a full
## Series over every column for each row just to read 'sample_id', and its
## label-based df.loc[i, ...] write would hit every row sharing that label
## if the index were not unique.
is_mixed = (df['biome'] == 'root:Mixed').to_numpy()
## sample ids without a mapping entry come back as NaN
mapped_biome = df['sample_id'].map(mixed_label_dic)
needs_update = is_mixed & mapped_biome.notna().to_numpy()
## assign raw values (not a Series) so the write is positional, not index-aligned
df.loc[needs_update, 'biome'] = mapped_biome.to_numpy()[needs_update]
df

Unnamed: 0,id,study_id,sample_id,biome,exptype,version,GO:0043130,GO:0055074,GO:0055117,GO:0046933,...,GO:0019357,GO:0006527,GO:0004114,GO:0046423,GO:0034194,GO:0032183,GO:0007618,GO:0030097,GO:0004520,GO:0033739
19,ERZ650344,MGYS00003358,SRS3210273,root:Environmental:Aquatic:Marine:Brackish,assembly,4.1,3,0,0,342,...,2,79,90,0,20,0,0,0,87,38
27,ERZ747221,MGYS00004737,SRS1791943,root:Engineered:Biogas plant,assembly,4.1,49,0,0,450,...,2,110,0,0,15,0,0,0,204,34
29,ERZ761615,MGYS00003390,SRS1984850,root:Engineered:Bioreactor,assembly,4.1,2,0,0,270,...,33,102,2,0,43,0,0,0,93,49
30,ERR2193276,MGYS00005065,ERS2001073,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,26,...,0,12,1,0,15,0,0,0,8,5
31,ERR2193277,MGYS00005065,ERS2001074,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,31,...,0,13,1,0,20,0,0,0,10,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73785,ERZ807429,MGYS00005443,ERS1960449,root:Host-associated:Human:Digestive system:Oral,assembly,4.1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,1
74255,ERZ795020,MGYS00004901,SRS785549,root:Engineered:Wastewater,assembly,4.1,0,0,0,115,...,1,10,0,0,1,0,0,0,41,8
74256,ERZ505245,MGYS00003223,SRS1589217,root:Environmental:Aquatic:Marine,assembly,4.1,1,0,0,336,...,3,54,1,0,7,0,0,0,88,56
74261,ERZ505291,MGYS00003237,SRS1589396,root:Environmental:Aquatic:Marine,assembly,4.1,2,0,0,299,...,2,58,0,0,11,0,0,0,74,53


In [8]:
## keep the samples whose biome is not 'root:Mixed'
df_ = df.loc[df['biome'].ne('root:Mixed')]

In [9]:
## write the filtered table to disk, first as tab-separated text and then
## as a pickle (to_csv keeps its default of also writing the row index)
df_.to_csv(path_or_buf=output_file_tsv, sep='\t')
df_.to_pickle(path=output_file_pickle)

In [10]:
## display df_ as the cell output
df_

Unnamed: 0,id,study_id,sample_id,biome,exptype,version,GO:0043130,GO:0055074,GO:0055117,GO:0046933,...,GO:0019357,GO:0006527,GO:0004114,GO:0046423,GO:0034194,GO:0032183,GO:0007618,GO:0030097,GO:0004520,GO:0033739
19,ERZ650344,MGYS00003358,SRS3210273,root:Environmental:Aquatic:Marine:Brackish,assembly,4.1,3,0,0,342,...,2,79,90,0,20,0,0,0,87,38
27,ERZ747221,MGYS00004737,SRS1791943,root:Engineered:Biogas plant,assembly,4.1,49,0,0,450,...,2,110,0,0,15,0,0,0,204,34
29,ERZ761615,MGYS00003390,SRS1984850,root:Engineered:Bioreactor,assembly,4.1,2,0,0,270,...,33,102,2,0,43,0,0,0,93,49
30,ERR2193276,MGYS00005065,ERS2001073,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,26,...,0,12,1,0,15,0,0,0,8,5
31,ERR2193277,MGYS00005065,ERS2001074,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,31,...,0,13,1,0,20,0,0,0,10,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73785,ERZ807429,MGYS00005443,ERS1960449,root:Host-associated:Human:Digestive system:Oral,assembly,4.1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,1
74255,ERZ795020,MGYS00004901,SRS785549,root:Engineered:Wastewater,assembly,4.1,0,0,0,115,...,1,10,0,0,1,0,0,0,41,8
74256,ERZ505245,MGYS00003223,SRS1589217,root:Environmental:Aquatic:Marine,assembly,4.1,1,0,0,336,...,3,54,1,0,7,0,0,0,88,56
74261,ERZ505291,MGYS00003237,SRS1589396,root:Environmental:Aquatic:Marine,assembly,4.1,2,0,0,299,...,2,58,0,0,11,0,0,0,74,53
