In [1]:
import os
import pandas as pd

%matplotlib inline

In [2]:
# Study identifiers this notebook can process; the entry at index 4 is selected.
study_names = (
    'ana-flavia-NCxNR',
    'ana-flavia-STD-NCxSTD-NR',
    'ana-flavia-HSD-NCxHSD-NR',
    'ana-flavia-STD-NCxHSD-NC',
    'ana-flavia-STD-NRxHSD-NR',
)
study = study_names[4]

# Relative path of the directory that holds the raw input tables.
data_path = os.path.join('..', 'data', 'raw')

## Data ingestion

In [3]:
# Read the tab-separated table of sample ids and sample names,
# casting the id column to 64-bit integers.
samples_names_fpath = os.path.join(data_path, 'sample_id-sample_name.tsv')
samples_names_df = (
    pd.read_csv(samples_names_fpath, sep='\t')
    .astype({'sample-id': 'int64'})
)
print(samples_names_df)

       sample-id sample-name
0   210421121673    #1 N9.01
1   210421121674   #2 N11.01
2   210421121675    #3 N9.05
3   210421121676   #4 N11.02
4   210421121677    #5 N9.03
..           ...         ...
69  210707163913  #70 7d.AM2
70  210707163914   #71 7d.C3
71  210707163915   #72 7d.M3
72  210707163916   #73 7d.A3
73  210707163917  #74 7d.AM3

[74 rows x 2 columns]


In [4]:
# Read the tab-separated table with one row per (sample, read direction),
# casting the id column to 64-bit integers.
samples_paths_fpath = os.path.join(data_path, 'samples-paths.tsv')
samples_paths_df = (
    pd.read_csv(samples_paths_fpath, sep='\t')
    .astype({'sample-id': 'int64'})
)
print(samples_paths_df)

        sample-id                                  absolute-filepath direction
0    210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
1    210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
2    210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
3    210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
4    210421121675  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
..            ...                                                ...       ...
143  210707163915  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
144  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
145  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
146  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
147  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse

[148 rows x 3 columns]


## Data transform

Combine the DataFrames so that all sample data is organized in a single DataFrame.

In [5]:
# Reshape the long table (one row per sample and read direction) into one row
# per sample with separate forward and reverse path columns.
#
# Reads are paired by `sample-id` and the `direction` column rather than by row
# position, so the result does not depend on the file listing each sample as
# consecutive forward/reverse rows.
cols = ['sample-id', 'forward-path', 'reverse-path']
df = samples_paths_df

forward_df = (
    df.loc[df['direction'] == 'forward', ['sample-id', 'absolute-filepath']]
    .rename(columns={'absolute-filepath': 'forward-path'})
)
reverse_df = (
    df.loc[df['direction'] == 'reverse', ['sample-id', 'absolute-filepath']]
    .rename(columns={'absolute-filepath': 'reverse-path'})
)

# An inner merge keeps the order of the forward rows; `validate` raises if a
# sample is listed more than once for the same direction.
new_df = (
    forward_df
    .merge(reverse_df, on='sample-id', how='inner', validate='one_to_one')
    [cols]
    .astype({'sample-id': 'int64'})
)

# Fail loudly instead of silently dropping samples that lack one of the reads
# (or whose direction value is neither 'forward' nor 'reverse').
unpaired = set(df['sample-id']) - set(new_df['sample-id'])
if unpaired:
    raise ValueError(
        f'samples without both a forward and a reverse read: {sorted(unpaired)}'
    )

print(new_df)

       sample-id                                       forward-path  \
0   210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
1   210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
2   210421121675  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
3   210421121676  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
4   210421121677  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
..           ...                                                ...   
69  210707163913  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
70  210707163914  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
71  210707163915  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
72  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
73  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   

                                         reverse-path  
0   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...  
1   /home/lauro/nupeb/dados_brutos_

In [6]:
# Attach the forward/reverse paths to the sample names by matching on
# `sample-id`. A keyed merge is used instead of a side-by-side concat so that
# the result stays correct even if the two tables list samples in a different
# order. `how='left'` keeps every named sample in its original order, and
# `validate` raises if a sample id is duplicated in either table.
joined_df = samples_names_df.merge(
    new_df,
    on='sample-id',
    how='left',
    validate='one_to_one',
)
print(joined_df)

       sample-id sample-name  \
0   210421121673    #1 N9.01   
1   210421121674   #2 N11.01   
2   210421121675    #3 N9.05   
3   210421121676   #4 N11.02   
4   210421121677    #5 N9.03   
..           ...         ...   
69  210707163913  #70 7d.AM2   
70  210707163914   #71 7d.C3   
71  210707163915   #72 7d.M3   
72  210707163916   #73 7d.A3   
73  210707163917  #74 7d.AM3   

                                         forward-path  \
0   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
1   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
2   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
3   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
4   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
..                                                ...   
69  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
70  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
71  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
72  /home/lauro/nupeb/dados_brutos_rede_genom

In [7]:
# Keep only the id and the two read-path columns for the manifest.
manifest_cols = ['sample-id', 'forward-path', 'reverse-path']
manifest_df = joined_df.loc[:, manifest_cols]
print(manifest_df)

       sample-id                                       forward-path  \
0   210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
1   210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
2   210421121675  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
3   210421121676  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
4   210421121677  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
..           ...                                                ...   
69  210707163913  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
70  210707163914  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
71  210707163915  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
72  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
73  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   

                                         reverse-path  
0   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...  
1   /home/lauro/nupeb/dados_brutos_

## Load data

Save new manifest as a TSV file.

In [8]:
# Write the full manifest as a tab-separated file, matching the `.tsv`
# extension and the tab delimiter used for the other `.tsv` tables here.
# NOTE(review): the file name refers to sample groups while the frame written
# is the manifest — confirm this is the intended target file.
fpath = os.path.join(data_path, 'karina-samples-groups.tsv')
manifest_df.to_csv(fpath, sep='\t', index=False)

## Data reduction

Select the samples that belong to the groups of the chosen study and attach their sample IDs and read paths.

In [9]:
# Load the group table of the selected study and enrich each row with the
# sample id and read paths, matching rows on the sample name.
groups_path = os.path.join(data_path, f'{study}.tsv')
key = ('sample-name', )
study_df = (
    pd.read_csv(groups_path, sep='\t')
    .join(joined_df.set_index(key[0]), on=key[0])
)

# The join is a left join: a sample name with no match would get NaN for its
# id and paths (and turn the id column into floats). Stop here rather than
# write an incomplete metadata/manifest file later.
unmatched = study_df.loc[study_df['sample-id'].isna(), key[0]]
if not unmatched.empty:
    raise ValueError(
        f'sample names in {groups_path} without sample data: {unmatched.tolist()}'
    )

study_df

Unnamed: 0,sample-name,group-id,group-desc,sample-id,forward-path,reverse-path
0,#13 N7.03,STD-NR,Grupo dieta padrão,210421121685,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
1,#14 N7.04,STD-NR,Grupo dieta padrão,210421121686,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
2,#15 N8.03,STD-NR,Grupo dieta padrão,210421121687,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
3,#31 N1.02,STD-NR,Grupo dieta padrão,210421121703,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
4,#32 N1.03,STD-NR,Grupo dieta padrão,210421121704,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
5,#33 N2.02,STD-NR,Grupo dieta padrão,210421121705,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
6,#34 N2.03,STD-NR,Grupo dieta padrão,210421121706,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
7,#19 N7.07,HSD-NR,Grupo dieta high-sugar,210421121691,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
8,#20 N7.07,HSD-NR,Grupo dieta high-sugar,210421121692,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
9,#21 N8.08,HSD-NR,Grupo dieta high-sugar,210421121693,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...


In [10]:
# Columns kept for the metadata table: sample id, sample name and the group
# id/description. (A stale column list that was immediately overwritten has
# been removed.)
selected_cols = ['sample-id', 'sample-name', 'group-id', 'group-desc']
metadata_df = study_df[selected_cols]
metadata_df

Unnamed: 0,sample-id,sample-name,group-id,group-desc
0,210421121685,#13 N7.03,STD-NR,Grupo dieta padrão
1,210421121686,#14 N7.04,STD-NR,Grupo dieta padrão
2,210421121687,#15 N8.03,STD-NR,Grupo dieta padrão
3,210421121703,#31 N1.02,STD-NR,Grupo dieta padrão
4,210421121704,#32 N1.03,STD-NR,Grupo dieta padrão
5,210421121705,#33 N2.02,STD-NR,Grupo dieta padrão
6,210421121706,#34 N2.03,STD-NR,Grupo dieta padrão
7,210421121691,#19 N7.07,HSD-NR,Grupo dieta high-sugar
8,210421121692,#20 N7.07,HSD-NR,Grupo dieta high-sugar
9,210421121693,#21 N8.08,HSD-NR,Grupo dieta high-sugar


In [11]:
# Inspect the column labels of the current manifest frame.
manifest_df.columns

Index(['sample-id', 'forward-path', 'reverse-path'], dtype='object')

In [12]:
# Build the study manifest: id plus read paths, with the id column renamed.
manifest_df = (
    study_df[['sample-id', 'forward-path', 'reverse-path']]
    .rename(columns={'sample-id': '#SampleID'})
)

# Add a second header level carrying the '#q2:types' row beneath the names.
q2_types_row = ['#q2:types', 'categorical', 'categorical']
headers = [manifest_df.columns, q2_types_row]
manifest_df.columns = headers
manifest_df

Unnamed: 0_level_0,#SampleID,forward-path,reverse-path
Unnamed: 0_level_1,#q2:types,categorical,categorical
0,210421121685,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
1,210421121686,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
2,210421121687,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
3,210421121703,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
4,210421121704,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
5,210421121705,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
6,210421121706,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
7,210421121691,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
8,210421121692,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
9,210421121693,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...


## Load Reduced Data

In [13]:
# Write the study metadata (tab-separated) and the study manifest
# (comma-separated) into their own subdirectories of the raw data folder.
metadata_path = os.path.join(data_path, 'metadata', f'metadata-{study}.tsv')
# Create the target directory first; `to_csv` does not create missing folders.
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
metadata_df.to_csv(metadata_path, sep='\t', index=False)

# NOTE(review): a later cell writes a different frame to this same manifest
# path, so the file produced here is overwritten on a full run — confirm which
# of the two layouts is the intended one.
manifest_path = os.path.join(data_path, 'manifest', f'manifest-{study}.csv')
os.makedirs(os.path.dirname(manifest_path), exist_ok=True)
manifest_df.to_csv(manifest_path, sep=',', index=False)

In [14]:
# Keep the rows of the long paths table (one row per read direction) whose
# sample id appears in the study manifest.
study_sample_ids = manifest_df[('#SampleID', '#q2:types')]
in_study = samples_paths_df['sample-id'].isin(study_sample_ids)
manifest_02_df = samples_paths_df.loc[in_study]
manifest_02_df

Unnamed: 0,sample-id,absolute-filepath,direction
24,210421121685,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
25,210421121685,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
26,210421121686,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
27,210421121686,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
28,210421121687,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
29,210421121687,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
36,210421121691,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
37,210421121691,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
38,210421121692,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
39,210421121692,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse


In [15]:
# Save the per-direction manifest of the selected study as a comma-separated file.
manifest_fname = f'manifest-{study}.csv'
manifest_path = os.path.join(data_path, 'manifest', manifest_fname)
manifest_02_df.to_csv(manifest_path, index=False, sep=',')