In [1]:
import pandas as pd

%matplotlib inline

In [2]:
study = 'ana-flavia-hipotese-01'

## Data ingestion

In [3]:
samples_names_fpath = 'sample_id-sample_name.tsv'
samples_names_df =  pd.read_csv(samples_names_fpath, delimiter='\t')
samples_names_df['sample-id'] = samples_names_df['sample-id'].astype('int64')
print(samples_names_df)

       sample-id sample-name
0   210421121673    #1 N9.01
1   210421121674   #2 N11.01
2   210421121675    #3 N9.05
3   210421121676   #4 N11.02
4   210421121677    #5 N9.03
..           ...         ...
69  210707163913  #70 7d.AM2
70  210707163914   #71 7d.C3
71  210707163915   #72 7d.M3
72  210707163916   #73 7d.A3
73  210707163917  #74 7d.AM3

[74 rows x 2 columns]


In [4]:
samples_paths_fpath = 'samples-paths.tsv'
samples_paths_df =  pd.read_csv(samples_paths_fpath, delimiter='\t')
samples_paths_df['sample-id'] = samples_paths_df['sample-id'].astype('int64')
print(samples_paths_df)

        sample-id                                  absolute-filepath direction
0    210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
1    210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
2    210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
3    210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
4    210421121675  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
..            ...                                                ...       ...
143  210707163915  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
144  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
145  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse
146  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   forward
147  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   reverse

[148 rows x 3 columns]


## Data transform

Combine Dataframes and organize all sample data in one DataFrame.

In [5]:
rows = list()
cols = ['sample-id', 'forward-path', 'reverse-path']
df = samples_paths_df
for i in range(0,len(df.index),2):
    r1 = df.iloc[i,:]
    r2 = df.iloc[i+1,:]
    new_row = (r1[0], r1[1], r2[1])
    rows.append(new_row)
new_df = pd.DataFrame(rows, columns=cols)
new_df['sample-id'] = new_df['sample-id'].astype('int64')
print(new_df)

       sample-id                                       forward-path  \
0   210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
1   210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
2   210421121675  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
3   210421121676  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
4   210421121677  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
..           ...                                                ...   
69  210707163913  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
70  210707163914  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
71  210707163915  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
72  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
73  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   

                                         reverse-path  
0   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...  
1   /home/lauro/nupeb/dados_brutos_

In [6]:
joined_df = pd.concat([samples_names_df, new_df], axis=1)
joined_df = joined_df.loc[:,~joined_df.columns.duplicated()].copy()
print(joined_df)

       sample-id sample-name  \
0   210421121673    #1 N9.01   
1   210421121674   #2 N11.01   
2   210421121675    #3 N9.05   
3   210421121676   #4 N11.02   
4   210421121677    #5 N9.03   
..           ...         ...   
69  210707163913  #70 7d.AM2   
70  210707163914   #71 7d.C3   
71  210707163915   #72 7d.M3   
72  210707163916   #73 7d.A3   
73  210707163917  #74 7d.AM3   

                                         forward-path  \
0   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
1   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
2   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
3   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
4   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
..                                                ...   
69  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
70  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
71  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
72  /home/lauro/nupeb/dados_brutos_rede_genom

In [7]:
manifest_df = joined_df[['sample-id', 'forward-path', 'reverse-path']]
print(manifest_df)

       sample-id                                       forward-path  \
0   210421121673  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
1   210421121674  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
2   210421121675  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
3   210421121676  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
4   210421121677  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
..           ...                                                ...   
69  210707163913  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
70  210707163914  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
71  210707163915  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
72  210707163916  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   
73  210707163917  /home/lauro/nupeb/dados_brutos_rede_genoma/dat...   

                                         reverse-path  
0   /home/lauro/nupeb/dados_brutos_rede_genoma/dat...  
1   /home/lauro/nupeb/dados_brutos_

## Load data

Save new manifest as a TSV file.

In [8]:
manifest_df.to_csv(f'manifest-karina.csv', sep=',', index=False)

### Data reduction

Select context group samples.

In [9]:
study_groups_path = f'{study}.tsv'
study_df = pd.read_csv(study_groups_path, sep='\t')
study_df = study_df.join(joined_df.set_index('sample-name'), on='sample-name')
study_df

Unnamed: 0,sample-name,sample-group,sample-id,forward-path,reverse-path
0,#5 N9.03,NR,210421121677,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
1,#6 N10.03,NR,210421121678,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
2,#7 N10.04,NR,210421121679,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
3,#8 N10.07,NR,210421121680,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
4,#9 N10.08,NR,210421121681,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
5,#1 N9.01,NC,210421121673,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
6,#2 N11.01,NC,210421121674,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
7,#3 N9.05,NC,210421121675,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...
8,#4 N11.02,NC,210421121676,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...


In [10]:
metadata_df = study_df[['sample-id', 'sample-name', 'sample-group']]
metadata_df

Unnamed: 0,sample-id,sample-name,sample-group
0,210421121677,#5 N9.03,NR
1,210421121678,#6 N10.03,NR
2,210421121679,#7 N10.04,NR
3,210421121680,#8 N10.07,NR
4,210421121681,#9 N10.08,NR
5,210421121673,#1 N9.01,NC
6,210421121674,#2 N11.01,NC
7,210421121675,#3 N9.05,NC
8,210421121676,#4 N11.02,NC


In [11]:
manifest_df = study_df[['sample-id', 'forward-path', 'reverse-path']]
metadata_df.rename(columns = {'sample-id': '#SampleID'}, inplace=True)
headers = [metadata_df.columns, ['#q2:types', 'categorical', 'categorical']]
metadata_df.columns = headers
metadata_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0_level_0,#SampleID,sample-name,sample-group
Unnamed: 0_level_1,#q2:types,categorical,categorical
0,210421121677,#5 N9.03,NR
1,210421121678,#6 N10.03,NR
2,210421121679,#7 N10.04,NR
3,210421121680,#8 N10.07,NR
4,210421121681,#9 N10.08,NR
5,210421121673,#1 N9.01,NC
6,210421121674,#2 N11.01,NC
7,210421121675,#3 N9.05,NC
8,210421121676,#4 N11.02,NC


## Load Reduced Data

In [12]:
metadata_df.to_csv(f'metadata-{study}.tsv', sep='\t', index=False)
manifest_df.to_csv(f'manifest-{study}.csv', sep=',', index=False)

In [13]:
manifest_02_df = samples_paths_df[samples_paths_df['sample-id'].isin(manifest_df['sample-id'])]
manifest_02_df

Unnamed: 0,sample-id,absolute-filepath,direction
0,210421121673,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
1,210421121673,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
2,210421121674,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
3,210421121674,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
4,210421121675,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
5,210421121675,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
6,210421121676,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
7,210421121676,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse
8,210421121677,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,forward
9,210421121677,/home/lauro/nupeb/dados_brutos_rede_genoma/dat...,reverse


In [14]:
manifest_02_df.to_csv(f'manifest-02-{study}.csv', sep=',', index=False)