## emp500_s3_make_mapping_files_prep_info.ipynb

### Overview

This notebook takes a single sample information (metadata) file, a general prep information file, and amplicon target-specific prep information files. It generates a merged prep information file and a QIIME mapping file for each amplicon target. A single Qiita study ID will be prepended to sample names when the mega-study goes into Qiita.

In [1]:
import pandas as pd

In [2]:
def generate_prep_and_map(df_prep_target, df_prep_general, df_sample, path_prep, path_map):
    """Write a merged prep information file and a Qiime mapping file.

    The target-specific and general prep tables are inner-joined on their
    indexes and written to ``path_prep``. That prep table is then inner-joined
    (again on the index) with the sample information, a ``Description`` column
    is added, and the result is written to ``path_map``.

    Parameters
    ----------
    df_prep_target : pandas.DataFrame
        Target-specific prep information; the index is the join key.
    df_prep_general : pandas.DataFrame
        General prep information; the index is the join key.
    df_sample : pandas.DataFrame
        Sample information (metadata); the index is the join key.
    path_prep : str
        Destination of the tab-separated prep information file.
    path_map : str
        Destination of the tab-separated mapping file.

    Returns
    -------
    None
        Both results are written to disk. The input frames are not modified.

    Raises
    ------
    KeyError
        If the merged mapping table has no ``sample_name_plus_plate`` or no
        ``primer_name`` column; the prep file has already been written by then.
    """
    plate_col = 'sample_name_plus_plate'

    # merge target-specific and general prep info; only index values present
    # in both tables survive the inner join
    prep = pd.merge(df_prep_target, df_prep_general, left_index=True, right_index=True, how='inner')
    # name the index sample_name; rename_axis returns a new object, so an
    # Index that the merge may share with the caller's frames is not renamed
    prep = prep.rename_axis('sample_name')
    # write prep to tsv (still including every prep column)
    prep.to_csv(path_prep, sep='\t')

    # DROP UNNECESSARY PREP COLUMNS HERE -- when sample_name_plus_plate is in
    # both tables the merge would suffix it (_x/_y), so the prep copy is
    # dropped and the sample-information copy is used. If only one table has
    # the column, nothing is dropped and that single copy is used.
    if plate_col in prep.columns and plate_col in df_sample.columns:
        prep_for_map = prep.drop([plate_col], axis=1)
    else:
        prep_for_map = prep

    # merge prep info and sample info, add #SampleID and Description column (for Qiime), and write to tsv
    mapping = pd.merge(prep_for_map, df_sample, left_index=True, right_index=True, how='inner')
    mapping = mapping.rename_axis('#SampleID')
    mapping['Description'] = mapping[plate_col] + '_' + mapping['primer_name']
    mapping.to_csv(path_map, sep='\t')

In [3]:
# Base directories. Every path below is built from these, so pointing the
# notebook at a different location only requires editing this one place.
dir_base = '/Users/luke.thompson/emp/500-metadata'
dir_input = dir_base + '/input-sample-prep-info'
dir_output = dir_base + '/output-mapping-prep-info'

# input files: sample information (needs more metadata) and prep information (invariant & target-specific)
# NOTE: the sample information file is read from the output directory.
path_samp_info = dir_output + '/emp500_sample_information.tsv'
path_prep_general = dir_input + '/emp500_prep_information_general.xlsx'
path_prep_16s1 = dir_input + '/emp500_prep_information_16s_plates1thru5.xlsx'
path_prep_16s2 = dir_input + '/emp500_prep_information_16s_plates6thru9.xlsx'
path_prep_its1 = dir_input + '/emp500_prep_information_its_plates1thru5.xlsx'
path_prep_its2 = dir_input + '/emp500_prep_information_its_plates6thru9.xlsx'
path_prep_18s1 = dir_input + '/emp500_prep_information_18s_plates1and5.xlsx'
path_prep_18s2 = dir_input + '/emp500_prep_information_18s_plates2thru4.xlsx'

# output files: mapping files and prep information files
prep_16s1 = dir_output + '/emp500_16s_prep_info_plates1thru5.tsv'
prep_16s2 = dir_output + '/emp500_16s_prep_info_plates6thru9.tsv'
prep_its1 = dir_output + '/emp500_its_prep_info_plates1thru5.tsv'
prep_its2 = dir_output + '/emp500_its_prep_info_plates6thru9.tsv'
prep_18s1 = dir_output + '/emp500_18s_prep_info_plates1and5.tsv'
prep_18s2 = dir_output + '/emp500_18s_prep_info_plates2thru4.tsv'

map_16s1 = dir_output + '/emp500_16s_mapping_file_plates1thru5.tsv'
map_16s2 = dir_output + '/emp500_16s_mapping_file_plates6thru9.tsv'
map_its1 = dir_output + '/emp500_its_mapping_file_plates1thru5.tsv'
map_its2 = dir_output + '/emp500_its_mapping_file_plates6thru9.tsv'
map_18s1 = dir_output + '/emp500_18s_mapping_file_plates1and5.tsv'
map_18s2 = dir_output + '/emp500_18s_mapping_file_plates2thru4.tsv'

In [4]:
# Load the input tables. In every file the first column becomes the index
# (index_col=0), which is the key the later merges join on.

# sample information: tab-separated text
df_samp_info = pd.read_csv(path_samp_info, sep='\t', index_col=0)

# prep information: Excel workbooks, read in the same order as they are named
(df_prep_general,
 df_prep_16s1, df_prep_16s2,
 df_prep_its1, df_prep_its2,
 df_prep_18s1, df_prep_18s2) = [
    pd.read_excel(xlsx_path, index_col=0)
    for xlsx_path in (
        path_prep_general,
        path_prep_16s1, path_prep_16s2,
        path_prep_its1, path_prep_its2,
        path_prep_18s1, path_prep_18s2,
    )
]

In [5]:
# Write one prep information file and one mapping file per target/plate set.
# Each triple is (target-specific prep table, prep output path, mapping output
# path); the general prep table and the sample information are shared by all.
for target_table, prep_out, map_out in [
    (df_prep_16s1, prep_16s1, map_16s1),
    (df_prep_16s2, prep_16s2, map_16s2),
    (df_prep_its1, prep_its1, map_its1),
    (df_prep_its2, prep_its2, map_its2),
    (df_prep_18s1, prep_18s1, map_18s1),
    (df_prep_18s2, prep_18s2, map_18s2),
]:
    generate_prep_and_map(
        df_prep_target=target_table,
        df_prep_general=df_prep_general,
        df_sample=df_samp_info,
        path_prep=prep_out,
        path_map=map_out,
    )