### PLEASE COPY NOTEBOOKS TO YOUR FOLDERS TO PREVENT COMMIT CONFLICTS

#### Protocol
Please fill out a template Excel file, and use get_fields_info to produce an empty workbook that contains all the needed sheets.
This script will generate basic items for the types listed in the TEMP (template store) dictionary (+ experiment).

#### Assumptions
* There is a single experiment type mentioned in the workbook, and all experiments in the workbook use the same experimental protocol and the same biosample protocol. The script will generate a new protocol of each kind for each experiment type. If you want to replace one with an existing protocol, you can do a replace_all on the final Excel file and delete the unused protocol.
* Every treatment is assumed to be a treatment agent; if a treatment is actually RNAi, please swap it on the final sheet.

#### ToDo
* script generates the empty excel too using the keyfile and get_fields_info
* script accepts existing protocols, or generates protocols for each set


In [None]:

from dcicutils import ff_utils
from functions.notebook_functions import *
import xlrd
import xlwt
import json

# --- User settings for this run; edit the values below before executing. ---

# Access key read from keypairs.json and used for every portal request below.
# NOTE(review): get_key / get_schema_names presumably come from the star import
# of functions.notebook_functions; the empty string presumably selects the
# default key entry -- confirm.
my_key = get_key('')
schema_name = get_schema_names(my_key) 

# Template Excel workbook (from submit4dn) that the generated items are
# appended to at the end of the notebook.
excel_file = '/Users/user/Desktop/Templating/MetadataSheets.xls'

# Summary Excel sheet: one row per experiment, read into a DataFrame below.
summary_file = '/Users/user/Desktop/Summary.xlsx'

# Prefix (lab name plus ':') put in front of every generated alias.
submitter_lab = '4dn-dcic-lab:'

# Set to True when the samples are cell lines that need
# biosample_cell_culture items (one is generated per biological replicate).
add_bcc = True

# Publication that covers these experiments. Use the same ID format as on
# publication items (doi:... or pmid:...); leave empty to skip the publication.
publication_id = "sample id"

# Project name, used as the suffix of the protocol and publication aliases.
project = 'BAF180'

# Experiment type written on every generated experiment. Known values:
# ChIA-PET, PLAC-seq, ATAC-seq, Repli-seq, single cell Repli-seq, 
# TSA-seq, capture Hi-C, ChIP-seq, RNA-seq, NAD-seq, DNA SPRITE, RNA-DNA SPRITE, MARGI,
# GAM, CUT&RUN, TrAC-loop, TRIP, in situ Hi-C, dilution Hi-C, micro-C, DNase Hi-C, TCC,
# cryomilling TCC, single cell Hi-C, sci-Hi-C, MC-3C, MC-Hi-C
experiment_type = "RNA-seq"
# Experiment item type (sheet name) the experiments are stored under.
exp_schema = "experiment_seq"

In [None]:
# Load the summary sheet and normalise it into a list of dicts of strings.
import pandas as pd


def _column_as_text(col):
    """Return *col* as strings, with missing cells as empty strings.

    Whole numbers are written without a decimal part (``3.0`` -> ``'3'``),
    which undoes the float dtype pandas gives to integer columns that
    contain blanks. Columns holding real decimals or text are stringified
    unchanged, so ``1.5`` stays ``'1.5'`` instead of being truncated.
    """
    missing = col.isna()
    # Blanks must be filled before the integer cast; they are masked out
    # again at the end, so the fill value never reaches the output.
    filled = col.fillna(0)
    try:
        as_int = filled.astype(int)
    except (ValueError, TypeError, OverflowError):
        # Text (or otherwise non-numeric) column: keep the values as they are.
        as_int = None
    if (as_int is not None
            and pd.api.types.is_float_dtype(filled)
            and not (as_int == filled).all()):
        # The cast would drop a fractional part; keep the decimals.
        as_int = None
    text = (col if as_int is None else as_int).astype(str)
    return text.where(~missing, '')


# turn excel to pandas dataframe
df_sum = pd.read_excel(summary_file, index_col=None)
# create a new index (adds an 'index' column holding the row number)
df_sum.reset_index(inplace=True)

# cleanup column names, discard the part after the parenthesis
new_names = [str(name).split('(')[0].strip().lower() for name in df_sum.columns.values]
df_sum.columns = new_names

# convert every value to a string (numbers go int -> str); 'index' stays numeric
for a_col in new_names:
    if a_col == 'index':
        continue
    df_sum[a_col] = _column_as_text(df_sum[a_col])

# convert pandas dataframe to python list of dict
sum_list = df_sum.to_dict(orient='records')
print('dataframe created with columns', df_sum.columns.values)
print(len(sum_list), "lines on summary file")

# TEMPLATE STORE DICT
# Maps each item type to the list of items generated for it; every list
# starts out empty and is filled while the summary rows are processed.
item_types = [
    'protocol',
    'file_fastq',
    'modification',
    'treatment_agent',
    'biosource',
    'biosample',
    exp_schema,
    'experiment_set_replicate',
    'publication',
]
# cell culture details are only tracked when requested in the settings
if add_bcc:
    item_types.append('biosample_cell_culture')
TEMP = {item_type: [] for item_type in item_types}

# Generate 2 protocols: one shared by all experiments, one by all biosamples
experiment_protocol = ''.join([submitter_lab, 'protocol_experiment_', project])
biosample_protocol = ''.join([submitter_lab, 'protocol_biosample_', project])
TEMP['protocol'] += [
    {'aliases': [protocol_alias], "protocol_type": protocol_type}
    for protocol_alias, protocol_type in (
        (experiment_protocol, "Experimental protocol"),
        (biosample_protocol, "Cell culture protocol"),
    )
]

def add_to_dict(pre_alias, typ, main_dict, other_val=None):
    """Append a new item with alias *pre_alias* to ``main_dict[typ]``.

    The item is skipped when an item whose first alias equals *pre_alias* is
    already stored under *typ* (this happens for biosample, bcc and
    biosource, which are shared between rows).

    Args:
        pre_alias: alias of the item; stored as the single entry of 'aliases'.
        typ: key of *main_dict* (item type) the item is stored under.
        main_dict: dict mapping item type to a list of item dicts; it is
            modified in place.
        other_val: optional dict of extra fields copied onto the new item.

    Returns:
        *main_dict* itself, so the call can be used in an assignment.

    Raises:
        KeyError: if *typ* is not a key of *main_dict*.
    """
    items = main_dict[typ]
    # skip duplicate additions (can happen for biosample, bcc, biosource)
    if any(item['aliases'][0] == pre_alias for item in items):
        return main_dict
    dict_to_add = {'aliases': [pre_alias]}
    if other_val:
        # shallow copy of the extra fields, same as assigning them one by one
        dict_to_add.update(other_val)
    items.append(dict_to_add)
    return main_dict

# Turn every summary row into items in TEMP: experiment set, biosource,
# cell culture details, modification, treatment, biosample, fastq files and
# the experiment itself. Shared items are de-duplicated by add_to_dict.
all_sets = []
# Set name / alias of the most recent row that named an experiment set; rows
# that leave 'experiment sets' empty belong to that set.
tag = None
my_set = None
# biosource column value -> identifier to put on the biosample, so the portal
# is queried once per distinct biosource instead of once per row
known_biosources = {}
for num, line in enumerate(sum_list):
    #ADD SET
    if line.get('experiment sets'):
        tag = line['experiment sets']
        my_set = submitter_lab + tag
        print(my_set)
        all_sets.append(my_set)
        extra = {'description': line.get('set description', "")}
        if line.get('dataset'):
            extra['dataset_label'] = line['dataset']
        if line.get('condition'):
            extra['condition'] = line['condition']
        TEMP = add_to_dict(my_set, 'experiment_set_replicate', TEMP, extra)
    elif tag is None:
        # nothing to inherit from: fail with a clear message, not a NameError
        raise ValueError(
            "row {} of the summary file has no 'experiment sets' value and "
            "no earlier row defines one".format(num))

    #Bio Tec rep add on
    bio_rep = line['biological replicate number']
    tec_rep = line['technical replicate number']
    b_add = tag + "_B{}".format(bio_rep)
    bt_add = tag + "_B{}_T{}".format(bio_rep, tec_rep)

    # Deal with biosource
    biosource_name = line['biosource']
    if biosource_name not in known_biosources:
        try:
            # the value already identifies an item on the portal: use it as is
            ff_utils.get_metadata(biosource_name, my_key)
            known_biosources[biosource_name] = biosource_name
        except Exception:
            # NOTE(review): assumes ff_utils reports a failed lookup with a
            # plain Exception, so nothing narrower can be caught -- confirm.
            # new biosource
            new_alias = submitter_lab + 'biosource_' + biosource_name.lower().replace(" ", "_")
            TEMP = add_to_dict(new_alias, 'biosource', TEMP)
            known_biosources[biosource_name] = new_alias
    bio_source = known_biosources[biosource_name]

    #ADD BCC
    if add_bcc:
        my_bcc = submitter_lab + 'bcc_' + b_add
        TEMP = add_to_dict(my_bcc, 'biosample_cell_culture', TEMP)

    #ADD MODIFICATION
    if line.get('modification alias'):
        my_mod = submitter_lab + 'mod_' + line['modification alias']
        extra = {'description': line.get('modification description', "")}
        TEMP = add_to_dict(my_mod, 'modification', TEMP, extra)

    #ADD TREATMENT
    if line.get('treatment alias'):
        my_treatment = submitter_lab + 'treatment_' + line['treatment alias']
        extra = {'description': line.get('treatment description', "")}
        TEMP = add_to_dict(my_treatment, 'treatment_agent', TEMP, extra)

    #ADD BIOSAMPLE
    # one biosample per biological replicate; technical replicates share it
    my_biosample = submitter_lab + 'biosample_' + b_add
    extra = {'biosource': bio_source, 'biosample_protocols': biosample_protocol}
    if add_bcc:
        extra['cell_culture_details'] = my_bcc
    if line.get('modification alias'):
        extra['modifications'] = [my_mod]
    if line.get('treatment alias'):
        extra['treatments'] = [my_treatment]
    TEMP = add_to_dict(my_biosample, 'biosample', TEMP, extra)

    #DEFINE EXPERIMENT
    if line.get('experiment'):
        my_exp = submitter_lab + 'experiment_' + line['experiment'] + '_' + bt_add
    else:
        my_exp = submitter_lab + 'experiment_' + bt_add

    # ADD FILES
    raw_files = []

    # single-end files: one fastq item per file
    if line.get('number of raw files'):
        for n in range(int(line['number of raw files'])):
            raw_f_alias = submitter_lab + 'filefastq_' + bt_add + '_f' + str(n+1)
            extra = {'file_format': 'fastq'}
            TEMP = add_to_dict(raw_f_alias, 'file_fastq', TEMP, extra)
            raw_files.append(raw_f_alias)

    # paired-end files: two fastq items per pair that point at each other
    if line.get('number of raw pairs'):
        for n in range(int(line['number of raw pairs'])):
            pair_alias = submitter_lab + 'filefastq_' + bt_add + '_f' + str(n+1)
            raw_f_alias1 = pair_alias + '_r1'
            raw_f_alias2 = pair_alias + '_r2'
            extra1 = {'file_format': 'fastq',
                      "paired_end": "1",
                      "related_files": [{"file": raw_f_alias2, "relationship_type": "paired with"}]}
            extra2 = {'file_format': 'fastq',
                      "paired_end": "2",
                      "related_files": [{"file": raw_f_alias1, "relationship_type": "paired with"}]}
            TEMP = add_to_dict(raw_f_alias1, 'file_fastq', TEMP, extra1)
            TEMP = add_to_dict(raw_f_alias2, 'file_fastq', TEMP, extra2)
            raw_files.append(raw_f_alias1)
            raw_files.append(raw_f_alias2)

    #ADD EXPERIMENT
    extra = {'replicate_set': my_set, 'protocol': experiment_protocol, 'biosample': my_biosample,
             'experiment_type': experiment_type}
    if raw_files:
        extra['files'] = raw_files
    extra['bio_rep_no'] = bio_rep
    extra['tec_rep_no'] = tec_rep
    TEMP = add_to_dict(my_exp, exp_schema, TEMP, extra)
    
# ADD PUBLICATION
# Attach every generated experiment set to the publication, reusing the
# publication item on the portal when publication_id already resolves to one.
if publication_id:
    # check if it exists; only the lookup itself is guarded, so a problem
    # while processing an existing publication is not mistaken for "not found"
    try:
        pub = ff_utils.get_metadata(publication_id, my_key, add_on='frame=raw')
    except Exception:
        # NOTE(review): assumes ff_utils reports a failed lookup with a plain
        # Exception, so nothing narrower can be caught -- confirm.
        # new publication
        my_pub = submitter_lab + 'publication_' + project
        pub_sets = list(all_sets)
    else:
        # existing publication: keep its first alias (empty string when it
        # has none) and the sets it already lists
        pub_alias = pub.get('aliases') or [""]
        my_pub = pub_alias[0]
        pub_sets = list(pub.get('exp_sets_prod_in_pub') or [])
        for new_set in all_sets:
            # do not list a set twice
            if new_set not in pub_sets:
                pub_sets.append(new_set)
    extra = {'ID': publication_id, 'exp_sets_prod_in_pub': pub_sets}
    TEMP = add_to_dict(my_pub, 'publication', TEMP, extra)

# call function to add items to a template excel
append_items_to_xls(excel_file, TEMP, schema_name, comment = False)