### PLEASE COPY NOTEBOOKS TO YOUR FOLDERS TO PREVENT COMMIT CONFLICTS

#### Protocol
Please fill out a template Excel file and use get_fields_info to produce an empty workbook with all needed sheets.
This script will generate basic items for the types listed in the TEMP (template store) dictionary (+ experiment).

#### Assumptions
* There is a single experiment type mentioned in the workbook, and all experiments in the workbook use the same experimental protocol and the same biosample protocol. The script will generate a new protocol for each experiment type. If you want to replace it with an existing one, you can do a replace-all on the final Excel file and delete the unused protocol.
* Every treatment is assumed to be a treatment agent; if it is RNAi, please swap it on the final sheet.

#### ToDo
* script generates the empty excel too using the keyfile and get_fields_info
* script accepts existing protocols, or generates protocols for each set


In [None]:

from dcicutils import ff_utils
from src.functions.notebook_functions import *
import xlrd
import xlwt
import json

# Settings for this run. Edit the values in this cell; the following cells
# only read them.

# get key from keypairs.json ('koray_data' is the key name handed to get_key)
my_key = get_key('koray_data')
# schema names, later passed to append_items_to_xls together with the items
schema_name = get_schema_names(my_key) 

# project folder
my_folder = '/Users/koray/Desktop/wrangling/Templating/'
# template excel from submit4dn; the generated items are appended to this file
excel_file = my_folder + 'mic_template.xls'
# summary excel; read into a pandas dataframe, one row per experiment
summary_file = my_folder + '190401_mic.xlsx'
# alias pre text; put in front of every alias that is generated
submitter_lab = 'joerg-bewersdorf-lab:'

# is this a cell_line that needs biosample_cell_culture_details
# when True, a biosample_cell_culture item is generated per biological
# replicate and linked from the biosample as 'cell_culture_details'
add_bcc = True

# is there a publication that covers these experiments, use the same id as we have on publication items
# doi:...  pmid:...
# a falsy value (e.g. "") skips the publication item
publication_id = "sample_id"

# project_name to be used for protocol and publication alias
project = 'test_project'

# which experiment type (stored on the generated protocols and experiments)
# "DNA-paint", "DNA FISH", "RNA FISH", "FISH", "Immunofluorescence", "Fluorescence", "Cryo-EM", "SPT"
experiment_type = "DNA FISH"

# selects the 'microscope_setting_<value>' sheet; the upper-cased value is
# stored as 'tier_level'. A falsy value skips microscope settings entirely.
microscope_settings = "a2"  # d1 d2 a1 a2

# input_file_relations for raw files
# each entry is [key of the related file, key of the file that gets the
# relation, relationship type]
# expects numbers to be the same between keys to be related
# a key (first element of the list) can only be used once in that position at the moment due to subembedded structure
relations = [['raw', 'rendered', 'derived from'],
            ]

# user submitted workflows
# list of input output keys: [key of the input file, key of the processed file]
# use andy's script
processing = [['raw', 'linked'],]


In [None]:
# generate pandas df
import pandas as pd

# turn excel to pandas dataframe
df_sum = pd.read_excel(summary_file, index_col=None)
# create a new index; the old row numbers end up in a column called 'index'
df_sum.reset_index(inplace=True)

# cleanup column names: discard the part after the parenthesis, trim the
# whitespace and lowercase. str() keeps non-text headers from crashing here.
new_names = [str(name).split('(')[0].strip().lower() for name in df_sum.columns.values]
df_sum.columns = new_names

# convert every column (except 'index') to text: numbers become integer
# strings ('2', not '2.0') and empty cells become ''
for a_col in new_names:
    if a_col == 'index':
        continue
    # -1 stands in for empty cells so that the integer cast can succeed
    filled = df_sum[a_col].fillna(-1)
    try:
        # NOTE(review): the integer cast drops the decimals of values that are
        # not whole numbers, and a real -1 in the sheet is blanked below;
        # confirm this is acceptable for the summary sheets in use
        as_text = filled.astype(int).astype(str)
    except Exception:
        # column cannot be cast to integers (e.g. free text), keep it as it is
        as_text = filled.astype(str)
    df_sum[a_col] = as_text.replace('-1', '')


# convert pandas dataframe to python list of dict (one dict per row)
sum_list = df_sum.to_dict(orient='records')
print('dataframe created with columns', df_sum.columns.values)
print(len(sum_list), "lines on summary file")

# TEMPLATE STORE DICT

# exp type to item type converter {'ExperimentDamid': ['DAM-ID seq'], 'ExperimentMic': ['DNA-paint'...
# NOTE(review): the profiles are fetched here, but no visible cell uses `pr`
pr = ff_utils.get_metadata('/profiles/', my_key)

# one empty list per sheet; the generated items are collected in these lists
TEMP = {
    sheet: []
    for sheet in (
        'protocol',
        'target',
        'imaging_path',
        'file_microscopy',
        'file_processed',
        'file_reference',
        'modification',
        'treatment_agent',
        'biosource',
        'biosample',
        'experiment_mic',
        'experiment_set_replicate',
        'publication',
    )
}

# optional sheets, switched on by the settings
if microscope_settings:
    mic_set_type = 'microscope_setting_' + microscope_settings
    TEMP[mic_set_type] = []

if add_bcc:
    TEMP['biosample_cell_culture'] = []

# Generate 2 protocols: one for the experiments and one for the biosamples
experiment_protocol = submitter_lab + 'protocol_experiment_' + project
biosample_protocol = submitter_lab + 'protocol_biosample_' + project
for protocol_alias, protocol_type in (
        (experiment_protocol, "Experimental protocol"),
        (biosample_protocol, "Cell culture protocol")):
    TEMP['protocol'].append({
        'aliases': [protocol_alias],
        'protocol_type': protocol_type,
        'experiment_type': experiment_type,
    })

def add_to_dict(pre_alias, typ, main_dict, other_val=None):
    """Add a new item with the given alias to one of the lists in main_dict.

    Args:
        pre_alias: alias of the new item; stored as the only element of the
            item's 'aliases' list.
        typ: key of main_dict whose list receives the item.
        main_dict: dictionary of lists of items; it is modified in place.
        other_val: optional dictionary of additional fields for the item.

    Returns:
        main_dict, the same object that was passed in.

    Raises:
        KeyError: if typ is not a key of main_dict.
    """
    items = main_dict[typ]
    # skip duplicate additions (can happen for biosample, bcc, biosource)
    if any(item['aliases'][0] == pre_alias for item in items):
        return main_dict
    dict_to_add = {'aliases': [pre_alias]}
    if other_val:
        dict_to_add.update(other_val)
    items.append(dict_to_add)
    return main_dict

    
def _parse_file_counts(cell):
    """Turn a 'number of ... files' cell into a list of (key, count) pairs.

    A plain number such as '3' gives [(None, 3)]. A comma separated list of
    'key:number' pairs such as 'raw:2, rendered:2' gives
    [('raw', 2), ('rendered', 2)].

    Raises:
        ValueError: if the cell is neither a number nor 'key:number' pairs.
    """
    try:
        return [(None, int(cell))]
    except ValueError:
        pass
    cases = []
    for part in str(cell).split(','):
        part = part.strip()
        if not part:
            continue
        pieces = part.split(':')
        if len(pieces) < 2:
            raise ValueError("expected 'key:number' but found {!r}".format(part))
        cases.append((pieces[0].strip(), int(pieces[1].strip())))
    return cases


def _find_rule(rules, key, rules_name):
    """Return the single entry of rules whose second element is key, or None.

    Raises:
        ValueError: if more than one entry matches.
    """
    matches = [rule for rule in rules if rule[1] == key]
    if len(matches) > 1:
        raise ValueError("{!r} is used more than once in {}".format(key, rules_name))
    return matches[0] if matches else None


def _file_alias(prefix, rep_tag, key, number):
    """Build a file alias: lab + prefix + replicate tag [+ _key] + _f<number>."""
    parts = [prefix + rep_tag]
    if key is not None:
        parts.append(key)
    parts.append('f' + str(number))
    return submitter_lab + '_'.join(parts)


# Sheet that receives the experiments. This name was used without being
# defined; 'experiment_mic' is the only experiment sheet in TEMP.
exp_sheet = 'experiment_mic'

all_sets = []
# biosource name -> (alias to use, True when the biosource has to be created),
# so that each biosource is looked up on the server only once
biosource_lookup = {}
tag = None
my_set = None
for num, line in enumerate(sum_list):
    #ADD SET
    if line.get('experiment sets'):
        tag = line['experiment sets']
        my_set = submitter_lab + tag
        print(my_set)
        # a set usually spans several rows, list it only once
        if my_set not in all_sets:
            all_sets.append(my_set)
        TEMP = add_to_dict(my_set, 'experiment_set_replicate', TEMP, {'description': line['set description']})
    elif tag is None:
        # rows without a set reuse the set of the previous row, so the first
        # row has to name one
        raise ValueError("line {} of the summary file has no experiment set".format(num + 1))

    #ADD MICROSCOPE SETTINGS (one item per set)
    if microscope_settings:
        my_micsetting = submitter_lab + tag + '_micsetting'
        TEMP = add_to_dict(my_micsetting, mic_set_type, TEMP, {'tier_level': microscope_settings.upper()})

    #Bio Tec rep add on
    b_add = tag + "_B{}".format(line['biological replicate number'])
    bt_add = tag + "_B{}_T{}".format(line['biological replicate number'], line['technical replicate number'])

    # Deal with biosource
    biosource_name = line['biosource']
    if biosource_name not in biosource_lookup:
        try:
            # the lookup raises when the biosource is not on the server
            ff_utils.get_metadata(biosource_name, my_key)
        except Exception:
            # new biosource
            new_alias = submitter_lab + 'biosource_' + biosource_name.lower().replace(" ", "_")
            biosource_lookup[biosource_name] = (new_alias, True)
        else:
            biosource_lookup[biosource_name] = (biosource_name, False)
    bio_source, is_new_biosource = biosource_lookup[biosource_name]
    if is_new_biosource:
        TEMP = add_to_dict(bio_source, 'biosource', TEMP)

    #ADD BCC
    if add_bcc:
        my_bcc = submitter_lab + 'bcc_' + b_add
        TEMP = add_to_dict(my_bcc, 'biosample_cell_culture', TEMP)

    #ADD MODIFICATION
    if line.get('modification alias'):
        my_mod = submitter_lab + 'mod_' + line['modification alias']
        extra = {'description': line.get('modification description', "")}
        TEMP = add_to_dict(my_mod, 'modification', TEMP, extra)

    #ADD TREATMENT
    if line.get('treatment alias'):
        my_treatment = submitter_lab + 'treatment_' + line['treatment alias']
        extra = {'description': line.get('treatment description', "")}
        TEMP = add_to_dict(my_treatment, 'treatment_agent', TEMP, extra)

    #ADD BIOSAMPLE
    my_biosample = submitter_lab + 'biosample_' + b_add
    extra = {'biosource': bio_source, 'biosample_protocols': biosample_protocol}
    if add_bcc:
        extra['cell_culture_details'] = my_bcc
    if line.get('modification alias'):
        extra['modifications'] = [my_mod]
    if line.get('treatment alias'):
        extra['treatments'] = [my_treatment]
    TEMP = add_to_dict(my_biosample, 'biosample', TEMP, extra)

    #DEFINE EXPERIMENT
    if line.get('experiment'):
        my_exp = submitter_lab + 'experiment_' + line['experiment'] + '_' + bt_add
    else:
        my_exp = submitter_lab + 'experiment_' + bt_add

    # ADD FILES
    raw_files = []
    proc_files = []
    ref_files = []
    other_proc_files = []
    # keys used for the raw files of this row, needed for 'produced_from'
    raw_keys = set()

    # the field is either a number or a comma separated list of key:number
    if line.get('number of raw files'):
        for a_tag, no_of_raw_files in _parse_file_counts(line['number of raw files']):
            if a_tag is not None:
                raw_keys.add(a_tag)
            # is there a relation setup for this key
            relation = _find_rule(relations, a_tag, 'relations') if a_tag is not None else None
            for n in range(no_of_raw_files):
                raw_f_alias = _file_alias('filemic_', bt_add, a_tag, n + 1)
                extra = {}
                if relation:
                    # related file: same replicate and number, other key
                    extra['related_files'] = [{'relationship_type': relation[2],
                                               'file': _file_alias('filemic_', bt_add, relation[0], n + 1)}]
                TEMP = add_to_dict(raw_f_alias, 'file_microscopy', TEMP, extra)
                raw_files.append(raw_f_alias)

    if line.get('number of reference files'):
        no_of_ref_files = int(line['number of reference files'])
        for n in range(no_of_ref_files):
            ref_alias = submitter_lab + 'fileref_' + bt_add + '_f' + str(n + 1)
            TEMP = add_to_dict(ref_alias, 'file_reference', TEMP)
            ref_files.append(ref_alias)

    if line.get('number of supplementary processed results'):
        no_of_op_files = int(line['number of supplementary processed results'])
        for n in range(no_of_op_files):
            opf_alias = submitter_lab + 'supplementary_' + bt_add + '_f' + str(n + 1)
            TEMP = add_to_dict(opf_alias, 'file_processed', TEMP)
            other_proc_files.append(opf_alias)

    # the field is either a number or a comma separated list of key:number
    if line.get('number of processed results'):
        for a_tag, no_of_proc_files in _parse_file_counts(line['number of processed results']):
            # is there a 'produced from' setup for this key
            workflow = _find_rule(processing, a_tag, 'processing') if a_tag is not None else None
            for n in range(no_of_proc_files):
                proc_f_alias = _file_alias('fileproc_', bt_add, a_tag, n + 1)
                extra = {}
                if workflow:
                    # the input is a raw file when its key is one of the raw
                    # file keys of this row, otherwise another processed file
                    input_prefix = 'filemic_' if workflow[0] in raw_keys else 'fileproc_'
                    extra['produced_from'] = [_file_alias(input_prefix, bt_add, workflow[0], n + 1)]
                TEMP = add_to_dict(proc_f_alias, 'file_processed', TEMP, extra)
                proc_files.append(proc_f_alias)

    # ADD IMAGING PATHS
    # Currently setup to work with 4 channels (ch00 to ch03); the tag is the
    # suffix of the sub-embedded 'channel'/'path' keys
    im_paths = {}
    sub_embed_tag = ["", "-1", "-2", "-3"]
    for channel, embed_tag in enumerate(sub_embed_tag):
        im_path = 'ch0{} short name'.format(channel)
        target = 'ch0{} target'.format(channel)
        if not line.get(im_path):
            continue
        path_alias = submitter_lab + 'path_' + line[im_path].replace(" ", "_")
        extra = {}
        # check for target
        if line.get(target):
            target_alias = submitter_lab + 'target_' + line[target].replace(" ", "_")
            TEMP = add_to_dict(target_alias, 'target', TEMP)
            extra = {'target': target_alias}
        TEMP = add_to_dict(path_alias, 'imaging_path', TEMP, extra)
        im_paths['channel' + embed_tag] = 'ch0' + str(channel)
        im_paths['path' + embed_tag] = path_alias

    #ADD EXPERIMENT
    extra = {'replicate_set': my_set, 'protocol': experiment_protocol, 'biosample': my_biosample,
             'experiment_type': experiment_type}
    if im_paths:
        extra['imaging_paths'] = [im_paths]
    if raw_files:
        extra['files'] = raw_files
    if proc_files:
        extra['processed_files'] = proc_files
    if ref_files:
        extra['reference_files'] = ref_files
    if other_proc_files:
        extra['other_processed_files'] = other_proc_files
    if microscope_settings:
        extra['microscope_settings_master'] = my_micsetting

    extra['bio_rep_no'] = line['biological replicate number']
    extra['tec_rep_no'] = line['technical replicate number']
    TEMP = add_to_dict(my_exp, exp_sheet, TEMP, extra)
    
# ADD PUBLICATION
if publication_id:
    # alias used when the publication is new or has no alias yet
    my_pub = submitter_lab + 'publication_' + project
    pub_sets = []
    # check if it exists; only the lookup itself is allowed to fail
    try:
        pub = ff_utils.get_metadata(publication_id, my_key, add_on='frame=raw')
    except Exception:
        # new publication
        pub = None
    if pub is not None:
        # existing publication: keep its alias and the sets it already lists
        pub_alias = pub.get('aliases') or []
        if pub_alias:
            my_pub = pub_alias[0]
        pub_sets = list(pub.get('exp_sets_prod_in_pub', []))
    # add the sets of this workbook, each of them only once
    for a_set in all_sets:
        if a_set not in pub_sets:
            pub_sets.append(a_set)
    extra = {'ID': publication_id, 'exp_sets_prod_in_pub': pub_sets}
    TEMP = add_to_dict(my_pub, 'publication', TEMP, extra)

# call function to add items to a template excel
append_items_to_xls(excel_file, TEMP, schema_name, comment = False)

In [None]:
# Show the first generated experiment item as a quick check of the result
# (raises IndexError when no experiment was generated)
print(TEMP['experiment_mic'][0])