# Intro

This is the first script in the data preprocessing pathway. The final product is a tidy dataset annotated with the metadata defined in the metadata.json file.
All files associated with each round of AL are prefixed with the individual ID for that round of learning. e.g. ALTE001. 

# Order of Operations

## 1. The splitting of the chromatics.

* It imports a "_raw.csv" .csv raw data file from the BMG Omega platereader & the metadata .json.
* Determines how many chromatics have been measured, how many minutes it was recording for and moves the columns down into rows to separate out the data for each channel and gain.
* Saves the resulting format as .csv suffixed with "_parsed_data.csv"

## 2. 


In [1]:
import pandas as pd
import numpy as np
import math
import os, sys, shutil
import json

# Defining some functions

In [2]:


def print_all_df(df):
    """
    This function simply prints an entire pandas dataframe by changing the global pandas settings,
    printing and then reverting the settings.
    """
    # Permanently changes the pandas settings
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', -1)
    
    # All dataframes hereafter reflect these changes.
    display(df)
    
    print('**RESET_OPTIONS**')
    
    # Resets the options
    pd.reset_option('all')
    



# Define the AL Round Prefix

This is the AL round ID that the script will use to determine with files to import and what to name the products.

e.g. "ALTE001"

In [3]:
prefix = "ALTE001"


# Define the end time

Enter the exact string of the last time point measured in the rawdata file.

In [4]:
end_time_string = "11 h 40 min"

# Import and exact the metadata

In [5]:
# navigate into the /app directory in the linux distribution of the docker container
os.chdir('/app')

# use the prefix to get the name of the metadata file
metadata_file_string = prefix + "_experiment_metadata.json"
metadata_file_string

'ALTE001_experiment_metadata.json'

In [6]:

# move into the metadata folder
os.chdir('experiment_metadata_files')
print(os.getcwd())

# Opening JSON file 
f = open(metadata_file_string) 

# returns JSON object as a dictionary 
experiment_metadata = json.load(f) 

# unpack dictionary into sub dictionaries and variables
well_designation = experiment_metadata['well_designation']

# save various directory elements as named variables
metadataheader = experiment_metadata['metadataheader']
chromatics_designations = experiment_metadata['chromatics_designations']
gap = experiment_metadata['gap']
metadata = experiment_metadata['metadata']

/app/experiment_metadata_files


# Import and extract the raw data

This cell basically imports the raw data, splits it up a bit and determines:

* The number of chromatics
* The last time point measured

In [7]:
# use the prefix to get the raw data file name
rawdata_file_string = prefix + "_raw.csv"

# Navigate into the raw data directory
os.chdir('/app/data/raw_data_files/')

# import the raw data
df = pd.read_csv(rawdata_file_string, header=None)

# segregate the metadata
df_head = df.iloc[:4,:]


df_head = df_head.set_axis(np.arange(0,df_head.shape[1],1), axis=1)

# these are the actual data
df_samples = df.iloc[5:,:2]

# gets the row that the time is in
timelist = df.iloc[6,:]
timelist = timelist.reset_index(drop=True)

last_timepoint = timelist.iloc[-1]

# gets the number of chromatics by counting how many times the times are repeated
number_of_chromatics = timelist.value_counts()[0]



# Converts the time string into a integer minute list.
The plate reader spits out the timepoints in strings that come in two different formats based on god knows what reason. Regardless, if it's the '0 h 0 min' format then there's a long winded and complicated parser. If it's the ' 0 mins' format, it's much more straight forward.

In [8]:

# check to see which time format the data is in.

# if it's '0 h 0 min'....
if (last_timepoint.__contains__('h')):
    
    
    print("I think the time format is '0 h 0 min'")
    
    
    print(timelist[timelist == '0 h '].index)
    print('length of index thing')
    print(len(timelist[timelist == '0 h '].index))
    print('')

    chromatics_list = [df_head]


    for i, v in enumerate(timelist[timelist == '0 h '].index):
        start = v


        if (i+1) >= len(timelist[timelist == '0 h '].index):

            chromatic = df.iloc[4:, start:]
            chromatic.insert(loc=0, column="Content", value=df.iloc[4:,1])
            chromatic.insert(loc=0, column="Well", value=df.iloc[4:,0])
            chromatic = chromatic.set_axis(np.arange(0,chromatic.shape[1],1), axis=1)
            print(chromatic)
            chromatics_list.append(chromatic)

        else:
            end = timelist[timelist == '0 h '].index[i+1]

            chromatic = df.iloc[4:, start:end]
            chromatic.insert(loc=0, column="Content", value=df.iloc[4:,1])
            chromatic.insert(loc=0, column="Well", value=df.iloc[4:,0])
            #add spacer rows
            chromatic = chromatic.append(pd.Series(), ignore_index=True)
            chromatic = chromatic.append(pd.Series(), ignore_index=True)


            chromatic = chromatic.set_axis(np.arange(0,chromatic.shape[1],1), axis=1)
            chromatics_list.append(chromatic)

    final = pd.concat(chromatics_list)

    final.reset_index(inplace=True, drop=True)

    mask = final.iloc[6,:] == end_time_string
    mask = mask.values

    twelve_hr_index = final.iloc[6,mask].index

    twelve_hr_index = twelve_hr_index[0]

    twelve_hr_index = twelve_hr_index + 1

    final = final.iloc[:,:twelve_hr_index]

    # making the new name by replacing raw with parsed_dataset
    new_name = f_name[:-3]
    new_name = new_name+'parsed_dataset.csv'
    print(new_name)

    #######################################################################

    print(os.getcwd())
    path = "/app/data/parsed_data_files/"
    # make directory for sticking the output in
    if os.path.isdir(path) == False:
        os.mkdir(path, mode=0o777)

    os.chdir(path)
    final.to_csv(new_name, header=False, index=False)


    #navigate home for neatness
    os.chdir('/src')

    
    
else:
    
    #print("I think the time format is '0 mins'")
    
    #make a python list from the series
    timelist = list(timelist)
    
    # drop nan and time
    timelist = timelist[2:]
    
    #remove the last four characters of every string
    timelist = [sub[:-5] for sub in timelist]
    
    #make them all numerical
    timelist = [int(sub) for sub in timelist]
    
    # get max time point
    max_time = max(timelist)
    
    # get index of max values
    res = [x for x, z in enumerate(timelist) if z == max_time] 
    
    # add 2 back to account for nan and time removed earlier  plus one for the indexing factor
    res = [x+3 for x in res]
    
    # we now have the indexes and are ready to slice
    
    # first off grab the well and sample names
    well_and_sample_names = df.iloc[5:,:2].copy()
    well_and_sample_names = well_and_sample_names.reset_index(drop=True)

    
    # iterate over the indexes
    for i, indexer in enumerate(res):
        
        # if it's the first one
        if i == 0:
            
            #intialise new df from the metadata before
            #add spacer rows
            reshuffled = df_head.copy()
            reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
            
            # grab the first one
            chromatic = df.iloc[5:,2:indexer].copy()
            chromatic = chromatic.reset_index(drop=True)
            
            # add the well and sample names
            chromatic = pd.concat([well_and_sample_names, chromatic], axis=1)
            chromatic = chromatic.reset_index(drop=True)
            
            # append to reshuffled
            reshuffled = reshuffled.append(chromatic, ignore_index=True)
            reshuffled = reshuffled.reset_index(drop=True)
            
            # save v for slicing the next bit
            last_indexer = indexer
            
            
        # if more than the first    
        else:
            
            # add the gap spacers
            for i in range(0,gap,1):
                print(i)
                reshuffled = reshuffled.append(pd.Series(), ignore_index=True)

            
            # grab the next one
            chromatic = df.iloc[5:, last_indexer: indexer].copy()
            chromatic = chromatic.reset_index(drop=True)
            
            
            # add the well and sample names
            chromatic = pd.concat([well_and_sample_names, chromatic], axis=1)
            chromatic = chromatic.reset_index(drop=True)
            
            # resets the column index
            chromatic = chromatic.T.reset_index().T.reset_index(drop=True)
            chromatic = chromatic.iloc[1:,:]
            
            #print(chromatic)
            
            # append to reshuffled
            reshuffled = reshuffled.append(chromatic, ignore_index=True)
            reshuffled = reshuffled.reset_index(drop=True)
            
            # save v for slicing the next bit
            last_indexer = indexer
        
    
    
    # carve off any overhang Nans
    
    reshuffled = reshuffled.iloc[:,:res[0]-1]
    
    
    # making the new name by replacing raw with parsed_dataset
    new_name = prefix + "_parsed_dataset.csv"
    
    #######################################################################

    print(os.getcwd())
    path = "/app/data/parsed_data_files/"
    # make directory for sticking the output in
    if os.path.isdir(path) == False:
        os.mkdir(path, mode=0o777)

    os.chdir(path)
    reshuffled.to_csv(new_name, header=False, index=False)


    #navigate home for neatness
    os.chdir('/app')

            
        

0
1
2
/app/data/raw_data_files


  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(chromatic, ignore_index=True)
  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(pd.Series(), ignore_index=True)
  reshuffled = reshuffled.append(chromatic, ignore_index=True)


# Tidy Data Script

This section actually does the metadata annotation

In [9]:
# navigate into processed_data_files and grab the parsed dataset
os.chdir('/app/data/parsed_data_files')

In [10]:

def import_data(filename):
    """ 
    Imports the raw .csv
    Segregates the metadata out and stores it along with the actual data in a dictionary
    """
    
    data = pd.read_csv(filename, header=None)
   

    date = data.iloc[0,1]
    metadata = data.iloc[:3,:3]

    data = data.iloc[5:,:]
    data.reset_index(inplace=True)
    data = data.iloc[:,1:]
    

    keys = ['date', 'metadata','raw_data']
    values = [date, metadata, data]

    data_dict = {keys: values for keys, values in zip(keys, values)}
    return data_dict

In [11]:


def slice_dataframe_based_on_experimental_number(num_of_experiments, num_of_standards, raw_data):
    """"Slices data frame based on the experimental number."""
    """ Takes in the experimental and standards numbers as determined in the determine_experimental_data(), the gap number from metadata.json """

    """ Includes negative control in experimental number """

    """ Returns a python list of dataframes for each chromatic."""


    # initial empty list
    chromatic_list = []
    
    # first chromatic
    # The chromatic slice factor is effectively the number of rows in each chromatic.
    chromatic_slicefactor = num_of_experiments + metadataheader + num_of_standards  

    # just take the first chromatic at the top of the raw data DF
    chromatic_1_slice = raw_data.iloc[:chromatic_slicefactor,:]
    #append to list 
    chromatic_list.append(chromatic_1_slice)
    
    #iterate over the rest
    for i in range(1, number_of_chromatics,1):
        
        # move down!
        old_factor = chromatic_slicefactor + gap
        
        chromatic_slicefactor = chromatic_slicefactor + gap + num_of_experiments + metadataheader + num_of_standards
        chromatic_slice = raw_data.iloc[old_factor : chromatic_slicefactor,:]
        chromatic_list.append(chromatic_slice)
    
    return chromatic_list 



In [12]:

def determine_experimental_number():
    """
    Works out how many wells are used for actual experiments.
    Works by iterating through the well metadata dictionary and simply counting the standards and experiments.
    Returns the numbers for both.
    """
    
    num_of_experiments = 0
    num_of_standards = 0
    #go through the well_designation dict and if a reaction type is not a standards, plus 1 to num of experiments
    for well in well_designation.keys():
        if well_designation[well]['Reaction Type'] != 'Standard':
            num_of_experiments +=1
        else:
            num_of_standards +=1

    return num_of_experiments, num_of_standards


In [13]:
##########################################################################################

chromatic_name_list_for_saving = []

## get the dataset name
dataset_name = prefix + "_parsed_dataset.csv"

# import the data using the function defined at the top
raw_package_dict = import_data(dataset_name)

#get raw data from dictionary
raw_data = raw_package_dict['raw_data']

# extracting experimental details using the dict
num_of_experiments, num_of_standards = determine_experimental_number()
print(num_of_experiments)

# make the list
chromatic_list = slice_dataframe_based_on_experimental_number(num_of_experiments,num_of_standards, raw_data)



10


# Organise time and refactor into integer minutes

In [14]:
# get timelist to determine time format
minute_list = chromatic_list[0].iloc[1,:]

# get 0h 0min
first_timepoint = minute_list.loc[2]


# check to see which time format the data is in.
# if doesn't contain h
if(~first_timepoint.__contains__('h')):

    # convert to python list
    minute_list = list(minute_list)
    
    #delete first two, nan and time
    minute_list = minute_list[2:]
    
    # delete any nans
    minute_list =  [x for x in minute_list if  pd.isnull(x) == False]
    
    #remove the last four characters of every string
    minute_list = [sub[:-5] for sub in minute_list]
    
    #make them all numerical
    minute_list = [int(sub) for sub in minute_list]
    
    # add time and thing back
    minute_list.insert(0,'Time (Mins)')
    minute_list.insert(0,'Sample')
    
    # great
    print(minute_list)
    # now put 

else:
    #get time list
    time = list(raw_data.iloc[raw_data[raw_data.iloc[:,1]=='Time'].index[0],:])
    #delete inital nan

    if math.isnan(float(time[0])) == True:
        del time[0]
        del time[0]
        print('Time list is ready, deleted nan and time')
    elif time[0] == 'Time':
        del time[0]
        print('Time list is ready, deleted time')
    elif time[0] == '0 h ':
        print('Time list is ready')
    else:
        print('Time list is weird')


    #remove nans
    time = [x for x in time if str(x) != 'nan']
    
    # generate the minute list

    minute_list= []

    for i in time:
        #sets the first character of the string as a numeric
        a = int(str(i)[0])

        if len(i) == 4:
            # if the time is a single hour, e.g. '2 h', multiply the first character by 60 and return
            hour = int(str(i)[0]) * 60
            # addition
            total_time = hour
            # add to list
            minute_list.append(total_time)

        if len(i) == 5:
            # if the time is a single hour, e.g. '2 h', multiply the first character by 60 and return
            hour = int(str(i)[0:2]) * 60
            # addition
            total_time = hour
            # add to list
            minute_list.append(total_time)

        elif len(i) == 9:
            # multiply the first character by 60
            hour = int(str(i)[0]) * 60
            # save the fifth character
            minute = int(str(i)[4])
            # addition
            total_time = hour + minute
            # add to list
            minute_list.append(total_time)

        elif len(i) == 10:
            #if the first number is a less than 10 (1 not 11)
            if i[1].isspace():
                # multiply the first character by 60
                hour = int(str(i)[0]) * 60
                # save the fifth and sixth characters
                minute = int(str(i)[4:6])
                # addition
                total_time = hour + minute
                # add to list
                minute_list.append(total_time)
            else:
                # multiply the first character by 60
                hour = int(str(i)[0:2]) * 60
                # save the fifth and sixth characters
                minute = int(str(i)[5])
                # addition
                total_time = hour + minute
                # add to list
                minute_list.append(total_time)

        elif len(i) == 11:
            # multiply the first character by 60
            hour = int(str(i)[0:2]) * 60
            # save the fifth and sixth characters
            minute = int(str(i)[5:7])
            # addition
            total_time = hour + minute
            # add to list
            minute_list.append(total_time)


    minute_list.insert(0,'Time (Mins)')
    minute_list.insert(0,'Sample')

['Sample', 'Time (Mins)', 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258, 260, 262, 264, 266]


# Get the name of the chromatic to annotate with later

In [15]:

#store the chromatic names for saving the data
for i,v in enumerate(chromatic_list):

    
    #get the whole string contatining the chromatic name
    chromatic_name = chromatic_list[i].iloc[0,2]
    
    print(chromatic_name)
    
    # get index of the '(' character in the string'
    chromatic_name_index = chromatic_list[i].iloc[0,2].index('(')
    
    # trim the string to include all the characters after the ( 
    chromatic_name = chromatic_name[chromatic_name_index+1:]
    
    # get rid of the last character )
    chromatic_name = chromatic_name[:-1]
    
    chromatic_name = chromatic_name.replace(" ", "_")
    chromatic_name = chromatic_name.replace("/", "_")
    
    chromatic_name_list_for_saving.append(chromatic_name)

Raw Data (485/520 2)
Raw Data (485/520 1)


# Put the integer time list back on each chromatic df

In [16]:
def add_time(df, time_list):

    #check index correct
    df.reset_index(inplace=True)
    df = df.iloc[:,1:]
    
    #delete spectral and time row if necessary
    while df.iloc[0,1] != "Time":
        df = df.iloc[1:,:]
        df.reset_index(inplace=True)
        df = df.iloc[:,1:]

    #insert the minute lists
    df.iloc[0,:] = minute_list

    return df

for i,v in enumerate(chromatic_list):
    chromatic_list[i] = add_time(chromatic_list[i], minute_list)
    

In [17]:
def reorder(df):
    
    #get the row that is to be the columns
    column_row = df.iloc[df.loc[df.iloc[:,1]=='Time (Mins)',:].index[0]]
    
    #rename columns
    df.columns = column_row
    
    
    df = df.iloc[1:,:]
    
    # save sample and time for the melt
    keep_these = column_row[:2]
    #drop Sample and Time
    cols_time = column_row[2:]

    df = pd.melt(df,
            
            id_vars=keep_these,
            
            value_vars=cols_time,
            
            var_name='Time (Minutes)',
            value_name='RFUs'
           )
    
    # rename Sample to well and Time (Mins to Sample Id)
    df.rename(columns = {"Sample": "Well", "Time (Mins)":"Sample ID"}, inplace = True)
    
    ##replace in categories list
    #categories[categories.index('Time (Mins)')] = 'Time (Minutes)'
    #categories[categories.index('Sample')] = 'Sample ID'
    #categories = categories + ['Well']

    
    return df


# iterate over the chromatic list and...
for i,v in enumerate(chromatic_list):
        
        # call reorder() on each chromatic
        chromatic_list[i] = reorder(chromatic_list[i])
        
        #Add chromatics and gains
        print(i)
        print(chromatic_name_list_for_saving[i])
        print(chromatics_designations)
        
        
        # look up the relevent gain setting (1500 or whatever) in the chromatic designations metadata dictionary using the
        # appropriate key (e.g. 485_520_1)
        chromatic_list[i]['Gain Setting'] = chromatics_designations[chromatic_name_list_for_saving[i]]
        
        # save the key
        chromatic_list[i]['Chromatic Settings'] = chromatic_name_list_for_saving[i]
        
        # chop off the "_1" bit 
        chromatic_list[i]['Chromatic Settings'] = chromatic_list[i]['Chromatic Settings'].str.replace('_'+str(i+1), '')
        
        ##### in this bit we're going to use the chromatic setting (485_520) to determine what the expression product was and 
        # then annotating appropriately
        
        if chromatic_name_list_for_saving[i][:-2] == "485_520":
            
            chromatic_list[i]['Expression Product'] = "GFP"
        
        elif chromatic_name_list_for_saving[i][:-2] == "635_680":
            
            chromatic_list[i]['Expression Product'] = "Malachite Green"
            
        else:
            
            chromatic_list[i]['Expression Product'] = "Unknown"
               

0
485_520_2
{'485_520_2': '1500', '485_520_1': '800'}
1
485_520_1
{'485_520_2': '1500', '485_520_1': '800'}


# Time to compile the chromatic list DFs into a tidy data set

In [25]:
# Concatenate
tidy_data = pd.concat(chromatic_list, axis=0, sort=False)

# remove duplicated rows and or columns
tidy_data = tidy_data.loc[:,~tidy_data.columns.duplicated()]


# Adding the Well-Specific Metadata

Uses the well specific metadata to annotate each well.

In [19]:
def get_metadata_categories_from_well_designation(dictionary=well_designation):
    
    meta_data_arr = np.array([])

    for well in dictionary.keys():

        meta_data_arr = np.append(meta_data_arr, np.array(list(dictionary[well].keys())))

    meta_data_arr = np.unique(meta_data_arr)

    return meta_data_arr

meta_data_arr = get_metadata_categories_from_well_designation()


In [20]:
def assign_metadata_by_well(df, metadatacolumn):
    
    """takes the metadata given, 
    creates a column then iterates over the wells,
    looks up that well in the well_designation dictionary,
    if that well has the metadata given then it adds it to that well's entry in the new column,
    if not then it leaves it blank.
    returns the dataframe"""
    
    #creates blank column
    df[metadatacolumn] =''

    #iterates over the wells
    for well in df['Well'].unique():
        
        if metadatacolumn in well_designation[well]:
            
            df.loc[df['Well'] == well, metadatacolumn] = well_designation[well][metadatacolumn]
            
        else:
            
            df.loc[df['Well'] == well, metadatacolumn] = ''
    
    return df

for meta in meta_data_arr:
    tidy_data = assign_metadata_by_well(tidy_data, meta)
tidy_data

Unnamed: 0,Well,Sample ID,Time (Minutes),RFUs,Gain Setting,Chromatic Settings,Expression Product,Amplicon DNA Template,Energy Solution,Reaction Type,Replicate,System
0,B12,Sample X1,0,3323,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,1,Michael_Lysate_001
1,B13,Sample X2,0,3397,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,2,Michael_Lysate_001
2,B14,Sample X3,0,3221,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,3,Michael_Lysate_001
3,B15,Sample X4,0,3233,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,4,Michael_Lysate_001
4,B16,Sample X5,0,3220,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,5,Michael_Lysate_001
...,...,...,...,...,...,...,...,...,...,...,...,...
1335,B17,Sample X6,266,77,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,6,Michael_Lysate_001
1336,B18,Sample X7,266,78,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,7,Michael_Lysate_001
1337,B19,Sample X8,266,78,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,8,Michael_Lysate_001
1338,B20,Sample X9,266,97,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,9,Michael_Lysate_001


# Experiment-Wide MetaData

Since this is experiment-wide, we can just make a new column and set it for all.

In [26]:
for meta in metadata.keys():
    tidy_data[meta] = metadata[meta]
tidy_data

Unnamed: 0,Well,Sample ID,Time (Minutes),RFUs,Gain Setting,Chromatic Settings,Expression Product,Reaction Temperature (°C),Performed by,Instrument,...,lysate_aspirate_height,lysate_aspirate_height_inc,substrates_aspirate_height,substrates_aspirate_height_inc,wax_dispense_volume,wax_dispense_height,wax_new_tip,wax_touch_tip,wax_air_gap,wax_disposal_volume
0,B12,Sample X1,0,3323,1500,485_520_2,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
1,B13,Sample X2,0,3397,1500,485_520_2,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
2,B14,Sample X3,0,3221,1500,485_520_2,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
3,B15,Sample X4,0,3233,1500,485_520_2,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
4,B16,Sample X5,0,3220,1500,485_520_2,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,B17,Sample X6,266,77,800,485_520_1,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
1336,B18,Sample X7,266,78,800,485_520_1,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
1337,B19,Sample X8,266,78,800,485_520_1,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30
1338,B20,Sample X9,266,97,800,485_520_1,GFP,30,AP & MJS,BMG POLARstar Omega,...,4.5,0.4,9,0.4,35,5,never,True,20,30


In [22]:
#print(tidy_data)
tidy_data['Assay Date'] = raw_package_dict['metadata'].iloc[1,1][6:]
tidy_data['Assay Date'] = pd.to_datetime(tidy_data['Assay Date'])
tidy_data

Unnamed: 0,Well,Sample ID,Time (Minutes),RFUs,Gain Setting,Chromatic Settings,Expression Product,Amplicon DNA Template,Energy Solution,Reaction Type,...,lysate_aspirate_height_inc,substrates_aspirate_height,substrates_aspirate_height_inc,wax_dispense_volume,wax_dispense_height,wax_new_tip,wax_touch_tip,wax_air_gap,wax_disposal_volume,Assay Date
0,B12,Sample X1,0,3323,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
1,B13,Sample X2,0,3397,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
2,B14,Sample X3,0,3221,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
3,B15,Sample X4,0,3233,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
4,B16,Sample X5,0,3220,1500,485_520_2,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,B17,Sample X6,266,77,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
1336,B18,Sample X7,266,78,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
1337,B19,Sample X8,266,78,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06
1338,B20,Sample X9,266,97,800,485_520_1,GFP,s70_deGFP_MGA,Michael_ES_001,TXTL,...,0.4,9,0.4,35,5,never,True,20,30,2022-01-06


# Save to CSV

In [23]:

path = "/app/data/tidy_data_files/"

# make directory for sticking the output in
if os.path.isdir(path) == False:
    os.mkdir(path, mode=0o777)
    
    
#navigate to tidy_data_files
os.chdir(path)

tidy_data.to_csv(prefix+"_tidy_data.csv", header=True, index=False)


#navigate home for neatness
os.chdir('/app')