# Parse Results

Using this right now to iterate through an experiment's directory and parse all the annotation result files
and append them into one csv file per experiment.

### Roadmap:
1. Parse a **manual** results file and generate a dataframe from it
2. Iterate an experiment directory, parse each **manual** result file and append to an experiment dataframe
3. **DETOUR** Fix how we create and populate **automated anotations** for easier storage and reading
4. Add in functionality to read in **automated annotations**
5. Add in functionality to populate Google Sheets with results with **PROPER CHECKING FOR DUPLICATES**
6. Get away from Google Sheets and use a proper database and visualization solution

In [1]:
import os
import pandas as pd

In [73]:
annotations_dir = "/Volumes/the_box/CURRENT ANNOTATIONS"

single_cell_dir = "10-22 Jurkat TNFa varying on rates"
triple_cell_dir = "10-21 varying on JTIC16"

experiment_dir = f"{annotations_dir}/{triple_cell_dir}"

test_lane = "LN1_2"
current_dir = f"{experiment_dir}/{test_lane}"

In [178]:
def generate_exp_results_file(experiment_dir, save=True):
    exp_results_file_name = "exp_results.csv"

    # NOTE: removed sum/aggregate column so we can just have a raw data db and then do analyses on a separate sheet
    # or something
    db_cols = ["Date", "Name", "Lane", "Lag Binder", "ICD", "Cell", "Counts"]

    print(f"Testing on {experiment_dir}")
    results_path = f"{experiment_dir}/{exp_results_file_name}"
    if not os.path.exists(results_path):
        print("Creating empty results df")
        print("***")
        exp_df = pd.DataFrame(columns=db_cols)
        lane_dirs = [f"{experiment_dir}/{file}" for file in os.listdir(experiment_dir) if os.path.isdir(f"{experiment_dir}/{file}")]

        for lane in lane_dirs:
            print(f"Scanning: {lane}")
            lane_rows = generate_db_rows(lane)

            # No results file found
            if lane_rows is None:
                continue
            else:
                print(f"Adding: {len(lane_rows)} rows to db...")
                for row in lane_rows:
                    #print(row)
                    # This needs to be transposed to be a row
                    row_df = pd.DataFrame(row).T
                    # Column names need to be added in after transpose i think...
                    row_df.columns = db_cols
                    #print(row_df)
                    exp_df = exp_df.append(row_df)
            print("---")
            
        if save:
            exp_df.to_csv(f"{experiment_dir}/{exp_results_file_name}")
    else:
        print("Reading exisiting results file")
        exp_df = pd.read_csv(results_path)
    return exp_df

In [180]:
exp_dirs = [e for e in os.listdir(annotations_dir) if os.path.isdir(f"{annotations_dir}/{e}")]
exp_results 
#generate_exp_results_file(experiment_dir,save=True)

10-28 ICAM lag series ON
10-22 Jurkat TNFa varying on rates
10-21 varying on JTIC16
09-03 JTIC16 varying on rates


In [177]:
exp_df

Unnamed: 0,Date,Name,Lane,Lag Binder,ICD,Cell,Counts
0,10-21,varying on JTIC16,LN_6,none,Jurkat,Jurkat,"[0, 0, 0]"
0,10-21,varying on JTIC16,LN_6,Lag16,PSGL1 Tether,Lag16 PSGL1 Tether,"[2, 7, 5]"
0,10-21,varying on JTIC16,LN_6,Lag16,ICAM,Lag16 ICAM,"[4, 13, 11]"
0,10-21,varying on JTIC16,LN_2,none,Jurkat,Jurkat,"[0, 0, 0]"
0,10-21,varying on JTIC16,LN_2,error,PSGL1 Tether,error PSGL1 Tether,"[8, 10, 9]"
0,10-21,varying on JTIC16,LN_2,L16,ICAM,L16 ICAM,"[32, 22, 32]"
0,10-21,varying on JTIC16,LN_5,none,Jurkat,Jurkat,"[0, 0, 0]"
0,10-21,varying on JTIC16,LN_5,Lag16,PSGL1 Tether,Lag16 PSGL1 Tether,"[3, 4, 5]"
0,10-21,varying on JTIC16,LN_5,Lag16,ICAM,Lag16 ICAM,"[12, 15, 22]"
0,10-21,varying on JTIC16,LN_4,none,Jurkat,Jurkat,"[0, 0, 0]"


In [98]:
# ENTRY POINT
# Need to output csv rows of the form
# Date Name Lane LagBinder ICD CellName CountsList Sum
# 10-21 varying on rates L16 ICAM  L16 ICAM  32 22 32 86
def generate_db_rows(lane_dir):
    info = get_info(lane_dir)
    results = read_results(lane_dir)

    # Results file was not found
    if results is None:
        print(f"WARNING!: Results file not found for {lane_dir}")
        return None
        
    cells = list(results.columns[:-2])

    # Need to figure out how to handle cell counts as either a list or a string to fit into one cell
    db_rows = [info + expand_cell_name(cell) + [get_cell_pos_counts(cell,results)] for cell in cells]
    return db_rows

In [101]:
# Using this to populate database row entries
# Need to read in notes file to get other info maybe?
def get_info(lane_dir):
    tokens = lane_dir.split("/")
    lane = tokens[-1]
    lane_prefix, lane_number = lane.split("_")
    std_lane_prefix = "LN"
    corrected_lane = f"{std_lane_prefix}_{lane_number}"

    date_name = tokens[-2]
    
    date_name_tokens = date_name.split(" ")
    date = date_name_tokens[0]
    name = " ".join(date_name_tokens[1:])
    #print(f"Date: {date} Name: {name} Lane: {std_lane_prefix}{lane_number}")
    info = [date,name,corrected_lane]
    return info

In [99]:
# Reads ONE LANE iterate outside
# This currently breaks ungracefully when results file not found, handle error one layer up for now?
def read_results(lane_dir):
    manual_dir = "manual/results"
    results_dir = f"{lane_dir}/{manual_dir}"
    # Does changing dir revert back function ends? Should use that instead of specifying full path each time
    print(f"Reading from: {results_dir}")
    
    results_candidates = [f for f in os.listdir(results_dir) if f.endswith(".csv")]
    print(results_candidates)
    if len(results_candidates) == 0:
        print("No results file found")
        return None
    
    # Not very elegant but should only be one file
    results_file = results_candidates[0]
    print(f"Found result file: {results_file}")
    results = pd.read_csv(f"{results_dir}/{results_file}")
    coi = get_columns_of_interest(results)
    parsed = parse_results(results,coi)
    return parsed

# This reads the Cell Counter results table and gets the column names we are interested in
def get_columns_of_interest(results):
    r_cols = list(results.columns)
    cpos_index = r_cols.index("C-pos")
    channel = r_cols[cpos_index - 1]
    cells = r_cols[1:cpos_index-1]
    position = r_cols[cpos_index+1]
    cols_of_interest = cells + [channel,position]
    return cols_of_interest

# This uses the columns of interest and gets the results while renaming them
def parse_results(results, coi):
    coi = get_columns_of_interest(results)
    
    filtered = pd.DataFrame(results[coi])
    ch_col_old_name = list(filtered.columns)[-2]
    col_swaps = {ch_col_old_name: "Channel", "Z-pos": "Position"}
    parsed = filtered.rename(columns=col_swaps)
    return parsed

In [172]:
# This reads the cleaned results table and extracts cell counts while accounting for annotation erros
def get_cell_pos_counts(cell_name, results):
    cell = results[[cell_name,"Position"]]
    cell_positions = cell.groupby("Position")
    cell_pos_counts = [sum(pos[1][cell_name]) for pos in cell_positions]
    return cell_pos_counts

def expand_cell_name(cell):
    print("vvvvvv")
    l_binders = ["L"+ str(num) for num in [16,17,42,18]]
    lag_binders = ["Lag"+ str(num) for num in [16,17,42,18]]
    
    # Inconsistent annotation is me giving me headaches
    lag_binders = l_binders + lag_binders
    icds = ["ICAM", "PSGL1 Tether", "ICAM Tether"]
    
    cell_tokens = cell.split(" ")
    print(cell_tokens)
    
    # Initializing, probably better way to do this
    binder = icd = name = ""
    
    # Just to accomodate some annotation names
    jurkats = ["Jurkat", "Jurkats"]
    # Jurkat case
    for j in jurkats:
        if j in cell_tokens:
            #print("Jurkat found")
            binder = "none"
            icd = "Jurkat"
            name = "Jurkat"
            return [binder,icd,name]
        
    # Other cells
    for t in cell_tokens:
        print(f"Checking if {t} is a lag binder...")
        if t in lag_binders:
            print("...yes")
            binder = t

            # Trim out binder to parse ICD
            cell_tokens.remove(t)
            break
        else:
            # This shouldn't happen
            binder = "error"
    
    print("---")
    
    # Checking for our ICDs
    print(cell_tokens)
    icd_candidate = " ".join(cell_tokens)
    print(f"Checking if {icd_candidate} is an ICD...")
    if icd_candidate in icds:
        print("...yes")
        icd = icd_candidate
    else:
        # This should never happen
        icd = "error?"
                
    # Rebuilding name in standard order        
    name = f"{binder} {icd}"
        
    #print(f"Lag Binder: {binder} ICD: {icd} Name: {name}")
    return [binder,icd,name]

In [60]:
rows = generate_db_rows(current_dir)
for r in rows:
    print(r)

Date: 10-21 Name: varying on JTIC16 Lane: LN2
Reading from: /Volumes/the_box/CURRENT ANNOTATIONS//10-21 varying on JTIC16/LN1_2/manual/results
Found
['10-21', 'varying on JTIC16', 'LN_2', 'none', 'Jurkat', 'Jurkat', [0, 0, 0]]
['10-21', 'varying on JTIC16', 'LN_2', 'error', 'PSGL1 Tether', 'error PSGL1 Tether', [8, 10, 9]]
['10-21', 'varying on JTIC16', 'LN_2', 'L16', 'ICAM', 'L16 ICAM', [32, 22, 32]]
