# Crosscheck health statuses 
- continuation of BleachedSamples.ipynb
- crosscheck sample health statuses with colony conditions
- currently (8/25/2025) looking for mismatches in health statuses so the discrepancies can be investigated and fixed manually
- next: make sure each colony has the correct number of samples per sample date

In [1]:
import numpy as np
import pandas as pd
import os
import re

In [2]:
# Read the sample sheet and the colony sheet from the local SCTLD data folder.
sample = pd.read_csv('~/Documents/sctld/SCTLD_samples/Sample_Data/CBC_samples.csv')
colony = pd.read_csv('~/Documents/sctld/SCTLD_samples/Sample_Data/CBC_ColonyData.csv')
# "Unnamed: 0" is presumably a leftover index column from an earlier export
# (TODO confirm); it is not used below, so discard it.
colony = colony.drop(columns="Unnamed: 0")

In [3]:
# Give every sample row a colony identifier of the form
# "T<TransectNum>_<NewTagNum>_<Species>".
sample = sample.copy()
sample['colony_id'] = (
    'T' + sample['TransectNum'].astype(str)
    + '_' + sample['NewTagNum'].astype(str)
    + '_' + sample['Species'].astype(str)
)

In [4]:
# Give every colony row the same "T<TransectNum>_<NewTagNum>_<Species>"
# identifier so the colony sheet can be joined to the sample sheet.
colony = colony.copy()
colony['colony_id'] = (
    'T' + colony['TransectNum'].astype(str)
    + '_' + colony['NewTagNum'].astype(str)
    + '_' + colony['Species'].astype(str)
)

In [5]:
# Filter for UML samples for crosschecking: keep the rows whose Sample_type
# is Core_EtOH or Core_RNAlater.
core_types = ['Core_EtOH', 'Core_RNAlater']
sample_subset = sample[sample['Sample_type'].isin(core_types)]

# Species codes present in the full (unfiltered) sample sheet.
species_list = sample['Species'].unique().tolist()

# colony_id plus every colony column whose name contains "_Condition".
cond_cols = ['colony_id'] + [col for col in colony.columns if '_Condition' in col]

In [6]:
# Build `merged`: one row per sample, paired with the condition recorded for
# the same colony in the same Month_year.

# Keep only the id and the per-visit condition columns ...
small_colony = colony[cond_cols]

# ... and reshape them from wide (one column per visit) to long
# (one row per colony per visit).
conditions_long = small_colony.melt(
    id_vars='colony_id',
    var_name='Month_year',
    value_name='colony_condition',
)

# The first run of digits in each condition column name is used as the
# Month_year code, as an int, to match the sample sheet's Month_year format.
conditions_long['Month_year'] = (
    conditions_long['Month_year'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Re-attach the colony-level metadata to every (colony, visit) row.
cols_keep = ['colony_id', 'Date_InitialTag', 'Transect', 'Species', 'immune_y/n', 'Date_DocumentedMortality']
colony_filtered = conditions_long.merge(colony[cols_keep], on='colony_id', how='left')

# Sample columns needed for the crosscheck (USING SUBSET FOR NOW); the sample
# health status is renamed so it cannot be confused with the colony condition.
filtered_samples = sample_subset[
    ['colony_id', 'Month_year', 'Health_status', 'Sampling_notes', 'Tubelabel_species']
]
filtered_samples = filtered_samples.rename(columns={'Health_status': 'sample_condition'})

# Pair colony visits with samples on colony id and Month_year, keeping
# unmatched rows from both sides.
merged = colony_filtered.merge(
    filtered_samples,
    on=['colony_id', 'Month_year'],
    how='outer',
)

# Drop the colony visits that have no sample attached.
has_sample = merged['Tubelabel_species'].notna()
merged = merged[has_sample]

In [35]:
merged['sample_condition'].unique()

array(['Diseased_Margin', 'Diseased_Tissue', 'Healthy', 'Bleached_Tissue',
       'unknown', 'Unknown'], dtype=object)

In [39]:
# Show the distinct colony conditions so inconsistent labels are visible.
print(merged['colony_condition'].unique())
# "Healthy " (trailing space) is the same condition as "Healthy"; fold it in.
has_trailing_space = merged['colony_condition'] == "Healthy "
merged.loc[has_trailing_space, 'colony_condition'] = "Healthy"

['Diseased' 'Healthy' nan 'CLP' 'Diseased, CLP' 'Diseased, CLB' 'CLB,CLP'
 'DC' 'CLB' 'CLP,CLB' 'Diseased_Other' 'CLB,DC' 'Diseased_Other, CLP'
 'CLP,DC' 'Not_Visited' 'Not_visited']


In [40]:
# create function to crosscheck colony health with samples 

def status_match(group):
    """Return True when a colony's recorded condition agrees with its samples.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one colony at one ``Month_year``. Must contain the columns
        ``colony_condition``, ``sample_condition``, ``Month_year`` and
        ``immune_y/n``. The colony-level values (condition, month, immune
        flag) are read from the first row.

    Returns
    -------
    bool
        Result of the first rule below that applies to the colony condition:

        1. condition is missing -> every sample status must be missing.
        2. ``Healthy`` -> every non-missing sample status must be
           ``Healthy`` (also True when there are no non-missing statuses).
        3. ``Diseased`` / ``Diseased_Other`` (exact label) -> the samples
           must include both ``Diseased_Tissue`` and ``Diseased_Margin``.
        4. label contains ``CLP`` or ``CLB`` -> the samples must include
           ``Bleached_Tissue``. Combined labels such as ``Diseased, CLP``
           are judged by this rule, because rule 3 only matches the exact
           labels.
        5. ``Dead`` / ``Not_Visited`` (and its spelling variants) / ``NAN``
           -> every sample status must be missing.
        6. any other label (e.g. ``DC``): if the colony is non-immune
           (``immune_y/n == 'n'``) and the month is an immune sampling
           date, every sample status must be missing; otherwise False.

    Leading/trailing whitespace in the condition label is ignored, so a label
    such as ``"Healthy "`` is treated as ``"Healthy"``.
    """
    cond = group['colony_condition'].iloc[0]
    samples = group['sample_condition']
    no_samples = all(pd.isna(s) for s in samples)

    # Missing condition: consistent only if nothing was sampled either.
    if pd.isna(cond):
        return no_samples

    # Labels in the colony sheet can carry stray whitespace; normalise so
    # the comparisons below do not depend on an earlier clean-up step.
    if isinstance(cond, str):
        cond = cond.strip()

    # Missing sample statuses are ignored by the "contains" checks below.
    statuses = samples.dropna().tolist()

    if cond == 'Healthy':
        return all(s == 'Healthy' for s in statuses)

    # A diseased colony is expected to have both a tissue and a margin
    # sample (these sit on two separate rows of the group).
    if cond in ('Diseased', 'Diseased_Other'):
        return all(x in statuses for x in ('Diseased_Tissue', 'Diseased_Margin'))

    # Substring test on purpose: the label may list several codes.
    if 'CLP' in cond or 'CLB' in cond:
        return 'Bleached_Tissue' in statuses

    # Colonies that were dead or not visited should have no samples.
    if cond in ('Dead', 'Not_Visited', 'Not_visited', 'not_visited', 'NAN'):
        return no_samples

    # Immune sampling points, written in the same integer form as Month_year.
    immune_dates = (112023, 122023, 12024, 22024, 62024, 82024, 122024)
    month = group['Month_year'].iloc[0]

    # A non-immune colony at an immune sampling point is fine as long as it
    # has no samples.
    # TODO: this has to exclude T5 and T6 during 01/2024 and 04/2024
    # respectively because they were newly tagged then.
    if group['immune_y/n'].iloc[0] == 'n' and month in immune_dates:
        return no_samples

    # Anything else is reported as a mismatch for manual review.
    return False

In [41]:
# Crosscheck colony and sample conditions.
# Each (colony_id, Month_year) group of `merged` becomes one summary record:
# colony-level fields come from the group's first row, sample-level fields are
# collected into lists, and 'Match' holds the verdict of status_match().
checks = [
    {
        'colony_id': colony_id,
        'Species': group['Species'].iloc[0],
        'immune_y/n': group['immune_y/n'].iloc[0],
        'Date_InitialTag': group['Date_InitialTag'].iloc[0],
        'Month_year': month,
        'Match': status_match(group),
        'colony_condition': group['colony_condition'].iloc[0],
        'sample_condition': group['sample_condition'].tolist(),
        'sample_ids': group['Tubelabel_species'].tolist(),
        'mortality_date': group['Date_DocumentedMortality'].iloc[0],
    }
    for (colony_id, month), group in merged.groupby(['colony_id', 'Month_year'])
]
# One row per colony per Month_year.
checks_df = pd.DataFrame(checks)

In [42]:
# Collect the mismatches to investigate, one DataFrame per species.

# Exclude 122024 since no samples were taken.
sampled_checks = checks_df[checks_df['Month_year'] != 122024]

# A row is a mismatch when the crosscheck failed AND the colony has a
# recorded condition (rows with a missing colony condition are left out).
is_mismatch = sampled_checks['Match'].eq(False) & sampled_checks['colony_condition'].notna()
mismatches = sampled_checks[is_mismatch]

# Species code -> that species' mismatching rows (empty frame if none).
false_matches = {
    specie: mismatches[mismatches['Species'] == specie]
    for specie in species_list
}

In [43]:
species_list

['OFAV',
 'SSID',
 'PAST',
 'PSTR',
 'MCAV',
 'OANN',
 'DLAB',
 'CNAT',
 'MMEA',
 'Unknown']

In [46]:
false_matches['OFAV']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
392,T2_80_OFAV,OFAV,n,5/24/22,92023,False,"CLP,CLB","[Diseased_Margin, Diseased_Tissue]","[092023_BEL_CBC_T2_191_OFAV, 092023_BEL_CBC_T2...",Diseased
696,T4_77_OFAV,OFAV,n,12/5/22,92023,False,Healthy,"[Diseased_Tissue, Diseased_Margin]","[092023_BEL_CBC_T4_90_OFAV, 092023_BEL_CBC_T4_...",Healthy


In [47]:
false_matches['SSID']
# TODO: some colonies are recorded as Not_visited for a month in which samples were nonetheless taken -- find out why

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
3,T1_11_SSID,SSID,n,6/24/19,42024,False,Diseased,[Diseased_Margin],[042024_BEL_CBC_T1_936_SSID],Diseased
7,T1_11_SSID,SSID,n,6/24/19,92023,False,Diseased,[Diseased_Margin],[092023_BEL_CBC_T1_193_SSID],Diseased
34,T1_17_SSID,SSID,n,6/24/19,42024,False,"Diseased, CLP","[Diseased_Tissue, Diseased_Margin]","[042024_BEL_CBC_T1_940_SSID, 042024_BEL_CBC_T1...",Diseased
38,T1_17_SSID,SSID,n,6/24/19,92023,False,"Diseased, CLB","[Diseased_Tissue, Diseased_Margin]","[092023_BEL_CBC_T1_196_SSID, 092023_BEL_CBC_T1...",Diseased
43,T1_1_SSID,SSID,n,6/21/19,42024,False,Diseased,[Diseased_Margin],[042024_BEL_CBC_T1_937_SSID],Diseased
47,T1_1_SSID,SSID,n,6/21/19,92023,False,Diseased,[Diseased_Margin],[092023_BEL_CBC_T1_185_SSID],Diseased
121,T1_3_SSID,SSID,y,6/21/19,22024,False,Diseased,[Healthy],[022024_BEL_CBC_T1_773_SSID],Diseased
123,T1_3_SSID,SSID,y,6/21/19,52022,False,Diseased,[Diseased_Tissue],[052022_BEL_CBC_T1_2_SSID],Diseased
127,T1_3_SSID,SSID,y,6/21/19,82024,False,Diseased,[Healthy],[082024_BEL_CBC_T1_1480_SSID],Diseased
128,T1_3_SSID,SSID,y,6/21/19,92023,False,Diseased,[Healthy],[092023_BEL_CBC_T1_173_SSID],Diseased


In [48]:
false_matches['PAST']
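# the statuses themselves agree (Diseased colony, Diseased_Tissue sample); the row is flagged only because no Diseased_Margin sample accompanies it -- missing margin sample?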

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
40,T1_19_PAST,PAST,n,6/21/19,52022,False,Diseased,[Diseased_Tissue],[052022_BEL_CBC_T1_52_PAST],4/1/24


In [49]:
false_matches['PSTR']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
207,T2_54_PSTR,PSTR,n,10/13/19,92023,False,"Diseased, CLP","[Diseased_Margin, Diseased_Tissue]","[092023_BEL_CBC_T2_185_PSTR, 092023_BEL_CBC_T2...",Healthy
641,T3_75_PSTR,PSTR,n,12/3/22,42024,False,Healthy,"[Diseased_Tissue, Diseased_Margin]","[042024_BEL_CBC_T3_965_PSTR, 042024_BEL_CBC_T3...",Healthy


In [50]:
false_matches['MCAV']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
153,T1_8_MCAV,MCAV,n,6/24/19,122022,False,Diseased,[Diseased_Tissue],[122022_BEL_CBC_T1_144_MCAV],9/25/23
492,T3_2_MCAV,MCAV,y,6/23/19,12024,False,"Diseased_Other, CLP",[Healthy],[012024_BEL_CBC_T3_633_MCAV],Diseased
494,T3_2_MCAV,MCAV,y,6/23/19,42024,False,Diseased_Other,[Healthy],[042024_BEL_CBC_T3_956_MCAV],Diseased
499,T3_2_MCAV,MCAV,y,6/23/19,82024,False,Diseased_Other,[Healthy],[082024_BEL_CBC_T3_1553_MCAV],Diseased
603,T3_67_MCAV,MCAV,n,12/3/22,122022,False,Diseased,[Diseased_Tissue],[122022_BEL_CBC_T3_145_MCAV],9/25/23
631,T3_71_MCAV,MCAV,n,12/3/22,122022,False,DC,[Healthy],[122022_BEL_CBC_T3_155_MCAV],Healthy
817,T5_57_MCAV,MCAV,n,1/13/24,12024,False,DC,[Healthy],[012024_BEL_CBC_T5_700_MCAV],Healthy
818,T5_57_MCAV,MCAV,n,1/13/24,42024,False,DC,[Healthy],[042024_BEL_CBC_T5_1099_MCAV],Healthy
830,T5_64_MCAV,MCAV,n,1/13/24,42024,False,Not_visited,[Healthy],[042024_BEL_CBC_T5_1135_MCAV],Healthy
832,T5_65_MCAV,MCAV,n,1/13/24,12024,False,DC,[Healthy],[012024_BEL_CBC_T5_708_MCAV],Healthy


In [51]:
false_matches['OANN']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
61,T1_22_OANN,OANN,n,5/21/22,42024,False,CLP,[Diseased_Tissue],[042024_BEL_CBC_T1_1146_OANN],Diseased
67,T1_23_OANN,OANN,n,5/21/22,52022,False,DC,[Healthy],[052022_BEL_CBC_T1_35_OANN],Healthy
69,T1_23_OANN,OANN,n,5/21/22,92023,False,CLB,"[Diseased_Tissue, Diseased_Margin]","[092023_BEL_CBC_T1_186_OANN, 092023_BEL_CBC_T1...",Healthy
170,T2_30_OANN,OANN,n,12/5/22,92023,False,"CLP,CLB","[Diseased_Margin, Diseased_Tissue]","[092023_BEL_CBC_T2_193_OANN, 092023_BEL_CBC_T2...",Diseased


In [52]:
false_matches['DLAB']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
118,T1_35_DLAB,DLAB,n,5/25/22,122022,False,DC,[Healthy],[122022_BEL_CBC_T1_154_DL],Healthy
725,T4_93_DLAB,DLAB,n,12/5/22,92023,False,Healthy,"[Diseased_Tissue, Diseased_Margin]","[092023_BEL_CBC_T4_82_DLAB, 092023_BEL_CBC_T4_...",Healthy


In [53]:
false_matches['CNAT']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date


In [54]:
false_matches['MMEA']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date


In [55]:
false_matches['Unknown']

Unnamed: 0,colony_id,Species,immune_y/n,Date_InitialTag,Month_year,Match,colony_condition,sample_condition,sample_ids,mortality_date
