# Data Check 

## qa/qc to double check data after sampling trips 
- Do sample health statuses match colony conditions? 
- Check on 'dead' corals: did they stay dead (if not, change condition bc they were never dead),were they checked on, were any samples taken (possibly from the wrong coral?)
- Did we take samples from all live corals - 2 per coral, 3 on immune trips 

- Summary stats: 
    - Compare # of expected samples to how many samples were taken 
    - Num of corals at each transect (how does this compare to the previous trip?)
    - Num of corals within each condition 

In [301]:
import numpy as np
import pandas as pd
import os

In [302]:
os.getcwd()

'/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/CBC_metagenomics/Demographics'

In [303]:
# upload sample data 
sample_data=pd.read_csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/metadata/CBC_samples.csv',index_col="Tubelabel_species")

In [304]:
# convert dates to str
sample_data['Month_year'] = sample_data['Month_year'].astype(str)
# remove the decimal point
sample_data['Month_year'] = sample_data['Month_year'].str.replace('.0', '')
# add leading zeros where necessary
sample_data['Month_year'] = sample_data['Month_year'].str.pad(width=6, side='left', fillchar='0')
sample_data['Month_year']

# Extract month and year to make separate columns 
sample_data.loc[:,'Month'] = sample_data.loc[:,'Month_year'].str[0:2]
sample_data.loc[:,'Year'] = sample_data.loc[:,'Month_year'].str[2:]

In [305]:
sample_data.head()

Unnamed: 0_level_0,Month_year,Country,Location,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Time_sampled,...,Sample_type,SampleNum,Health_status,Sampling_notes,Sample_physical_location,Extraction_physical_location,Date_sequenced,Notes,Month,Year
Tubelabel_species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
122022_BEL_CBC_T4_18_OFAV,122022,BEL,CBC,12/5/22,CURLEW,4,,77,OFAV,,...,Core_frozen,18,Healthy,,,,,,12,2022
092023_BEL_CBC_T1_171_PAST,92023,BEL,CBC,9/25/23,CBC30N,1,,2,PAST,,...,Core_RNAlater,171,Healthy,CLP 90%,UML_NARWHAL_R1_B10,,,,9,2023
092023_BEL_CBC_T1_172_PAST,92023,BEL,CBC,9/25/23,CBC30N,1,,21,PAST,,...,Core_RNAlater,172,Healthy,No CL,UML_NARWHAL_R1_B10,,,,9,2023
092023_BEL_CBC_T1_173_SSID,92023,BEL,CBC,9/25/23,CBC30N,1,,3,SSID,,...,Core_RNAlater,173,Healthy,CLP 80%; DC 20%,UML_NARWHAL_R1_B10,,,,9,2023
092023_BEL_CBC_T1_174_MCAV,92023,BEL,CBC,9/25/23,CBC30N,1,,24,MCAV,,...,Core_RNAlater,174,Healthy,CLP 10%,UML_NARWHAL_R1_B10,,,,9,2023


In [306]:
# upload colony data 
colony_data=pd.read_csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/metadata/CBC_ColonyData.csv')

In [307]:
# convert integers to strings 
colony_data['NewTagNum'] = colony_data['NewTagNum'].astype(str)
colony_data['TransectNum'] = colony_data['TransectNum'].astype(str)
sample_data['NewTagNum'] = sample_data['NewTagNum'].astype(str)
sample_data['TransectNum'] = sample_data['TransectNum'].astype(str)

In [308]:
# drop 'unnamed' col
colony_data = colony_data.drop(columns=['Unnamed: 0'])

In [309]:
# fix caps issues 
colony_data['062024_Condition'].unique()

array(['Not_Visited', 'Healthy', 'Diseased', 'Dead', 'DC',
       'Diseased_Other', 'Not_visited', 'not_visited'], dtype=object)

In [310]:
# fix caps issues 
colony_data = colony_data.applymap(lambda x: 'Not_Visited' if isinstance(x, str) and x.lower() == 'not_visited' else x)
colony_data['062024_Condition'].unique()

  colony_data = colony_data.applymap(lambda x: 'Not_Visited' if isinstance(x, str) and x.lower() == 'not_visited' else x)


array(['Not_Visited', 'Healthy', 'Diseased', 'Dead', 'DC',
       'Diseased_Other'], dtype=object)

In [311]:
# add tag id to both sample and colony data to match colonies
colony_data['ID'] = colony_data['Species'] + '_T' + colony_data['TransectNum'] + '_' + colony_data['NewTagNum']
sample_data['ID'] = sample_data['Species'] + '_T' + sample_data['TransectNum'] + '_' + sample_data['NewTagNum']

In [312]:
colony_data['ID']

0       SSID_T1_1
1       PAST_T1_2
2       SSID_T1_3
3       PSTR_T1_4
4       SSID_T1_5
          ...    
214    OFAV_T6_32
215    OANN_T6_33
216    OFAV_T6_34
217    OFAV_T6_35
218    OANN_T6_36
Name: ID, Length: 219, dtype: object

In [313]:
sample_data['ID']

Tubelabel_species
122022_BEL_CBC_T4_18_OFAV     OFAV_T4_77
092023_BEL_CBC_T1_171_PAST     PAST_T1_2
092023_BEL_CBC_T1_172_PAST    PAST_T1_21
092023_BEL_CBC_T1_173_SSID     SSID_T1_3
092023_BEL_CBC_T1_174_MCAV    MCAV_T1_24
                                 ...    
62024_BEL_CBC_T4_1461_OFAV    OFAV_T4_78
NaN                           OFAV_T4_78
NaN                           MCAV_T4_76
62024_BEL_CBC_T4_1464_MCAV    MCAV_T4_76
NaN                           MCAV_T4_76
Name: ID, Length: 2103, dtype: object

In [314]:
# make 'ID' the index for sample and colony data 
sample_data.set_index('ID',inplace=True)
colony_data.set_index('ID',inplace=True)

In [315]:
#drop any accidental samples
sample_data = sample_data[sample_data['NewTagNum'] != "AS"]

In [316]:
sample_data['Sample_type'].unique()

array(['Core_frozen', 'Core_RNAlater', 'Core_EtOH', 'Probiotics',
       'Immune', 'TEM', nan, 'Syringe'], dtype=object)

In [317]:
# filter sample data to just get one sample from each date
uml_sample = sample_data[
    (sample_data['Sample_type'] == "Core_EtOH") | 
    (sample_data['Sample_type'] == "Core_RNAlater")
]


In [318]:
# view na
columns=["Month_year","Sample_type","Health_status"]
uml_sample.loc[uml_sample['Health_status'].isna(),columns]

Unnamed: 0_level_0,Month_year,Sample_type,Health_status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [319]:
# change all disease entries to diseased for easy checking 
diseased_rows=uml_sample['Health_status'].str.contains("Diseased")
diseased_rows
uml_sample.loc[diseased_rows, 'Health_status']="Diseased"

In [321]:
# change sample 102019 sample month years to 062019 since they are all grouped together in the condition data
uml_sample.loc[uml_sample['Month_year']=="102019","Month_year"]="062019"

In [322]:
# create for loop, cycle thru each data and see if sample health statuses match colony conditions 
# filter for etoh and rna later or will it work without??
# get unique values for month year 
dates=uml_sample['Month_year'].unique().tolist()
dates

['092023',
 '122022',
 '052022',
 '042024',
 '062019',
 '112023',
 '122023',
 '012024',
 '022024',
 '062024']

In [323]:
# replace NAs with Not_Visited
condition_cols=colony_data.columns[colony_data.columns.str.contains("Condition")]
colony_data[condition_cols] = colony_data[condition_cols].fillna("Not_Visited")

In [324]:
# check unique values in conditions
for date in dates:
    print(date,colony_data[f'{date}_Condition'].unique())
colony_data[condition_cols].describe()

092023 ['Diseased' 'CLP,CLB' 'Not_Visited' 'Dead' 'CLP' 'Healthy' 'Diseased, CLB'
 'CLB' 'Diseased, CLP' 'CLP,DC' 'DC']
122022 ['Diseased' 'Healthy' 'Dead' 'DC' 'Not_Visited' 'CLB,DC' 'CLB']
052022 ['Diseased' 'Healthy' 'DC' 'Dead' 'Not_Visited' 'CLB,DC']
042024 ['Diseased' 'Not_Visited' 'Healthy' 'Dead' 'Diseased, CLP' 'CLP'
 'Diseased_Other' 'DC' 'Diseased, DC' 'Diseased,DC']
062019 ['Healthy' 'Not_Visited']
112023 ['Not_Visited' 'CLB' 'CLP,CLB' 'CLP' 'Dead' 'Healthy' 'Diseased']
122023 ['Not_Visited' 'CLB' 'Healthy' 'Dead' 'CLP, DC' 'CLP' 'CLP,CLB' 'Diseased']
012024 ['Diseased' 'CLP,CLB' 'Healthy' 'Not_Visited' 'Dead' 'CLB' 'CLP' 'CLP,DC'
 'Diseased_Other' 'DC']
022024 ['Not_Visited' 'Healthy' 'DC' 'Dead' 'Diseased' 'CLB' 'CLP'
 'Diseased_Other']
062024 ['Not_Visited' 'Healthy' 'Diseased' 'Dead' 'DC' 'Diseased_Other']


Unnamed: 0,062019_Condition,052022_Condition,122022_Condition,092023_Condition,112023_Condition,122023_Condition,012024_Condition,022024_Condition,042024_Condition,062024_Condition
count,219,219,219,219,219,219,219,219,219,219
unique,2,6,7,11,7,8,10,8,10,6
top,Not_Visited,Not_Visited,Not_Visited,Not_Visited,Not_Visited,Not_Visited,Not_Visited,Not_Visited,Healthy,Not_Visited
freq,113,105,83,100,139,139,77,138,107,161


In [334]:
# create dfs by coral id incl condition and sample health status at each time point 
dataframes_dict = {}
# run for loop
for date in dates:
    # get corresponding condition col in colony data for each data 
    condition_col = colony_data.loc[:, colony_data.columns.str.contains(f'{date}_Condition')]
    # convert data_condition col into df and store in dict 
    condition_df=pd.DataFrame(condition_col)
    dataframes_dict[date] = condition_df
    # filter out not_visited into separate df and save in dict 
    visited_conditions = condition_df[
        (condition_df[f'{date}_Condition'] != "Not_Visited") &
        (condition_df[f'{date}_Condition'] != "Dead")
    ]
    dataframes_dict[f'{date}_visited'] = visited_conditions
    # make df containing samples at the same point, showing their sample type and health status
    sample_health = uml_sample[uml_sample["Month_year"] == date][["Sample_type", "Health_status"]]
    sample_health_condensed = sample_health.groupby(sample_health.index).first()
     # Merge condition data with sample health data
    merged_df = pd.merge(visited_conditions, sample_health_condensed, left_index=True, right_index=True, how='left')
    # store the merged DataFrame in the dictionary
    dataframes_dict[f'{date}_samples'] = merged_df
     # apply groupby(...).first() before merging with all condition data
    sample_health_all_condensed = sample_health.groupby(sample_health.index).first()
    # can also merge non-visited 
    merged_all=pd.merge(condition_df, sample_health_all_condensed, left_index=True, right_index=True, how='left')
    dataframes_dict[f'{date}_all'] = merged_all


In [335]:
# right now, manually look at each date...going to add all non-sampled into one df for easy viewing 
# also want to: sum up alive and visited corals and number of samples taken from each individual colony 
dataframes_dict['062024_samples'].shape

(38, 3)

In [336]:
print(dataframes_dict['062024_all'].shape)
dataframes_dict['062024_all'].head()

(219, 3)


Unnamed: 0_level_0,062024_Condition,Sample_type,Health_status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SSID_T1_1,Not_Visited,,
PAST_T1_2,Healthy,Core_RNAlater,Healthy
SSID_T1_3,Diseased,Core_RNAlater,Diseased
PSTR_T1_4,Not_Visited,,
SSID_T1_5,Not_Visited,,


In [337]:
print(dataframes_dict['062024'].shape)
print(dataframes_dict['062024'])
print(dataframes_dict['062024_visited'].shape)
print(dataframes_dict['062024_visited'])
print(sample_health.shape)
print(sample_health)

(219, 1)
           062024_Condition
ID                         
SSID_T1_1       Not_Visited
PAST_T1_2           Healthy
SSID_T1_3          Diseased
PSTR_T1_4       Not_Visited
SSID_T1_5       Not_Visited
...                     ...
OFAV_T6_32      Not_Visited
OANN_T6_33      Not_Visited
OFAV_T6_34      Not_Visited
OFAV_T6_35      Not_Visited
OANN_T6_36      Not_Visited

[219 rows x 1 columns]
(38, 1)
           062024_Condition
ID                         
PAST_T1_2           Healthy
SSID_T1_3          Diseased
PSTR_T1_12          Healthy
PAST_T1_21          Healthy
MCAV_T1_24          Healthy
ORBI_T1_25          Healthy
SSID_T2_51          Healthy
MCAV_T2_55          Healthy
PAST_T2_57          Healthy
MCAV_T2_60          Healthy
PAST_T2_63          Healthy
SSID_T2_66         Diseased
PAST_T2_68          Healthy
MCAV_T2_69          Healthy
SSID_T2_72         Diseased
SSID_T2_73               DC
OFAV_T2_76          Healthy
OFAV_T2_79          Healthy
SSID_T2_99         Diseased
MCAV_T3

In [338]:
# cycle thru samples dicts and view any mismatches 
# Initialize a list to store mismatches
mismatches = []

for date in dates:
    # Access the DataFrame from the dictionary
    df = dataframes_dict[f'{date}_all']

    # Condition 1: If 'Not_Visited' or 'Dead', there should be no sample
    condition1_violations = df[
        ((df[f'{date}_Condition'] == "Not_Visited") | (df[f'{date}_Condition'] == "Dead")) &
        (df['Sample_type'].notna() | df['Health_status'].notna())
    ]
    
    if not condition1_violations.empty:
        mismatches.append((date, 'Condition 1 Violations', condition1_violations))

    # Condition 2: If alive, condition should match health status of the sample
    condition2_violations = df[
        (df[f'{date}_Condition'] != "Not_Visited") & 
        (df[f'{date}_Condition'] != "Dead") &
        (df['Health_status'].notna()) & 
        (df[f'{date}_Condition'] != df['Health_status'])
    ]

    if not condition2_violations.empty:
        mismatches.append((date, 'Condition 2 Violations', condition2_violations))

# If there are mismatches, print or export them for review
if mismatches:
    for mismatch in mismatches:
        date, violation_type, df = mismatch
        print(f"Date: {date} - {violation_type}")
        print(df)
        print("\n")
else:
    print(date,"No mismatches found.")
# chatgpt code

Date: 092023 - Condition 2 Violations
           092023_Condition    Sample_type Health_status
ID                                                      
PAST_T1_2           CLP,CLB  Core_RNAlater       Healthy
SSID_T1_3          Diseased  Core_RNAlater       Healthy
MCAV_T1_7               CLP  Core_RNAlater       Healthy
SSID_T1_17    Diseased, CLB  Core_RNAlater      Diseased
PAST_T1_21              CLP  Core_RNAlater       Healthy
MCAV_T1_24              CLP  Core_RNAlater       Healthy
DLAB_T1_35              CLP  Core_RNAlater       Healthy
OFAV_T2_29          CLP,CLB  Core_RNAlater       Healthy
OANN_T2_30          CLP,CLB  Core_RNAlater      Diseased
PSTR_T2_32              CLP  Core_RNAlater       Healthy
PSTR_T2_54    Diseased, CLP  Core_RNAlater      Diseased
MCAV_T2_55              CLP  Core_RNAlater       Healthy
MCAV_T2_59              CLP  Core_RNAlater       Healthy
MCAV_T2_60          CLP,CLB  Core_RNAlater       Healthy
CNAT_T2_64              CLP  Core_RNAlater       H

In [287]:
## Summary stats 
# each time point, number of corals, number of live corals, num of samples divided by type 
dataframes_dict['062019_all']

Unnamed: 0_level_0,062019_Condition,Sample_type,Health_status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SSID_T1_1,Healthy,Core_EtOH,Healthy
PAST_T1_2,Healthy,Core_EtOH,Healthy
SSID_T1_3,Healthy,Core_EtOH,Healthy
PSTR_T1_4,Healthy,Core_EtOH,Healthy
SSID_T1_5,Healthy,Core_EtOH,Healthy
...,...,...,...
OFAV_T6_32,Not_Visited,,
OANN_T6_33,Not_Visited,,
OFAV_T6_34,Not_Visited,,
OFAV_T6_35,Not_Visited,,


In [288]:
# summary df plan  
#split by transect??
#df cols:       # tagged, # newly tagged, # total tagged (add the first two, should equal 219), # uml samples, # frozen, # immune
#rows = dates

In [339]:
colony_data['Date_InitialTag'].unique()


array(['6/21/19', '10/15/19', '6/24/19', '6/26/19', '5/21/22', '5/25/22',
       '12/5/22', '6/25/19', '6/22/19', '10/13/19', '5/24/22', '6/21/24',
       '6/23/19', '10/14/19', '5/20/22', '12/3/22', '1/13/24', '4/29/24'],
      dtype=object)

In [340]:
# Convert 'Date_InitialTag' to datetime format and then to 'mmyyyy' format
colony_data['Tag_Date_mmyyyy'] = pd.to_datetime(colony_data['Date_InitialTag']).dt.strftime('%m%Y')
colony_data['Tag_Date_mmyyyy']

  colony_data['Tag_Date_mmyyyy'] = pd.to_datetime(colony_data['Date_InitialTag']).dt.strftime('%m%Y')


ID
SSID_T1_1     062019
PAST_T1_2     062019
SSID_T1_3     062019
PSTR_T1_4     102019
SSID_T1_5     062019
               ...  
OFAV_T6_32    042024
OANN_T6_33    042024
OFAV_T6_34    042024
OFAV_T6_35    042024
OANN_T6_36    042024
Name: Tag_Date_mmyyyy, Length: 219, dtype: object

In [341]:
sample_data


Unnamed: 0_level_0,Month_year,Country,Location,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Time_sampled,...,Sample_type,SampleNum,Health_status,Sampling_notes,Sample_physical_location,Extraction_physical_location,Date_sequenced,Notes,Month,Year
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OFAV_T4_77,122022,BEL,CBC,12/5/22,CURLEW,4,,77,OFAV,,...,Core_frozen,18,Healthy,,,,,,12,2022
PAST_T1_2,092023,BEL,CBC,9/25/23,CBC30N,1,,2,PAST,,...,Core_RNAlater,171,Healthy,CLP 90%,UML_NARWHAL_R1_B10,,,,09,2023
PAST_T1_21,092023,BEL,CBC,9/25/23,CBC30N,1,,21,PAST,,...,Core_RNAlater,172,Healthy,No CL,UML_NARWHAL_R1_B10,,,,09,2023
SSID_T1_3,092023,BEL,CBC,9/25/23,CBC30N,1,,3,SSID,,...,Core_RNAlater,173,Healthy,CLP 80%; DC 20%,UML_NARWHAL_R1_B10,,,,09,2023
MCAV_T1_24,092023,BEL,CBC,9/25/23,CBC30N,1,,24,MCAV,,...,Core_RNAlater,174,Healthy,CLP 10%,UML_NARWHAL_R1_B10,,,,09,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OFAV_T4_78,062024,BEL,CBC,6/22/24,CURLEW,4,,78,OFAV,9:24,...,Core_RNAlater,1461,Healthy,sampled from edge,UML_NARWHAL_R5_B26,,,,06,2024
OFAV_T4_78,062024,BEL,CBC,6/22/24,CURLEW,4,,78,OFAV,9:24,...,Immune,1459,Healthy,sampled from edge,,,,,06,2024
MCAV_T4_76,062024,BEL,CBC,6/22/24,CURLEW,4,,76,MCAV,9:31,...,Core_frozen,1463,Healthy,sampled from edge,,,,,06,2024
MCAV_T4_76,062024,BEL,CBC,6/22/24,CURLEW,4,,76,MCAV,9:31,...,Core_RNAlater,1464,Healthy,sampled from edge,UML_NARWHAL_R5_B26,,,,06,2024


In [342]:
sample_data['Sample_type'].unique()

array(['Core_frozen', 'Core_RNAlater', 'Core_EtOH', 'Probiotics',
       'Immune', 'TEM', nan, 'Syringe'], dtype=object)

In [343]:
# exclude tem, na, or syringe for now 
# Exclude 'TEM', 'Syringe', or NaN values
filtered_samples = sample_data[
    (sample_data['Sample_type'] != 'TEM') & 
    (sample_data['Sample_type'] != 'Syringe') & 
    (sample_data['Sample_type'].notna())
]


In [347]:
filtered_samples['Transect'].unique()

array(['CURLEW', 'CBC30N', 'SR30N', 'Curlew', 'Lagoon', 'CBC Lagoon',
       'Lagoon ', 'BB', 'Hangman ', 'LAGOON'], dtype=object)

In [359]:
# make all t names consistent 
filtered_samples.loc[(filtered_samples['Transect'].str.contains("Curlew",case=False)),'Transect']='CURLEW'
filtered_samples.loc[(filtered_samples['Transect'].str.contains("Lagoon",case=False)),'Transect']='LAGOON'
filtered_samples.loc[(filtered_samples['Transect'].str.contains("Hangman",case=False)),'Transect']='HANGMAN'
filtered_samples['Transect'].unique()

array(['CURLEW', 'CBC30N', 'SR30N', 'LAGOON', 'BB', 'HANGMAN'],
      dtype=object)

In [361]:
# make sorted list of transects 
all_transects = filtered_samples['Transect'].unique()
sorted_transects = sorted(all_transects)

In [372]:
#make monthyear in chronological order 
filtered_samples['Month_year_datetime'] = pd.to_datetime(filtered_samples['Month_year'], format='%m%Y')

# Sort by the new datetime column
filter_samp_sorted = filtered_samples.sort_values(by='Month_year_datetime')

# Get sorted unique dates in the 'mmyyyy' format
sorted_dates = filter_samp_sorted['Month_year_datetime'].dt.strftime('%m%Y').unique()
sorted_dates

array(['062019', '102019', '052022', '122022', '092023', '112023',
       '122023', '012024', '022024', '042024', '062024'], dtype=object)

In [378]:
# still working on this 


# Initialize an empty list to store summary data for each date
summary_data = []

# Iterate through dates
for date in sorted_dates:
    # Filter data for the specific date
    date_data = filtered_samples[filtered_samples['Month_year'] == date]
    # Count tagged corals
    colony_data['
                
                
    # count number of corals sampled 
    num_tagged = len(date_data['NewTagNum'].unique())
        
        # Count UML samples
    num_uml_samples = num_core_samples = transect_data[
        (transect_data['Sample_type'] == 'Core_EtOH') | 
        (transect_data['Sample_type'] == 'Core_RNAlater')
    ].shape[0]
        
        # Count frozen samples
    num_frozen_samples = transect_data[transect_data['Sample_type'] == 'Core_frozen'].shape[0]
        
        # Count immune samples
    num_immune_samples = transect_data[transect_data['Sample_type'] == 'Immune'].shape[0]
        
        # Append the results to summary_data
    summary_data.append({
            'Date': date,
            'num_tagged': num_tagged,
            'num_uml_samples': num_uml_samples,
            'num_tx_samples': num_frozen_samples,
            'num_immune_samples': num_immune_samples
        })

# Convert summary_data into a DataFrame
summary_df = pd.DataFrame(summary_data)

# Display the summary DataFrame
summary_df


Unnamed: 0,Date,num_tagged,num_uml_samples,num_tx_samples,num_immune_samples
0,62019,69,11,11,11
1,102019,20,11,11,11
2,52022,64,11,11,11
3,122022,75,11,11,11
4,92023,68,11,11,11
5,112023,32,11,11,11
6,122023,32,11,11,11
7,12024,55,11,11,11
8,22024,31,11,11,11
9,42024,84,11,11,11


In [373]:
# Initialize an empty list to store summary data for each date
summary_data = []

# Iterate through dates
for date in sorted_dates:
    # Filter data for the specific date
    date_data = filtered_samples[filtered_samples['Month_year'] == date]
    
    # Optionally split by transect
    transects = date_data['Transect'].unique()
    
    for transect in sorted_transects:
        transect_data = date_data[date_data['Transect'] == transect]
        
        # Count tagged corals
        num_tagged = len(transect_data['NewTagNum'].unique())
        
        # Count newly tagged corals by matching 'Tag_Date_mmyyyy' with the current date
        newly_tagged = colony_data[colony_data['Tag_Date_mmyyyy'] == date]
        num_newly_tagged = newly_tagged.shape[0]
        
        # Total tagged
        total_tagged = num_tagged + num_newly_tagged
        
        # Count UML samples
        num_uml_samples = num_core_samples = transect_data[
            (transect_data['Sample_type'] == 'Core_EtOH') | 
            (transect_data['Sample_type'] == 'Core_RNAlater')
        ].shape[0]
        
        # Count frozen samples
        num_frozen_samples = transect_data[transect_data['Sample_type'] == 'Core_frozen'].shape[0]
        
        # Count immune samples
        num_immune_samples = transect_data[transect_data['Sample_type'] == 'Immune'].shape[0]
        
        # Append the results to summary_data
        summary_data.append({
            'Date': date,
            'Transect': transect,
            'num_tagged': num_tagged,
            'num_newly_tagged': num_newly_tagged,
            'num_total_tagged': total_tagged,
            'num_uml_samples': num_uml_samples,
            'num_tx_samples': num_frozen_samples,
            'num_immune_samples': num_immune_samples
        })

# Convert summary_data into a DataFrame
summary_df = pd.DataFrame(summary_data)

# Display the summary DataFrame
summary_df


Unnamed: 0,Date,Transect,num_tagged,num_newly_tagged,num_total_tagged,num_uml_samples,num_tx_samples,num_immune_samples
0,062019,BB,0,85,85,0,0,0
1,062019,CBC30N,24,85,109,24,24,0
2,062019,CURLEW,0,85,85,0,0,0
3,062019,HANGMAN,0,85,85,0,0,0
4,062019,LAGOON,31,85,116,32,32,0
...,...,...,...,...,...,...,...,...
61,062024,CBC30N,6,1,7,7,7,5
62,062024,CURLEW,6,1,7,6,6,6
63,062024,HANGMAN,0,1,1,0,0,0
64,062024,LAGOON,12,1,13,13,13,11
