# Fixing Spreadsheet Issues 
(spelling errors, simple mistakes, syntax issues)

In [1]:
import numpy as np
import pandas as pd
import os

## Sample Data

In [3]:
sample_data=pd.read_csv('../sctld/SCTLD_samples/Sample_Data/CBC_samples.csv')

In [4]:
sample_data

Unnamed: 0,Month_year,Country,Location,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Time_sampled,Time_processed,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Sample_physical_location,Extraction_physical_location,Date_sequenced,Notes
0,122022,BEL,CBC,12/5/22,CURLEW,4,,77,OFAV,,,Core_frozen,18,Healthy,,122022_BEL_CBC_T4_18_OFAV,,,,
1,92023,BEL,CBC,9/27/23,CBC30N,1,,1,SSID,,,Core_RNAlater,185,Diseased_Margin,only margin sample available,092023_BEL_CBC_T1_185_SSID,UML_NARWHAL_R1_B10,,,
2,92023,BEL,CBC,9/25/23,CBC30N,1,,2,PAST,,,Core_RNAlater,171,Healthy,CLP 90%,092023_BEL_CBC_T1_171_PAST,UML_NARWHAL_R1_B10,UML_NARWHAL_R2_B12,,
3,92023,BEL,CBC,9/25/23,CBC30N,1,,3,SSID,,,Core_RNAlater,173,Healthy,CLP 80%; DC 20%,092023_BEL_CBC_T1_173_SSID,UML_NARWHAL_R1_B10,UML_NARWHAL_R2_B12,,
4,92023,BEL,CBC,9/25/23,CBC30N,1,,12,PSTR,,,Core_RNAlater,177,Healthy,No CL,092023_BEL_CBC_T1_177_PSTR,UML_NARWHAL_R1_B10,UML_NARWHAL_R2_B12,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,82024,BEL,CBC,8/24/24,LAGOON,3,,33,SSID,10:58,11:34,Immune,1574,healthy,discoloration- some black & purple spots,,,,,
2200,82024,BEL,CBC,8/24/24,LAGOON,3,,34,PAST,11:02,11:35,Core_RNAlater,1564,healthy,,082024_BEL_CBC_T3_1564_PAST,UML_NARWHAL_R5_B27,UML_NARWHAL_R2_B15,,
2201,82024,BEL,CBC,8/24/24,LAGOON,3,,34,PAST,11:02,11:35,Core_frozen,1588,healthy,,,,,,
2202,82024,BEL,CBC,8/24/24,LAGOON,3,,34,PAST,11:02,11:35,Immune,1575,healthy,,,,,,


In [None]:
# fix inconsistent entries 
- transect 
- species
- health status 
- country 

In [6]:
sample_data["Transect"].unique()

array(['CURLEW', 'CBC30N', 'SR30N', 'Lagoon', 'Curlew', 'CBC Lagoon',
       'Lagoon ', 'BB', 'Hangman ', 'LAGOON'], dtype=object)

In [13]:
lagoon=(sample_data["Transect"]=="Lagoon") | (sample_data["Transect"]=="Lagoon ") | (sample_data["Transect"]=="CBC Lagoon") | (sample_data["Transect"]=="LAGOON")
sample_data.loc[lagoon,'Transect'] = 'Lagoon'

In [14]:
sample_data.loc[sample_data["Transect"]=="CURLEW",'Transect'] = 'Curlew'

In [15]:
sample_data.loc[sample_data["Transect"]=="Hangman ",'Transect'] = 'Hangman'

In [18]:
# make all species names uniform 
species_list=sample_data['Species'].unique()
print(species_list)

['OFAV' 'SSID' 'PAST' 'PSTR' 'MCAV' 'OANN' 'DLAB' 'CNAT' 'DL' 'OFAV/OANN'
 'OANN/OFAV?' 'MMEA' 'PAST ' 'Unknown' 'OFAV ']


In [19]:
# Change OFAV/OANN variations to ORBI
sample_data.loc[:,'Species']=sample_data['Species'].str.replace('OANN/OFAV?',"ORBI")
sample_data.loc[:,'Species']=sample_data['Species'].str.replace('OFAV/OANN',"ORBI")
# Fix space in PAST
sample_data.loc[:,'Species']=sample_data['Species'].str.replace('PAST ',"PAST")
sample_data['Species'].unique()

array(['OFAV', 'SSID', 'PAST', 'PSTR', 'MCAV', 'OANN', 'DLAB', 'CNAT',
       'DL', 'ORBI', 'MMEA', 'Unknown', 'OFAV '], dtype=object)

In [20]:
DL=sample_data['Species']=='DL'
sample_data[DL]

Unnamed: 0,Month_year,Country,Location,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Time_sampled,Time_processed,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Sample_physical_location,Extraction_physical_location,Date_sequenced,Notes
136,122022,BEL,CBC,12/2/22,CBC30N,1,,35,DL,,,Core_frozen,108,Healthy,,122022_BEL_CBC_T1_108_DL,,,,
312,122022,BEL,CBC,12/2/22,CBC30N,1,,35,DL,,,Core_EtOH,154,Healthy,,122022_BEL_CBC_T1_154_DL,UML_NARWHAL_R1_B4,,,


In [21]:
# Replace values using regular expressions
sample_data['Species'] = sample_data['Species'].replace('\\bDL\\b', 'DLAB', regex=True)

In [22]:
# Check that changes worked
species_list=sample_data['Species'].unique()
print(species_list)

['OFAV' 'SSID' 'PAST' 'PSTR' 'MCAV' 'OANN' 'DLAB' 'CNAT' 'ORBI' 'MMEA'
 'Unknown' 'OFAV ']


In [25]:
print(sample_data['Health_status'].unique())
# remove "missing" or non sampled corals from sample sheet 

# fix caps issues 

['Healthy' 'Diseased_Margin' 'Diseased_Tissue' 'Unknown' 'Diseased_tissue'
 'Diseased_margin' 'healthy' 'unknown']


In [42]:
sample_data.loc[sample_data['Health_status'] == 'healthy', 'Health_status'] = 'Healthy'
sample_data.loc[sample_data['Health_status'] == 'Diseased_tissue', 'Health_status'] = 'Diseased_Tissue'
sample_data.loc[sample_data['Health_status'] == 'Diseased_margin', 'Health_status'] = 'Diseased_Margin'

In [45]:
sample_data.loc[sample_data['Health_status'] == 'unknown', 'Health_status'] = 'Unknown'

In [44]:
sample_data.loc[:,'Country'].unique()

array(['BEL'], dtype=object)

### incorporating bleached/paled tissue/margin into conditions 
-idk if i want to commit to these right now, so skipping for now
- and some of these have already been fixed but you can see the notes below

In [45]:
# colony 3 SSID - can't find pic from 01/2024, but notes and pics show almost complete paling -> bleach in dates surrounding, calling paled_margin
tag_3=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="CBC30N") & (sample_data["NewTagNum"]=="3")
sample_data.loc[tag_3,"Health_status"] = "PALED_MARGIN"

In [47]:
# rest are healthy: 24, 25, 12, 21 (for now)
cbc_healthy=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="CBC30N")
sample_data[cbc_healthy]
sample_data.loc[cbc_healthy,"Health_status"] = "HEALTHY"

In [133]:
# check next transect - SR30N
sample_data[(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="SR30N")].sort_values(by="NewTagNum")

Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year
1536,2024-01-01,BEL,1/12/24,SR30N,2.0,,51,SSID,Immune,584,,CLB 30%,,,,Jan,2024
1534,2024-01-01,BEL,1/12/24,SR30N,2.0,,51,SSID,Core_frozen,606,,CLB 30%,,,,Jan,2024
1535,2024-01-01,BEL,1/12/24,SR30N,2.0,,51,SSID,Core_RNAlater,605,,CLB 30%,012024_BEL_CBC_T2_605_SSID,,,Jan,2024
1533,2024-01-01,BEL,1/12/24,SR30N,2.0,,55,MCAV,Immune,583,,Healthy,,,,Jan,2024
1532,2024-01-01,BEL,1/12/24,SR30N,2.0,,55,MCAV,Core_RNAlater,603,,Healthy,012024_BEL_CBC_T2_603_MCAV,,,Jan,2024
1531,2024-01-01,BEL,1/12/24,SR30N,2.0,,55,MCAV,Core_frozen,604,,Healthy,,,,Jan,2024
1524,2024-01-01,BEL,1/12/24,SR30N,2.0,,57,PAST,Immune,580,,Healthy,,,,Jan,2024
1523,2024-01-01,BEL,1/12/24,SR30N,2.0,,57,PAST,Core_RNAlater,597,,Healthy,012024_BEL_CBC_T2_597_PAST,,,Jan,2024
1522,2024-01-01,BEL,1/12/24,SR30N,2.0,,57,PAST,Core_frozen,598,,Healthy,,,,Jan,2024
1527,2024-01-01,BEL,1/12/24,SR30N,2.0,,60,MCAV,Immune,581,,Healthy,,,,Jan,2024


In [107]:
# go through pics, colonydata, samplingnotes to determine health status of each colony's samples

# change tag 51 to bleached margin
tag_51=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="SR30N") & (sample_data["NewTagNum"]=="51")
sample_data.loc[tag_51,"Health_status"]="BLEACHED_MARGIN"

In [None]:
# 55 - labelled as healthy but looks paled to me

In [108]:
# 72 bleached margin 
tag_72=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="SR30N") & (sample_data["NewTagNum"]=="72")
sample_data.loc[tag_72,"Health_status"]="BLEACHED_MARGIN"

In [111]:
# 73 bleached tissue for all but immune (bleached margin)
tag_73=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="SR30N") & (sample_data["NewTagNum"]=="73") & (sample_data["Sample_type"]!="Immune")
sample_data.loc[tag_73,"Health_status"]="BLEACHED_TISSUE"
tag_73_immune=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="SR30N") & (sample_data["NewTagNum"]=="73") & (sample_data["Sample_type"]=="Immune")
sample_data.loc[tag_73_immune,"Health_status"]="BLEACHED_MARGIN"

In [134]:
# change statuses to healthy ...51, 72, and 73 colonies labelled as clb or clp
# rest are healthy: 57, 60, 63, 68, 69, 76, 79
sr_conditions=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="SR30N")
sample_data.loc[sr_conditions,"Health_status"]="Healthy"

In [138]:
# check next transect - Lagoon
sample_data[(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="Lagoon ")].sort_values(by="NewTagNum")
# need to get rid of space in Lagoon also 

Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year
1421,2024-01-01,BEL,1/10/24,Lagoon,3.0,,14,MCAV,Core_frozen,643,,,,,,Jan,2024
1422,2024-01-01,BEL,1/10/24,Lagoon,3.0,,14,MCAV,Core_RNAlater,640,,,012024_BEL_CBC_T3_640_MCAV,,,Jan,2024
1423,2024-01-01,BEL,1/10/24,Lagoon,3.0,,14,MCAV,Immune,622,,,,,,Jan,2024
1426,2024-01-01,BEL,1/10/24,Lagoon,3.0,,20,PSTR,Immune,623,,,,,,Jan,2024
1425,2024-01-01,BEL,1/10/24,Lagoon,3.0,,20,PSTR,Core_RNAlater,642,,,012024_BEL_CBC_T3_642_PSTR,,,Jan,2024
1424,2024-01-01,BEL,1/10/24,Lagoon,3.0,,20,PSTR,Core_frozen,645,,,,,,Jan,2024
1412,2024-01-01,BEL,1/10/24,Lagoon,3.0,,38,OFAV,Core_frozen,636,,,,,,Jan,2024
1413,2024-01-01,BEL,1/10/24,Lagoon,3.0,,38,OFAV,Core_RNAlater,635,,,012024_BEL_CBC_T3_635_OFAV,,,Jan,2024
1414,2024-01-01,BEL,1/10/24,Lagoon,3.0,,38,OFAV,Immune,619,,,,,,Jan,2024
1415,2024-01-01,BEL,1/10/24,Lagoon,3.0,,5,SSID,Core_frozen,638,,,,,,Jan,2024


In [139]:
# all are healthy
lagoon_conditions=(sample_data['Health_status'].isna()) & (sample_data["Transect"]=="Lagoon ")
sample_data.loc[lagoon_conditions,"Health_status"]="Healthy"

In [140]:
# confirm there's no more NAs left
sample_data[sample_data['Health_status'].isna()]

Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year


###### Investigate "missing" samples

In [146]:
sample_data[sample_data.loc[:,'Health_status'].str.contains("miss",case=False)]
# samples were not taken at that time?

Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year
565,2022-05-01,BEL,5/22/22,SR30N,2.0,337,53,MCAV,,,MISS_SAMPLE,,052022_BEL_CBC_T2__MCAV,,,May,2022
741,2022-05-01,BEL,5/20/22,Lagoon,3.0,21,7,PAST,,,MiSSING,Missed sample,052022_BEL_CBC_T3__PAST,,,May,2022
742,2022-05-01,BEL,5/20/22,Lagoon,3.0,358,13,PAST,,,MISSING,,052022_BEL_CBC_T3__PAST,,,May,2022


In [149]:
sample_data[(sample_data["NewTagNum"]=="53") & (sample_data["Transect"]=="SR30N")]
# sample was just not taken at that time point, deleting entry
sample_data.drop(index=565,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data.drop(index=565,inplace=True)


In [152]:
sample_data[(sample_data["NewTagNum"]=="7") & (sample_data["Transect"]=="Lagoon")]
# sample was just not taken at that time point, deleting entry
sample_data.drop(index=741,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data.drop(index=741,inplace=True)


In [154]:
sample_data[(sample_data["NewTagNum"]=="13") & (sample_data["Transect"]=="Lagoon")]
# sample was just not taken at that time point, deleting entry
sample_data.drop(index=742,inplace=True)
# I don't think this coral was ever visited again...

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data.drop(index=742,inplace=True)


In [19]:
sample_data[sample_data['Health_status']=="HealthyNot_Sampled"]

Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year
475,2022-12-01,BEL,12/2/22,CBC30N,1.0,,5,SSID,Core_frozen,,HealthyNot_Sampled,,122022_BEL_CBC_T1__SSID,,,Dec,2022


In [158]:
sample_data[(sample_data["NewTagNum"]=="5") & (sample_data["Transect"]=="CBC30N")]
# sample was just not taken at that time point, deleting entry
sample_data.drop(index=475,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data.drop(index=475,inplace=True)


###### Health status as just "disease" - change to diseased_tissue or diseased_margin

In [415]:
sample_data[sample_data['Health_status']=="Disease"]
# tag 19 - probably Disease_margin since there was so little healthy tissue left 


Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year
1649,2024-04-01,Belize,4/25/24,CBC30N,1.0,,3,SSID,Immune,1067,Disease,"40% CTL, 10% DC",,,,Apr,2024
1716,2024-04-01,Belize,4/25/24,CBC Lagoon,3.0,,19,SSID,Core_frozen,967,Disease,"95% old mort, 95% DC, webbing",,,,Apr,2024
1717,2024-04-01,Belize,4/25/24,CBC Lagoon,3.0,,19,SSID,Core_RNAlater,1011,Disease,"95% old mort, 95% DC, webbing",,,,Apr,2024


In [161]:
# tag 19 - Disease_margin since there was so little healthy tissue left (looked at pics)
sample_data.loc[1716:1717,"Health_status"]="Diseased_Tissue"

In [28]:
conditions=(sample_data['NewTagNum']=="3") & (sample_data['Year']=="2024") & (sample_data['Transect']=="CBC30N")
sample_data[conditions]
# For immune sample, should be coded as diseased_tissue

Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year
1382,2024-01-01,BEL,1/10/24,CBC30N,1.0,,3,SSID,Core_frozen,558,,,,,,Jan,2024
1383,2024-01-01,BEL,1/10/24,CBC30N,1.0,,3,SSID,Core_RNAlater,557,,,012024_BEL_CBC_T1_557_SSID,,,Jan,2024
1384,2024-01-01,BEL,1/10/24,CBC30N,1.0,,3,SSID,Immune,551,,,,,,Jan,2024
1539,2024-02-01,Belize,2/23/24,CBC30N,1.0,,3,SSID,Core_frozen,763,Healthy,98% CLB,,,,Feb,2024
1540,2024-02-01,Belize,2/23/24,CBC30N,1.0,,3,SSID,Core_RNAlater,773,Healthy,98% CLB,,,,Feb,2024
1541,2024-02-01,Belize,2/23/24,CBC30N,1.0,,3,SSID,Immune,767,Healthy,98% CLB; Taken from edge of colony,,,,Feb,2024
1645,2024-04-01,Belize,4/25/24,CBC30N,1.0,,3,SSID,Core_frozen,886,Diseased_tissue,"40% CTL, 10% DC",,,,Apr,2024
1646,2024-04-01,Belize,4/25/24,CBC30N,1.0,,3,SSID,Core_RNAlater,932,Diseased_tissue,"40% CTL, 10% DC",,,,Apr,2024
1647,2024-04-01,Belize,4/25/24,CBC30N,1.0,,3,SSID,Core_frozen,887,Diseased_margin,"40% CTL, 10% DC",,,,Apr,2024
1648,2024-04-01,Belize,4/25/24,CBC30N,1.0,,3,SSID,Core_RNAlater,932,Diseased_margin,"40% CTL, 10% DC",,,,Apr,2024


In [143]:
# change immune sample, colony 3 health status to Diseased_Tissue
sample_data.loc[1649,"Health_status"]="Diseased_Tissue"

###### Dont know what to do with this right now

In [17]:
sample_data[sample_data['Health_status']=='CLP 100%']

Unnamed: 0,Month_year,Country,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Sample_type,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Date_sequenced,Notes,Month,Year
298,2023-09-01,BEL,9/27/23,Lagoon,3.0,,70,PSTR,Immune,269,CLP 100%,Healthy,09_2023_BEL-CBC_T3-269,,Crumbly sample,Sep,2023


###### Fixing capitalization issues - changing to snake case

In [178]:
sample_data['Health_status'] = sample_data['Health_status'].str.replace(r'(?i)Diseased_margin', 'diseased_margin', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data['Health_status'] = sample_data['Health_status'].str.replace(r'(?i)Diseased_margin', 'diseased_margin', regex=True)


In [180]:
sample_data['Health_status'] = sample_data['Health_status'].str.replace(r'(?i)Diseased_tissue', 'diseased_tissue', regex=True)
sample_data['Health_status'] = sample_data['Health_status'].str.replace('Healthy', 'healthy')
sample_data['Health_status'] = sample_data['Health_status'].str.replace(r'(?i)HealthyTissue', 'healthy', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data['Health_status'] = sample_data['Health_status'].str.replace(r'(?i)Diseased_tissue', 'diseased_tissue', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data['Health_status'] = sample_data['Health_status'].str.replace('Healthy', 'healthy')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [181]:
sample_data['Health_status'].unique() 

array(['healthy', 'diseased_margin', 'diseased_tissue', 'CLP 100%'],
      dtype=object)

In [385]:
# why are there NAs in sampleNum?? if the sample wasn't taken why was it added?
sample_data[sample_data['SampleNum'].isna()]

Unnamed: 0,Month_year,Country,Location,CollectionDate,Transect,TransectNum,OldTagNum,NewTagNum,Species,Time_sampled,...,SampleNum,Health_status,Sampling_notes,Tubelabel_species,Sample_physical_location,Extraction_physical_location,Date_sequenced,Notes,Month,Year
475,2022-12-01,BEL,CBC,12/2/22,CBC30N,1.0,,5,SSID,,...,,HealthyNot_Sampled,,122022_BEL_CBC_T1__SSID,,,,,Dec,2022
565,2022-05-01,BEL,CBC,5/22/22,SR30N,2.0,337,53,MCAV,,...,,MISS_SAMPLE,,052022_BEL_CBC_T2__MCAV,,,,,May,2022
566,2022-05-01,BEL,CBC,5/22/22,SR30N,2.0,347,347,PAST,,...,,,,052022_BEL_CBC_T2__PAST,,,,,May,2022
741,2022-05-01,BEL,CBC,5/20/22,Lagoon,3.0,21,7,PAST,,...,,MiSSING,Missed sample,052022_BEL_CBC_T3__PAST,,,,,May,2022
742,2022-05-01,BEL,CBC,5/20/22,Lagoon,3.0,358,13,PAST,,...,,MISSING,,052022_BEL_CBC_T3__PAST,,,,,May,2022
812,2019-10-01,BEL,CBC,10/15/19,CBC30N,1.0,407,4,PSTR,,...,,Healthy,,102019_BEL_CBC_T1__PSTR,SERC,,,,Oct,2019
813,2019-10-01,BEL,CBC,10/15/19,CBC30N,1.0,413,6,PSTR,,...,,Healthy,mucus sample taken adjacent to punch,102019_BEL_CBC_T1__PSTR,SERC,,,,Oct,2019
814,2019-10-01,BEL,CBC,10/15/19,CBC30N,1.0,418,12,PSTR,,...,,Healthy,mucus sample taken adjacent to punch,102019_BEL_CBC_T1__PSTR,SERC,,,,Oct,2019
815,2019-10-01,BEL,CBC,10/15/19,CBC30N,1.0,417,417,PSTR,,...,,Healthy,mucus sample taken adjacent to punch,102019_BEL_CBC_T1__PSTR,SERC,,,,Oct,2019
816,2019-10-01,BEL,CBC,10/15/19,CBC30N,1.0,404,404,PSTR,,...,,Healthy,mucus sample taken adjacent to punch,102019_BEL_CBC_T1__PSTR,SERC,,,,Oct,2019


In [None]:
sample_data.dropna(subset=["SampleNum"], inplace=True)

In [None]:
sample_data[sample_data['NewTagNum'].isna()]
# removing 'not sampled' entry 

In [None]:
# removing 'not sampled' entry 
delete=sample_data[sample_data["Sampling_notes"]=="NOT SAMPLED"].index
sample_data.drop(index=delete,inplace=True)
#sample_data.drop(sample_data["Sampling_notes"]=="NOT SAMPLED")

In [333]:
# make all "disease tissue" entries uniform
#disease_status=sample_data['Health_status'].str.contains("disease",case=False)
#sample_data[disease_status]

In [334]:
diseased=sample_data['Health_status'].str.contains("disease",case=False)
print(diseased.sum())
#sample_data[diseased]

409


In [341]:
# investigate weird ones
#entries_to_ignore= "

In [342]:
# load colony data to figure out health statuses of these samples 

Unnamed: 0,Date_InitialTag,Transect,TransectNum,OldTagNum,NewTagNum,Species,Meter,Meters_90,Direction,Size_Class,...,112023_Percentage,122023_Condition,122023_Percentage,012024_Condition,012024_Percentage,022024_Condition,022024_Percentage,042024_Condition,042024_Percentage,COLONIES_TO_LOOK_AT
0,6/21/19,CBC30N,1,349,1,SSID,1.5,0.5,right,5.0,...,,Diseased,,Diseased,,Diseased,,,,
1,6/21/19,CBC30N,1,334,2,PAST,1.4,0.1,left,4.0,...,100%,CLB,100%,Dead,,TL,90%,,,
2,6/21/19,CBC30N,1,346,3,SSID,6.3,0.8,right,5.0,...,"5%,95%",CLP,80%,CLP,,CLB,98%,,,
3,10/15/19,CBC30N,1,407,4,PSTR,6.5,0.5,right,3.0,...,,Dead,,Dead,,Dead,,,,
4,6/24/19,CBC30N,1,394,5,SSID,4.3,1.2,left,5.0,...,,Diseased,,Diseased,,Diseased,,,,


In [344]:
# determining "DL" species 
tag_35=colony_data['NewTagNum']=='35'
colony_data[tag_35]
# think its just supposed to be dlab 

Unnamed: 0,Date_InitialTag,Transect,TransectNum,OldTagNum,NewTagNum,Species,Meter,Meters_90,Direction,Size_Class,...,112023_Percentage,122023_Condition,122023_Percentage,012024_Condition,012024_Percentage,022024_Condition,022024_Percentage,042024_Condition,042024_Percentage,COLONIES_TO_LOOK_AT
25,5/25/22,CBC30N,1,35.0,35,DLAB,14.5,0.8,left,,...,,Healthy,,Healthy,,Healthy,,,,
115,6/23/19,Lagoon,3,29.0,35,MMEA,38.5,3.0,right,2.0,...,,Dead,,Dead,,Dead,,,,
182,1/13/24,BB,5,,35,OFAV,2.3,4.5,R,,...,,,,CLP,20%,CLP,,,,
200,4/29/24,HANGMAN,6,,35,OFAV,30.1,1.4,R,,...,,,,,,,,,,
