In [None]:
# Execute the companion dev notebook in this kernel. None of the helpers called
# below (get_data, validate_*, check_*, ...) nor `ncbi` are defined in this
# notebook, so they are presumably provided by this run — confirm.
%run validate_partner_manifest_dev.ipynb

In [None]:
# Path of the partner manifest workbook handled by this notebook; `fn` is reused
# further down when the contributors sheet is validated.
fn = '../results/20241029_bge/BGKU_2024_BIOSCAN_Manifest_V2.0_am60.xlsx'
# Read the sample metadata from the 'TAB 2 Metadata Entry' sheet via get_data.
df = get_data(fn, sheet='TAB 2 Metadata Entry')

In [None]:
# Pass the frame through fix_date_formats and keep the returned frame
# (presumably normalises the date columns — confirm in the helper).
df = fix_date_formats(df)

In [None]:
# Manifest version inferred from the data; `v` is passed as bioscan_version to
# check_columns and validate_plates_wells further down.
v = infer_bioscan_version(df)

In [None]:
# validate_series is defined outside this notebook; its return value replaces
# `df` from here on (presumably checks the SERIES column — confirm).
df = validate_series(df)

In [None]:
# Whitespace clean-up with the two helpers; each returns the frame that replaces `df`.
df = remove_nonbreaking_spaces(df)
# title='sample' is presumably only a label used in the helper's report — confirm.
df = remove_trailing_spaces(df, title='sample')

In [None]:
# Reference template workbook (V3, dated 20240301 per its filename). The same
# metadata sheet name is read from it as from the partner manifest.
template_fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx'
template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')

In [None]:
# Check the manifest frame against the template frame for the inferred manifest version.
check_columns(df, template_df, bioscan_version=v)

In [None]:
# Build valid_dict from the template's data-validation sheet; it is passed to
# every validate_values call below. The sheet name is exactly 31 characters and
# ends in 'Data Valida' — presumably truncated by Excel's sheet-name length
# limit, so keep it as written.
valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')

In [None]:
# Empty CATCH_LOT cells are filled with the 'NOT_APPLICABLE' placeholder, then the
# column is regex-checked over all rows with no extra NA values allowed.
df['CATCH_LOT'] = df['CATCH_LOT'].replace({'': 'NOT_APPLICABLE'})
validate_regex('CATCH_LOT', df, na_values=[])

In [None]:
# Validate the contributors sheet of the same workbook; the returned contrib_df
# is used later by validate_plates_wells and validate_identifier.
contrib_sheet='TAB 1 Contributors'
contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)

In [None]:
def _strip_well_zero_padding(well_id):
    """Return the well ID with zero padding removed from its number ('A01' -> 'A1').

    The ID is read as one leading character followed by an integer. Values that
    do not fit that shape (empty strings, missing values, free text) are
    returned unchanged instead of raising, so a single malformed cell does not
    abort the notebook here.
    """
    try:
        return well_id[0] + str(int(well_id[1:]))
    except (IndexError, TypeError, ValueError):
        # NOTE(review): malformed IDs are passed through untouched on the
        # assumption that validate_plates_wells (next cell) reports them — confirm.
        return well_id


df['TUBE_OR_WELL_ID'] = df['TUBE_OR_WELL_ID'].apply(_strip_well_zero_padding)
# Display how many rows carry each well ID after normalisation.
df['TUBE_OR_WELL_ID'].value_counts()

In [None]:
# Validate plate and well identifiers against the contributors table. The helper
# returns the (possibly updated) frame plus two further values, bound here to
# `gal` and `partner_code`.
df, gal, partner_code = validate_plates_wells(
        df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)

In [None]:
# Fill empty ORGANISM_PART cells with the 'NOT_APPLICABLE' placeholder.
# The result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: that form is chained assignment, which
# warns in recent pandas and leaves `df` unchanged under Copy-on-Write.
df['ORGANISM_PART'] = df['ORGANISM_PART'].replace('', 'NOT_APPLICABLE')

In [None]:
# check_blanks returns the frame and `is_blank`, which is used below as a row
# mask (`df[~is_blank]`) to restrict most validations to non-blank samples.
df, is_blank = check_blanks(df, bioscan=True)

In [None]:
# For non-blank rows, copy ORGANISM_PART (as it stands before the normalisation
# in the following cell) into OTHER_INFORMATION; the assignment is index-aligned.
# NOTE(review): this overwrites whatever OTHER_INFORMATION already held for those
# rows, and any 'NOT_APPLICABLE' placeholder is copied as well — confirm intended.
df.loc[~is_blank, 'OTHER_INFORMATION'] = df['ORGANISM_PART']

In [None]:
# Normalise ORGANISM_PART: upper-case it, turn ',' separators into '|', then map
# the partner's wording onto the vocabulary terms used in this manifest.
# The mapped result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: that form is chained assignment, which
# warns in recent pandas and leaves `df` unchanged under Copy-on-Write.
# NOTE(review): the last entry keeps the space after '|' ('LEG| ...'); confirm
# that validate_values(..., sep='|') tolerates padded parts.
df['ORGANISM_PART'] = (
    df['ORGANISM_PART']
    .str.upper()
    .str.replace(',', '|', regex=False)
    .replace({
        'ENTIRE INDIVIDUAL':'WHOLE_ORGANISM',
        'ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',
        'IMAGO LEG':'LEG',
        'MARGINAL PIECE':'**OTHER_SOMATIC_ANIMAL_TISSUE**',
        'ORGANISM':'WHOLE_ORGANISM',
        'EXUVIUM':'**OTHER_SOMATIC_ANIMAL_TISSUE**',
        'IMAGO ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',
        'IMAGO':'WHOLE_ORGANISM',
        'BODY PART':'**OTHER_SOMATIC_ANIMAL_TISSUE**',
        'PUPA':'WHOLE_ORGANISM',
        'PART PUPA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',
        'LEG| ANTENA':'LEG| **OTHER_SOMATIC_ANIMAL_TISSUE**'
    })
)

In [None]:
# Check ORGANISM_PART against valid_dict for all rows; sep='|' matches the
# separator introduced when ',' was replaced above (presumably each part is
# checked individually — confirm in validate_values).
validate_values('ORGANISM_PART', df, valid_dict, sep='|')

In [None]:
# Check PRESERVATIVE_SOLUTION against valid_dict for all rows, blanks included.
validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)

In [None]:
# BOTTLE_DIRECTION is only checked for non-blank rows whose COLLECTION_METHOD is
# exactly 'MALAISE_TRAP'; all other rows are excluded from this check.
validate_values('BOTTLE_DIRECTION', 
                    df[~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP')], # allow for blank in non-Malaise trap samples
                    valid_dict)

In [None]:
# Regex-check DATE_OF_COLLECTION on non-blank rows; 'NOT_COLLECTED' is passed as
# an NA value (presumably accepted in place of a date — confirm).
validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])

In [None]:
# Run the catch-lot/date check on non-blank rows (presumably verifies that
# CATCH_LOT and collection dates are consistent — confirm in the helper).
check_catch_lot_dates(df[~is_blank])

In [None]:
# Drop trailing commas from DECIMAL_LATITUDE before it is regex-checked.
# Assumes the column holds strings; non-string cells become NaN through `.str`.
df['DECIMAL_LATITUDE'] = df['DECIMAL_LATITUDE'].str.rstrip(',')

In [None]:
# Regex-check DECIMAL_LATITUDE on non-blank rows; no NA values are passed.
validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])

In [None]:
# Drop trailing commas from DECIMAL_LONGITUDE before it is regex-checked.
# Assumes the column holds strings; non-string cells become NaN through `.str`.
df['DECIMAL_LONGITUDE'] = df['DECIMAL_LONGITUDE'].str.rstrip(',')

In [None]:
# Regex-check DECIMAL_LONGITUDE on non-blank rows; no NA values are passed.
validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])

In [None]:
# Regex-check WHAT_3_WORDS on non-blank rows; the empty string is passed as an NA value.
validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])

In [None]:
# Regex-check TIME_OF_COLLECTION on non-blank rows; 'NOT_COLLECTED' and the
# empty string are passed as NA values.
validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])

In [None]:
# Regex-check DURATION_OF_COLLECTION on non-blank rows; 'NOT_COLLECTED' and the
# empty string are passed as NA values.
validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])

In [None]:
# Check COLLECTION_METHOD against valid_dict on non-blank rows; the empty string
# is passed as an NA value.
validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])

In [None]:
# Regex-check DATE_OF_PLATING on non-blank rows; 'NOT_COLLECTED' and the empty
# string are passed as NA values.
validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])

In [None]:
# Compare the two date columns on non-blank rows (presumably checks that
# collection does not come after plating — confirm in compare_dates_text).
compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])

In [None]:
# Manual clean-up of the predicted taxonomy columns (exact whole-cell matches).
# Each result is assigned back to its column rather than using
# `df[col].replace(..., inplace=True)`: that form is chained assignment, which
# warns in recent pandas and leaves `df` unchanged under Copy-on-Write.
# NOTE(review): the first two order mappings look identical on both sides as
# rendered; the keys may contain look-alike or invisible characters — confirm,
# or drop them if they are genuine no-ops.
df['PREDICTED_ORDER_OR_GROUP'] = df['PREDICTED_ORDER_OR_GROUP'].replace({
    'Neotaenioglossa':'Neotaenioglossa',
    'Basommatophora':'Basommatophora',
    'Odonáta':'Odonata',
    'Oligochaeta gen. sp.':'Oligochaeta'
})
# Placeholder family/genus/species entries are blanked; one stray ')' is removed.
df['PREDICTED_FAMILY'] = df['PREDICTED_FAMILY'].replace({
    'none':'',
    'fam.':'',
    'Glossiphoniidae)':'Glossiphoniidae'
})
df['PREDICTED_GENUS'] = df['PREDICTED_GENUS'].replace({
    'none':'',
    'gen.':'',
    'genus':''
})
df['PREDICTED_SCIENTIFIC_NAME'] = df['PREDICTED_SCIENTIFIC_NAME'].replace({
    'sp':'',
    'sp.':'',
})

In [None]:
# Turn 'sp ' into 'sp_' (e.g. 'sp 1' -> 'sp_1'). This is a substring replacement
# applied to every occurrence, so it also hits any word that ends in 'sp'
# followed by a space.
df['PREDICTED_SCIENTIFIC_NAME'] = df['PREDICTED_SCIENTIFIC_NAME'].str.replace('sp ','sp_')

In [None]:
# Non-empty scientific names without a space (e.g. 'sp_1' or a bare epithet)
# get the predicted genus prepended as 'Genus name'.
# Rows with an empty PREDICTED_GENUS are skipped: concatenating '' + ' ' + name
# would only put a leading space in front of the name.
df.loc[
    (df['PREDICTED_SCIENTIFIC_NAME'] != '')
    & ~df['PREDICTED_SCIENTIFIC_NAME'].str.contains(' ')
    & (df['PREDICTED_GENUS'] != ''),
    'PREDICTED_SCIENTIFIC_NAME'
] = df['PREDICTED_GENUS'] + ' ' + df['PREDICTED_SCIENTIFIC_NAME']


In [None]:
# Validate the taxonomy columns; `ncbi` is not defined in this notebook and is
# presumably an NCBI taxonomy handle created by the %run notebook — confirm.
# The returned frame replaces `df`.
df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])

In [None]:
# Upper-case SEX before it is checked against valid_dict in the next cell.
df['SEX'] = df['SEX'].str.upper()

In [None]:
# Controlled-vocabulary columns: checked on non-blank rows only, with the empty
# string passed as an NA value. The call order matches the report order.
validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict, na_values=[''])
validate_specimen_id_risk(df)
for _column in (
    'LIFESTAGE',
    'SEX',
    'SORTING_SOLUTION_USED',
    'CATCH_BOTTLE_TEMPERATURE_STORAGE',
    'PLATE_TEMPERATURE_STORAGE',
):
    validate_values(_column, df[~is_blank], valid_dict, na_values=[''])

# white cols - validated for all samples
for _column in (
    'MORPHOSPECIES_DESCRIPTION',
    'DESCRIPTION_OF_COLLECTION_METHOD',
    'HABITAT',
    'PRESERVATION_APPROACH',
    # TODO check if STS will need something here
    'COLLECTOR_SAMPLE_ID',
    'VOUCHER_ID',
):
    validate_freetext(_column, df)
validate_regex('ELEVATION', df, na_values=[''])
validate_freetext('OTHER_INFORMATION', df)
# validate_freetext('MISC_METADATA', df)


In [None]:
# Check IDENTIFIED_BY for all rows using the contributors table; the empty string
# is passed as an NA value (presumably names must appear in contrib_df — confirm).
validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])

In [None]:
# Write the patched frame to a new workbook next to the input manifest; the row
# index is not written. Only this single frame is saved, as one sheet.
df.to_excel('../results/20241029_bge/BGKU_2024_patched.xlsx', index=False)