## Add a column of families / subfamiles to our species occurrences 
- import checklists of insects from various families / subfamilies. These were acquired through the Catalogue of Life / GBIF.org
- merge checklists, keeping the key to which family / subfamily they belong
- add row to our sound files dataset (MLNS_Insects.csv) for these families
- clean up nans



### VVV import checklists of insects from various families / subfamilies and merge

In [223]:
import numpy as nm
import pandas as pd
import matplotlib.pyplot as plt
import os

## Load sound files dataset
chirps = pd.read_csv('MLNS_Insects_with_genus.csv')

## Load Checklists

dataframes = {}
for file in os.listdir('/Species_Checklists'):

    # Extract the file name without the extension
    file_name = file.split('.')[0]  
    
    # Read the file into a pandas DataFrame
    df = pd.read_csv(file, delimiter='\t')
    
    # Store the DataFrame in the dictionary with the file name as the key
    dataframes[file_name] = df

for key, df in dataframes.items():
    df['fam_or_subfam'] = key

all_checklists = pd.concat(dataframes.values(), ignore_index=True)


FileNotFoundError: [Errno 2] No such file or directory: 'MLNS_Insects_with_genus.csv'

### VVV Make the scientific name column the same for both datasets and merge

In [None]:
all_checklists['scientific_name'] = all_checklists['col:scientificName']

##merge the fam_or_subfam column with the insect audio files datset

merged_df = pd.merge(chirps, all_checklists[['scientific_name', 'fam_or_subfam']].drop_duplicates(), on='scientific_name', how='left')



### VVV Clear up nans (rows without an assigned family or subfamily)

In [None]:
## Clear up nans

# I forgot to import one subfamily, Hapithinae
hapithinae = ['Orocharis saltator', 'Orocharis', 'Hapithus agitator', 'Hapithus melodius', 
                    'Orocharis gryllodes', 'Orocharis vaginalis', 'Hapithus vagus', 
                    'Orocharis luteolira', 'Orocharis diplastes', 'Orocharis nigrifrons', 
                    'Orocharis tricornis', 'Hapithus protos', 'Hapithus','Laurepa valida',
                    'Laurepa', 'Laurepa kropion', 'Antillicharis facetus']
merged_df.loc[merged_df['scientific_name'].isin(hapithinae), 'fam_or_subfam'] = 'Hapithinae'

# I didn't import a grasshopper checklist, but their weren't that many
acrididae = ['Froggattina australis', 'Dissosteira carolina', 'Romalea microptera', 'Acrididae']
merged_df.loc[merged_df['scientific_name'].isin(acrididae), 'fam_or_subfam'] = 'Acrididae'

# this is a new family...very few instances
phalangopsidae = ['Amphiacusta annulipes', 'Amphiacusta']
merged_df.loc[merged_df['scientific_name'].isin(phalangopsidae), 'fam_or_subfam'] = 'Phalangopsidae'

# unnamed gryllacrididae (leaf rolling / rasping crickets) 
merged_df.loc[merged_df['scientific_name']=='Gryllacrididae', 'fam_or_subfam'] = 'Gryllacrididae'

# place these orphans back in enopterinae. Not sure why they weren't in their rightful spot before. found their rightful spot by referencing the Orthopterist Society Table of Sp
enopterinae = ['Diatrypa sibilans', 'Paroecanthus', 'Paroecanthus hwinanus']
merged_df.loc[merged_df['scientific_name'].isin(enopterinae), 'fam_or_subfam'] = 'Eneopterinae'

# placing a couple species in this subfamily. Very few
listroscelidinae = ['Neobarrettia spinosa', 'Neobarrettia victoriae']
merged_df.loc[merged_df['scientific_name'].isin(listroscelidinae), 'fam_or_subfam'] = 'Listroscelidinae'

# get rid of Bufonidae (frogs), unspecified Gryllidae (family of crickets which is HUGE and diverse), unspecified Tettigoniidae (family of katydids, which is huge and diverse) and unspecified superfamilies
exclude = ['Gryllidae', 'Tettigoniidae', 'Bufonidae', 'Tettigonioidea','Grylloidea']
merged_df = merged_df.loc[merged_df['scientific_name'].isin(exclude)==False]



### VVV Check for nans

In [192]:
merged_df[merged_df.fam_or_subfam.isna()]

Unnamed: 0,cat_num,format,common_name,scientific_name,background_species,recordist,date,year,month,day,...,ebird_species_code,taxon_category,taxonomic_sort,recordist_2,average_community_rating,number_of_ratings,asset_tags,original_image_height,original_image_width,fam_or_subfam
46,49780.0,Audio,True Crickets,Gryllidae,,Kevin J. Colver,7/11/90,1990.0,7.0,11.0,...,t-10520072,,,,5.0,4,,,,
48,54804.0,Audio,Jumping Bush Cricket,Hapithus saltator,Pterophylla camellifolia -- Common True Katydid,Randolph Little,9/4/09,2009.0,9.0,4.0,...,t-11850192,species,,,3.0,4,,,,
50,54806.0,Audio,Jumping Bush Cricket,Hapithus saltator,Oecanthus exclamationis -- Davis's Tree Cricket,Randolph Little,9/5/09,2009.0,9.0,5.0,...,t-11850192,species,,,2.0,4,,,,
68,57147.0,Audio,True Crickets,Gryllidae,Chordeiles minor -- Common Nighthawk; Vireo ol...,William W. H. Gunn,7/21/67,1967.0,7.0,21.0,...,t-10520072,,,,3.0,4,,,,
69,57148.0,Audio,True Crickets,Gryllidae,,William W. H. Gunn,3/16/82,1982.0,3.0,16.0,...,t-10520072,,,,2.0,4,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13204,305932.0,Audio,True Crickets,Gryllidae,,Brad Walker,11/7/20,2020.0,11.0,7.0,...,t-10520072,,,,2.0,4,,,,
13338,307147.0,Audio,Jumping Bush Cricket,Hapithus saltator,,Brad Walker,8/25/21,2021.0,8.0,25.0,...,t-11850192,species,,,3.0,4,,,,
13349,307159.0,Audio,Jumping Bush Cricket,Hapithus saltator,,Brad Walker,9/6/21,2021.0,9.0,6.0,...,t-11850192,species,,,2.0,4,,,,
13352,307162.0,Audio,Jumping Bush Cricket,Hapithus saltator,,Brad Walker,9/10/21,2021.0,9.0,10.0,...,t-11850192,species,,,3.0,4,,,,


### VVV Write the CSV

In [221]:
# write csv
merged_df.to_csv('../../MLNS_Insects_Fams.csv', index=False)