In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as graph
import seaborn as sns

import cartopy.crs as ccrs

import statsmodels

import os
from tqdm import tqdm, trange

from convertbng.util import convert_bng, convert_lonlat
import utm

In [2]:
# Species-code lookup built from the Banff data-dictionary CSV:
# code_name -> common_name (lower case, spaces replaced by underscores).
code_df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Alberta/To_look_at/Banff_NP_Freshwater_Lake_Fish_Index_2017_data_dictionary.csv')
    .rename(columns={'Data_Value_Valeur_de_la_donnée': 'code_name',
                     'Value_Description_EN_Description_de_la_valeur': 'common_name'})
    [['code_name', 'common_name']]
)
code_df = code_df.assign(
    common_name=code_df['common_name'].str.lower().str.replace(' ', '_')
)
display(code_df.sample(5))

# If a code appears more than once, the last row for that code is the one kept.
dict_code_name = code_df.set_index('code_name')['common_name'].to_dict()

Unnamed: 0,code_name,common_name
3,BLTR,bull_trout
30,GOLD,goldeye
45,RNTR,rainbow_trout
20,BKTR,brook_trout
23,BLTR,bull_trout


In [3]:
# Extra code -> common-name entries merged into the lookup; an entry here
# overrides any existing entry that has the same code.
new_codes = {
    'LNDC': 'longnose_dace',
    'BNTR': 'brown_trout',
    'RNTR1': 'rainbow_trout',
    'CTTR1': 'cutthroat_trout',
    'WHSC': 'white_sucker',
    'BLTRCTTR(N&I)': 'bull_trout_x_cutthroat_trout',
    'CTTR(hybrids)': 'cutthroat_trout',
    'CTTRhybrids': 'cutthroat_trout',
    'SUCK': 'white_sucker',
}

dict_code_name.update(new_codes)

In [4]:
# Species dictionary: common_name -> scientific_name, read from the project CSV.
species_name_df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/species_name_dictionary.csv')

dict_species_name = (
    species_name_df
    .set_index('common_name')['scientific_name']
    .to_dict()
)

In [5]:
# Additional common-name spellings to merge into the species dictionary.
new_names = {
    'nine-spine_stickleback': 'pungitius_pungitius',
}

dict_species_name.update(new_names)
# Displayed as the cell output (the full common -> scientific mapping).
dict_species_name

{'lake_sturgeon': 'acipenser_fulvescens',
 'green_sturgeon': 'acipenser_medirostris',
 'atlantic_sturgeon': 'acipenser_oxyrinchus',
 'white_sturgeon': 'acipenser_transmontanus',
 'sturgeon': 'acipenseridae_spp',
 'sturgeons': 'acipenseridae_spp',
 'chiselmouth': 'acrocheilus_alutaceus',
 'poachers': 'agonidae_spp',
 'poacher_spp': 'agonidae_spp',
 'northern_spearnose_poacher': 'agonopsis_vulsa',
 'northern_spparnose_poacher': 'agonopsis_vulsa',
 'blueback_herring': 'alosa_aestivalis',
 'skipjack_herring': 'alosa_chrysochloris',
 'alewife': 'alosa_pseudoharengus',
 'gaspereau': 'alosa_pseudoharengus',
 'alewife_(gaspereau)': 'alosa_pseudoharengus',
 'american_shad': 'alosa_sapidissima',
 'allis_shad': 'alosa_spp',
 'rock_bass': 'ambloplites_rupestris',
 'tiger_salamander': 'ambystoma_tigrinum',
 'black_bullhead': 'ameiurus_melas',
 'black_catfish': 'ameiurus_melas',
 'bullhead': 'ameiurus_melas',
 'black_bullhead_x_brown_bullhead': 'ameiurus_melas_x_ameiurus_nebulosus',
 'yellow_bullhea

In [7]:
# Rebuild the two-column species table from the (possibly extended) dictionary.
species_name_df = pd.DataFrame(
    list(dict_species_name.items()),
    columns=['common_name', 'scientific_name'],
)
# Writing the table back to disk is intentionally left disabled:
# species_name_df.to_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/species_name_dictionary.csv', index=False)

# Nova Scotia

In [5]:
# Acadia uni juvenile fish
#
# Load the occurrence records, keep ray-finned fishes (class Actinopterygii)
# and reduce to the columns used downstream. Filter, select and rename are
# done in one chain that ends in .copy(), so the column assignments below
# act on an independent frame and not on a slice of the raw table.
df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/acadia_uni_juvenile_fish.csv')
    .loc[lambda raw: raw['class'].isin(['Actinopterygii']),
         ['scientificName', 'decimalLongitude', 'decimalLatitude', 'waterBody', 'year']]
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude',
                     'waterBody': 'waterbody_name'})
    .copy()
)

# Normalise species names once: lower case, underscores instead of spaces.
df['scientificName'] = df['scientificName'].str.lower().str.replace(' ', '_', regex=False)
# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df['lat_long'] = df['latitude'].round(3).astype(str) + '_' + df['longitude'].round(3).astype(str)
df.head()

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long
0,myoxocephalus_aenaeus,-66.1594,43.8533,Northwest Atlantic,2005,43.853_-66.159
1,pseudopleuronectes_americanus,-65.9598,44.4959,Northwest Atlantic,2006,44.496_-65.96
2,pseudopleuronectes_americanus,-64.3409,44.4315,Northwest Atlantic,2006,44.432_-64.341
3,myoxocephalus_octodecemspinosus,-66.2009,44.086,Northwest Atlantic,2005,44.086_-66.201
4,myoxocephalus_aenaeus,-66.1176,44.3465,Northwest Atlantic,2005,44.346_-66.118


In [6]:
# One presence column per species. Each row is flagged 1 for its own species
# and then inherits 1 for every species recorded in the same (year, lat_long)
# group; species not recorded in that group stay NaN.
for col in df['scientificName'].dropna().unique():
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df[col] = np.where(df['scientificName'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.head())

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long,myoxocephalus_aenaeus,pseudopleuronectes_americanus,myoxocephalus_octodecemspinosus,gasterosteus_aculeatus,...,alosa_pseudoharengus,sphoeroides_spengleri,pollachius_virens,gadus_morhua,dactylopterus_volitans,anguilla_rostrata,fundulus_heteroclitus_heteroclitus,peprilus_triacanthus,caranx_hippos,pomatomus_saltatrix
0,myoxocephalus_aenaeus,-66.1594,43.8533,Northwest Atlantic,2005,43.853_-66.159,1.0,1.0,,,...,1.0,,,,,,,,,
1,pseudopleuronectes_americanus,-65.9598,44.4959,Northwest Atlantic,2006,44.496_-65.96,1.0,1.0,,1.0,...,,,,,,,,,,
2,pseudopleuronectes_americanus,-64.3409,44.4315,Northwest Atlantic,2006,44.432_-64.341,1.0,1.0,,1.0,...,,,,,,,1.0,,,
3,myoxocephalus_octodecemspinosus,-66.2009,44.086,Northwest Atlantic,2005,44.086_-66.201,,1.0,1.0,1.0,...,1.0,,,,,1.0,,,,
4,myoxocephalus_aenaeus,-66.1176,44.3465,Northwest Atlantic,2005,44.346_-66.118,1.0,1.0,,1.0,...,,,,,,,,,,


In [7]:
# Largest value in the 'year' column (presumably checked to label the output file).
df['year'].max()

2006

In [8]:
# Row count before de-duplication.
print(len(df))
# Without the per-record species column, rows that agree on every remaining
# value are exact duplicates and are removed.
df = df.drop(columns=['scientificName']).drop_duplicates()
# Row count after de-duplication.
print(len(df))

display(df.sample(5))
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/acadia_uni_juvenile_fish_occurence_2005_2006.csv',
    index=False,
)

314
36


Unnamed: 0,longitude,latitude,waterbody_name,year,lat_long,myoxocephalus_aenaeus,pseudopleuronectes_americanus,myoxocephalus_octodecemspinosus,gasterosteus_aculeatus,lotidae,...,alosa_pseudoharengus,sphoeroides_spengleri,pollachius_virens,gadus_morhua,dactylopterus_volitans,anguilla_rostrata,fundulus_heteroclitus_heteroclitus,peprilus_triacanthus,caranx_hippos,pomatomus_saltatrix
3,-66.2009,44.086,Northwest Atlantic,2005,44.086_-66.201,,1.0,1.0,1.0,1.0,...,1.0,,,,,1.0,,,,
84,-65.606,43.515,Northwest Atlantic,2005,43.515_-65.606,1.0,1.0,,1.0,1.0,...,1.0,,,,,,,,,
39,-62.8599,44.8037,Northwest Atlantic,2006,44.804_-62.86,1.0,1.0,,1.0,1.0,...,,,,,,,,,,
66,-64.1583,44.5732,Northwest Atlantic,2006,44.573_-64.158,,,,,,...,,,,,,,,,,
14,-66.1176,44.3465,Northwest Atlantic,2006,44.346_-66.118,1.0,,,1.0,,...,,,,,,,,,,


### Acer Marine Resource

In [9]:
# Acer marine resource
#
# Load the occurrence records, keep ray-finned fishes (class Actinopterygii)
# and reduce to the columns used downstream. The chain ends in .copy() so the
# assignments below act on an independent frame, not a slice of the raw table.
df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/acer_marine_resource.csv')
    .loc[lambda raw: raw['class'].isin(['Actinopterygii']),
         ['scientificName', 'decimalLongitude', 'decimalLatitude', 'waterBody', 'year']]
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude',
                     'waterBody': 'waterbody_name'})
    .copy()
)

# Normalise species names once, then collapse the subspecies label to the
# species name. The replacement is limited to the species column so no other
# column can be touched by it.
df['scientificName'] = (
    df['scientificName']
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .replace('fundulus_heteroclitus_heteroclitus', 'fundulus_heteroclitus')
)
# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df['lat_long'] = df['latitude'].round(3).astype(str) + '_' + df['longitude'].round(3).astype(str)

display(df.head())

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long
1,fundulus_heteroclitus,-64.811251,43.864672,,1987,43.865_-64.811
3,myoxocephalus_aenaeus,-64.811251,43.864672,,1987,43.865_-64.811
11,syngnathus_fuscus,-64.811251,43.864672,,1987,43.865_-64.811
12,clupea_harengus,-64.811251,43.864672,,1987,43.865_-64.811
18,pungitius_pungitius,-64.811251,43.864672,,1987,43.865_-64.811


In [10]:
# One presence column per species. Each row is flagged 1 for its own species
# and then inherits 1 for every species recorded in the same (year, lat_long)
# group; species not recorded in that group stay NaN.
for col in df['scientificName'].dropna().unique():
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df[col] = np.where(df['scientificName'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.head())

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long,fundulus_heteroclitus,myoxocephalus_aenaeus,syngnathus_fuscus,clupea_harengus,pungitius_pungitius,gasterosteus_aculeatus,gasterosteidae,urophycis_regia,gasterosteus_wheatlandi,anguilla_rostrata,pseudopleuronectes_americanus,ammodytes_americanus,pollachius_virens,apeltes_quadracus
1,fundulus_heteroclitus,-64.811251,43.864672,,1987,43.865_-64.811,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,myoxocephalus_aenaeus,-64.811251,43.864672,,1987,43.865_-64.811,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
11,syngnathus_fuscus,-64.811251,43.864672,,1987,43.865_-64.811,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
12,clupea_harengus,-64.811251,43.864672,,1987,43.865_-64.811,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
18,pungitius_pungitius,-64.811251,43.864672,,1987,43.865_-64.811,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Row count before de-duplication.
print(len(df))
# Without the per-record species column, rows that agree on every remaining
# value are exact duplicates and are removed.
df = df.drop(columns=['scientificName']).drop_duplicates()
# Row count after de-duplication.
print(len(df))

display(df.sample(5))
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/acer_marine_1987.csv',
    index=False,
)

43
6


Unnamed: 0,longitude,latitude,waterbody_name,year,lat_long,fundulus_heteroclitus,myoxocephalus_aenaeus,syngnathus_fuscus,clupea_harengus,pungitius_pungitius,gasterosteus_aculeatus,gasterosteidae,urophycis_regia,gasterosteus_wheatlandi,anguilla_rostrata,pseudopleuronectes_americanus,ammodytes_americanus,pollachius_virens,apeltes_quadracus
113,-64.820821,43.873328,,1987,43.873_-64.821,,,,,,1.0,,,,,,,,
1,-64.811251,43.864672,,1987,43.865_-64.811,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
180,-64.850342,43.822338,,1987,43.822_-64.85,,,,,1.0,,,,,,,,,
103,-64.783365,43.864615,,1987,43.865_-64.783,,,,,,1.0,,,,,,,,1.0
64,-64.836757,43.814653,,1987,43.815_-64.837,1.0,,,,1.0,,,,,,,,,1.0


### Biochem

In [12]:
# Biochem
#
# Load the occurrence records, keep ray-finned fishes (class Actinopterygii)
# and reduce to the columns used downstream. The chain ends in .copy() so the
# assignments below act on an independent frame, not a slice of the raw table.
df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/biochem.csv')
    .loc[lambda raw: raw['class'].isin(['Actinopterygii']),
         ['scientificName', 'decimalLongitude', 'decimalLatitude', 'waterBody', 'year']]
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude',
                     'waterBody': 'waterbody_name'})
    .copy()
)

# Normalise species names once: lower case, underscores instead of spaces.
df['scientificName'] = df['scientificName'].str.lower().str.replace(' ', '_', regex=False)
# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df['lat_long'] = df['latitude'].round(3).astype(str) + '_' + df['longitude'].round(3).astype(str)
df.head()

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long
0,sebastes_norvegicus,-61.825001,46.974998,,1973,46.975_-61.825
2,clupea_harengus,-59.89167,47.825001,,1972,47.825_-59.892
3,lumpenus,-61.85833,48.091671,,1972,48.092_-61.858
4,tautogolabrus_adspersus,-63.075001,46.791672,,1975,46.792_-63.075
5,aspidophoroides_monopterygius,-62.35833,46.974998,,1972,46.975_-62.358


In [13]:
# One presence column per species. Each row is flagged 1 for its own species
# and then inherits 1 for every species recorded in the same (year, lat_long)
# group; species not recorded in that group stay NaN.
for col in df['scientificName'].dropna().unique():
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df[col] = np.where(df['scientificName'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.head())

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long,sebastes_norvegicus,clupea_harengus,lumpenus,tautogolabrus_adspersus,...,stichaeidae,lophius_americanus,hippoglossus_hippoglossus,reinhardtius_hippoglossoides,teleostei,pollachius_virens,myctophidae,merluccius_bilinearis,pleuronectiformes,zoarcidae
0,sebastes_norvegicus,-61.825001,46.974998,,1973,46.975_-61.825,1.0,,,1.0,...,,,,,,,,,,
2,clupea_harengus,-59.89167,47.825001,,1972,47.825_-59.892,1.0,1.0,,,...,,,,,,,,,,
3,lumpenus,-61.85833,48.091671,,1972,48.092_-61.858,,,1.0,,...,,,,,,,,,,
4,tautogolabrus_adspersus,-63.075001,46.791672,,1975,46.792_-63.075,,,,1.0,...,,,,,,,,,,
5,aspidophoroides_monopterygius,-62.35833,46.974998,,1972,46.975_-62.358,,,1.0,,...,,,,,,,,,,


In [14]:
# Row count before de-duplication.
print(len(df))
# Without the per-record species column, rows that agree on every remaining
# value are exact duplicates and are removed.
df = df.drop(columns=['scientificName']).drop_duplicates()
# Row count after de-duplication.
print(len(df))

display(df.sample(5))
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/biochem_1972_1975_occurence.csv',
    index=False,
)

3084
438


Unnamed: 0,longitude,latitude,waterbody_name,year,lat_long,sebastes_norvegicus,clupea_harengus,lumpenus,tautogolabrus_adspersus,aspidophoroides_monopterygius,...,stichaeidae,lophius_americanus,hippoglossus_hippoglossus,reinhardtius_hippoglossoides,teleostei,pollachius_virens,myctophidae,merluccius_bilinearis,pleuronectiformes,zoarcidae
2042,-63.308331,47.441669,,1972,47.442_-63.308,,,,,,...,,,,,,,,,,
493,-62.208328,45.825001,,1974,45.825_-62.208,,,,,1.0,...,,,,,,,,,,
408,-60.89167,47.424999,,1972,47.425_-60.892,,,1.0,,,...,,,,,,,,,,
214,-61.325001,48.541672,,1974,48.542_-61.325,1.0,,,,,...,,,,,,,,,,
375,-61.441669,47.125,,1973,47.125_-61.442,,,,1.0,,...,,,,,,,,,,


### Brook Trout

In [15]:
# Brook trout
#
# Location-code lookup from the Kejimkujik data dictionary:
# 'Fishing Location' code -> location_name. Dictionary rows whose code is one
# of the listed non-location values are excluded.
non_location_values = ['na', 'FF', 'O', '0', '1', 'Y', 'N']

df_dict = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Used/Kejimkujik_NP_Freshwater_Brook_Trout_data_dictionary.csv')
    .rename(columns={'Data_Value_Valeur_de_la_donnée': 'Fishing Location',
                     'Value_Description_EN_Description_de_la_valeur': 'location_name'})
    [['Fishing Location', 'location_name']]
)
df_dict = df_dict[~df_dict['Fishing Location'].isin(non_location_values)]

df_dict.head()

Unnamed: 0,Fishing Location,location_name
0,MRW,Mersey River Watershed
1,PLW,Peskowesk Lake Watershed
2,MDB,Kejimkujik Lake - Minard's Bay
3,NWI,Kejimkujik Lake - Norway Island
4,EAR,Kejimkujik Lake - Eastern Run


In [16]:
# Brook trout records: keep watershed, location code and year, and exclude the
# same non-location codes that were removed from the dictionary.
df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/Kejimkujik_NP_Freshwater_Brook_Trout_1994-2018_data_1.csv')
df = df[['Watershed', 'Fishing Location', 'Year']]
df = (
    df[~df['Fishing Location'].isin(['na', 'FF', 'O', '0', '1', 'Y', 'N'])]
    .rename(columns={'Year': 'year'})
)

# Attach the location name; 'Fishing Location' is the only column the two
# frames share. Codes with no dictionary entry keep a missing name.
df_2 = df.merge(df_dict, how='left', on='Fishing Location')

print(len(df_2))

# Coordinates are not available in this file, so a placeholder string is
# stored; every record is a brook trout (salvelinus_fontinalis) occurrence.
df_2 = (
    df_2
    .assign(latitude='TO_GET', longitude='TO_GET', salvelinus_fontinalis=1)
    .rename(columns={'location_name': 'waterbody_name'})
)
df_2.head()

1301


Unnamed: 0,Watershed,Fishing Location,year,waterbody_name,latitude,longitude,salvelinus_fontinalis
0,MRW,EEW,1994,Mersey River - Eel Weir,TO_GET,TO_GET,1
1,MRW,EEW,1994,Mersey River - Eel Weir,TO_GET,TO_GET,1
2,MRW,EEW,1994,Mersey River - Eel Weir,TO_GET,TO_GET,1
3,MRW,HMK,1994,,TO_GET,TO_GET,1
4,MRW,EEW,1994,Mersey River - Eel Weir,TO_GET,TO_GET,1


In [17]:
# Row count before de-duplication.
print(len(df_2))
df_2 = df_2.drop_duplicates()
# Row count after de-duplication.
print(len(df_2))

display(df_2.sample(5))
# NOTE(review): the rendered sample includes year 2018 rows while the file
# name says 1994_2017 - confirm the intended year range before renaming.
df_2.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/brook_trout_NO_LAT_LONG_1994_2017_occurence.csv',
    index=False,
)

1301
123


Unnamed: 0,Watershed,Fishing Location,year,waterbody_name,latitude,longitude,salvelinus_fontinalis
1127,MRW,MAB,2018,,TO_GET,TO_GET,1
664,MRW,FLW,2011,,TO_GET,TO_GET,1
617,MRW,MAB,2011,,TO_GET,TO_GET,1
710,PLW,PAB,2016,,TO_GET,TO_GET,1
1283,MRW,FLW,2018,,TO_GET,TO_GET,1


### Canadian Field Minas Basin

In [18]:
# Canadian Field Minas Basin
#
# Load the occurrence records, keep ray-finned fishes (class Actinopterygii)
# and reduce to the columns used downstream. The chain ends in .copy() so the
# assignments below act on an independent frame, not a slice of the raw table.
df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Used/canadian_field_minas_basin.csv')
    .loc[lambda raw: raw['class'].isin(['Actinopterygii']),
         ['scientificName', 'decimalLongitude', 'decimalLatitude', 'waterBody', 'date_year']]
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude',
                     'date_year': 'year', 'waterBody': 'waterbody_name'})
    .copy()
)

# Normalise species names once: lower case, underscores instead of spaces.
df['scientificName'] = df['scientificName'].str.lower().str.replace(' ', '_', regex=False)
# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df['lat_long'] = df['latitude'].round(3).astype(str) + '_' + df['longitude'].round(3).astype(str)
df.head()

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long
0,menidia_menidia,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36
1,alosa_sapidissima,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36
2,cyclopterus_lumpus,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36
3,urophycis_tenuis,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36
4,merluccius_bilinearis,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36


In [19]:
# One presence column per species. Each row is flagged 1 for its own species
# and then inherits 1 for every species recorded in the same (year, lat_long)
# group; species not recorded in that group stay NaN.
for col in tqdm(df['scientificName'].dropna().unique()):
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df[col] = np.where(df['scientificName'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.head())

100%|██████████| 19/19 [00:00<00:00, 249.70it/s]


Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long,menidia_menidia,alosa_sapidissima,cyclopterus_lumpus,urophycis_tenuis,...,lophius_americanus,pleuronectes_putnami,syngnathus_fuscus,pseudopleuronectes_americanus,pholis_gunnellus,myoxocephalus_aenaeus,microgadus_tomcod,myoxocephalus_scorpius,enchelyopus_cimbrius,zoarces_americanus
0,menidia_menidia,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,alosa_sapidissima,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,cyclopterus_lumpus,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,urophycis_tenuis,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,merluccius_bilinearis,-64.36,45.15,"Minas Basin, Bay of Fundy",1969,45.15_-64.36,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Row count before de-duplication.
print(len(df))
# Without the per-record species column, rows that agree on every remaining
# value are exact duplicates and are removed.
df = df.drop(columns=['scientificName']).drop_duplicates()
# Row count after de-duplication.
print(len(df))

# Sampling five rows is left disabled here:
# display(df.sample(5))
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/canadian_field_minas_basin_1969_occurence.csv',
    index=False,
)

38
1


### Bay of Fundy

In [21]:
# Bay of Fundy
#
# Load the occurrence records, keep ray-finned fishes (class Actinopterygii)
# and reduce to the columns used downstream. The chain ends in .copy() so the
# assignments below act on an independent frame, not a slice of the raw table.
df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Used/bay_of_fundy.csv')
    .loc[lambda raw: raw['class'].isin(['Actinopterygii']),
         ['scientificName', 'decimalLongitude', 'decimalLatitude', 'waterBody', 'date_year']]
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude',
                     'date_year': 'year', 'waterBody': 'waterbody_name'})
    .copy()
)

# Normalise species names once: lower case, underscores instead of spaces.
df['scientificName'] = df['scientificName'].str.lower().str.replace(' ', '_', regex=False)
# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df['lat_long'] = df['latitude'].round(3).astype(str) + '_' + df['longitude'].round(3).astype(str)
# NOTE(review): every row's year (from date_year) is overwritten with 2004 -
# presumably the survey year for this data set; confirm against the source.
df['year'] = 2004

df.head()

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long
17,nezumia_aequalis,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806
29,melanogrammus_aeglefinus,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806
40,coregonus_huntsmani,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806
67,acipenser_brevirostrum,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806
73,lopholatilus_chamaeleonticeps,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806


In [22]:
# One presence column per species. Each row is flagged 1 for its own species
# and then inherits 1 for every species recorded in the same (year, lat_long)
# group; species not recorded in that group stay NaN.
for col in df['scientificName'].dropna().unique():
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df[col] = np.where(df['scientificName'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.head())

Unnamed: 0,scientificName,longitude,latitude,waterbody_name,year,lat_long,nezumia_aequalis,melanogrammus_aeglefinus,coregonus_huntsmani,acipenser_brevirostrum,...,stephanolepis_hispidus,merluccius_bilinearis,ulvaria_subbifurcata,lycenchelys_verrillii,reinhardtius_hippoglossoides,fistularia_tabacaria,lumpenus_lampretaeformis,monacanthus_ciliatus,clupea_harengus,cryptacanthodes_maculatus
17,nezumia_aequalis,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
29,melanogrammus_aeglefinus,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
40,coregonus_huntsmani,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
67,acipenser_brevirostrum,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
73,lopholatilus_chamaeleonticeps,-65.806016,44.979852,Bay of Fundy,2004,44.98_-65.806,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# Row count before de-duplication.
print(len(df))
# Without the per-record species column, rows that agree on every remaining
# value are exact duplicates and are removed.
df = df.drop(columns=['scientificName']).drop_duplicates()
# Row count after de-duplication.
print(len(df))

# Sampling five rows is left disabled here:
# display(df.sample(5))
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/bay_of_fundy_2004_occurence.csv',
    index=False,
)

101
1


### Cape Breton

In [28]:
# Cape Breton
#
# Site coordinates: convert the UTM eastings/northings to latitude/longitude
# and normalise the site code to '<LETTERS>_<two-digit number>' so it can be
# joined to the catch records.
df_location = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Used/Cape_Breton_Highlands_NP_Freshwater_Lake_Fish_Health_2008-2016_data_2.csv')
# Zone number 20 is hard-coded (the file's 'UTM Zone' column is not consulted);
# the hemisphere is given explicitly instead of through a latitude-band letter.
df_location['lat'], df_location['long'] = utm.to_latlon(
    df_location['Easting UTM'], df_location['Northing UTM'], 20, northern=True)
df_location = (
    df_location
    .drop(columns=['Waterbody Name', 'Easting UTM', 'Northing UTM', 'UTM Zone'])
    .rename(columns={'Site Identification Station Name': 'Site Identification Code'})
)

# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df_location['lat_long'] = df_location['lat'].round(3).astype(str) + '_' + df_location['long'].round(3).astype(str)

# Split the raw code into its first run of digits (zero-padded to two
# characters) and its first run of letters/spaces (trailing spaces removed).
# Raw strings keep '\d' a regex escape; expand=False returns a Series.
site_code = df_location['Site Identification Code']
site_code_number = site_code.str.extract(r'(\d+)', expand=False).str.zfill(2)
site_code_letter = site_code.str.extract(r'([a-zA-Z ]+)', expand=False).str.rstrip()

df_location['Site Identification Code'] = site_code_letter + '_' + site_code_number

df_location.head()

Unnamed: 0,Site Identification Code,lat,long,lat_long
0,BL_01,46.740792,-60.808548,46.741_-60.809
1,BL_02,46.740189,-60.809018,46.74_-60.809
2,BL_03,46.739637,-60.808844,46.74_-60.809
3,BL_04,46.739246,-60.808676,46.739_-60.809
4,BL_05,46.739002,-60.807665,46.739_-60.808


In [31]:
# Cape Breton catch records: keep the listed fish species, normalise the site
# code the same way as in df_location and attach the site coordinates.
df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/Cape_Breton_Highlands_NP_Freshwater_Lake_Fish_Health_2008-2016_data_1.csv')

# Rows with a missing species never match the list, so no separate dropna is
# needed. .copy() makes the assignments below act on an independent frame.
df = (
    df.loc[df['Species'].isin(['Anguilla rostrata', 'Salvelinus fontinalis', 'Morone americana',
                               'Alosa pseudoharengus', 'Pungitius pungitius',
                               'Gasterosteus aculeatus', 'Fundulus diaphanus']),
           ['Water Body', 'Site Identification Code', 'Year', 'Species']]
    .rename(columns={'Year': 'year'})
    .copy()
)
df['Species'] = df['Species'].str.lower().str.replace(' ', '_', regex=False)

# Rebuild the site code as '<LETTERS>_<two-digit number>'. Letters are
# upper-cased and the prefix 'BCM' is mapped to 'BCML'.
# Raw strings keep '\d' a regex escape; expand=False returns a Series.
site_code = df['Site Identification Code']
site_code_number = site_code.str.extract(r'(\d+)', expand=False).str.zfill(2)
site_code_letter = (
    site_code.str.extract(r'([a-zA-Z ]+)', expand=False)
    .str.upper()
    .str.rstrip()
    .replace('BCM', 'BCML')
)
df['Site Identification Code'] = site_code_letter + '_' + site_code_number

print(len(df))
# Left join on the site code; the two printed lengths should match if no
# catch row matched more than one location row.
df_2 = pd.merge(df, df_location, how='left', on='Site Identification Code')
df_2 = df_2.rename(columns={'lat': 'latitude', 'long': 'longitude', 'Water Body': 'waterbody_name'})
print(len(df_2))

df_2.head()

12402
12402


Unnamed: 0,waterbody_name,Site Identification Code,year,Species,latitude,longitude,lat_long
0,Benjies Lake,BL_02,2010,salvelinus_fontinalis,46.740189,-60.809018,46.74_-60.809
1,Benjies Lake,BL_05,2010,salvelinus_fontinalis,46.739002,-60.807665,46.739_-60.808
2,Benjies Lake,BL_06,2010,salvelinus_fontinalis,46.738918,-60.806556,46.739_-60.807
3,Benjies Lake,BL_07,2010,salvelinus_fontinalis,46.73962,-60.806567,46.74_-60.807
4,Benjies Lake,BL_08,2010,salvelinus_fontinalis,46.74014,-60.806951,46.74_-60.807


In [32]:
# One presence column per species. Each row is flagged 1 for its own species
# and then inherits 1 for every species recorded in the same (year, lat_long)
# group; species not recorded in that group stay NaN.
for col in df_2['Species'].dropna().unique():
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df_2[col] = np.where(df_2['Species'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df_2[col] = df_2[col].fillna(df_2.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df_2.head())

Unnamed: 0,waterbody_name,Site Identification Code,year,Species,latitude,longitude,lat_long,salvelinus_fontinalis,anguilla_rostrata,fundulus_diaphanus,gasterosteus_aculeatus,pungitius_pungitius,alosa_pseudoharengus,morone_americana
0,Benjies Lake,BL_02,2010,salvelinus_fontinalis,46.740189,-60.809018,46.74_-60.809,1.0,,,,,,
1,Benjies Lake,BL_05,2010,salvelinus_fontinalis,46.739002,-60.807665,46.739_-60.808,1.0,,,,,,
2,Benjies Lake,BL_06,2010,salvelinus_fontinalis,46.738918,-60.806556,46.739_-60.807,1.0,,,,,,
3,Benjies Lake,BL_07,2010,salvelinus_fontinalis,46.73962,-60.806567,46.74_-60.807,1.0,,,,,,
4,Benjies Lake,BL_08,2010,salvelinus_fontinalis,46.74014,-60.806951,46.74_-60.807,1.0,,,,,,


In [33]:
# Row count before de-duplication.
print(len(df_2))
# Without the per-record species and site-code columns, rows that agree on
# every remaining value are exact duplicates and are removed.
df_2 = df_2.drop(columns=['Species', 'Site Identification Code']).drop_duplicates()
# Row count after de-duplication.
print(len(df_2))

display(df_2.sample(5))
# NOTE(review): the rendered sample includes a 2009 row while the file name
# says 2010_2016 - confirm the intended year range before renaming.
df_2.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/Cape_Breton_2010_2016_occurence.csv',
    index=False,
)

12402
224


Unnamed: 0,waterbody_name,year,latitude,longitude,lat_long,salvelinus_fontinalis,anguilla_rostrata,fundulus_diaphanus,gasterosteus_aculeatus,pungitius_pungitius,alosa_pseudoharengus,morone_americana
11934,Warren Lake,2009,46.71551,-60.392362,46.716_-60.392,1.0,1.0,,,,1.0,
10600,Freshwater Lake,2013,46.647416,-60.399023,46.647_-60.399,,1.0,1.0,1.0,1.0,,
12254,Warren Lake,2014,46.710798,-60.389004,46.711_-60.389,1.0,,,,,,
35,Branch Pond,2011,46.740755,-60.456337,46.741_-60.456,1.0,,,,,,
70,French Lake,2010,46.728045,-60.863562,46.728_-60.864,,,1.0,,,,


# Nova Scotia Hatchery stocking

In [34]:
# Nova Scotia hatchery stocking records: convert UTM coordinates, derive the
# stocking year and translate the stocked species to its scientific name.
df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/Nova_Scotia_Fish_Hatchery_Stocking_Records.csv')

# Zone number 20 is hard-coded; the hemisphere is given explicitly because
# 'UTM' is not a latitude-band letter.
df['lat'], df['long'] = utm.to_latlon(df['EASTING'], df['NORTHING'], 20, northern=True)
df['year'] = pd.to_datetime(df['STOCKING DATE']).dt.year

# .copy() so the assignments below act on an independent frame.
df = df[['NAME', 'year', 'lat', 'long', 'STOCK']].copy()
# Normalise the common name, then map it through the species dictionary.
# Only the STOCK column is translated, so waterbody names cannot be rewritten
# by a dictionary key. Names missing from the dictionary are left unchanged.
df['STOCK'] = (
    df['STOCK']
    .str.lower()
    .str.rstrip()
    .str.replace(' ', '_', regex=False)
    .replace(dict_species_name)
)
# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df['lat_long'] = df['lat'].round(3).astype(str) + '_' + df['long'].round(3).astype(str)
df = df.rename(columns={'lat': 'latitude', 'long': 'longitude', 'NAME': 'waterbody_name'})

df.head()

Unnamed: 0,waterbody_name,year,latitude,longitude,STOCK,lat_long
0,ANNAPOLIS RIVER BRICKTON BRIDGE,1990,44.90639,-65.121983,salvelinus_fontinalis,44.906_-65.122
1,ANNAPOLIS RIVER BRIDGETOWN,2002,44.838587,-65.291905,salvelinus_fontinalis,44.839_-65.292
2,ANNAPOLIS RIVER BRIDGETOWN,2006,44.838587,-65.291905,salvelinus_fontinalis,44.839_-65.292
3,ANNAPOLIS RIVER BRIDGETOWN,2003,44.838587,-65.291905,salvelinus_fontinalis,44.839_-65.292
4,ANNAPOLIS RIVER BRIDGETOWN,2005,44.838587,-65.291905,salvelinus_fontinalis,44.839_-65.292


In [35]:
# One presence column per stocked species. Each row is flagged 1 for its own
# species and then inherits 1 for every species recorded in the same
# (year, lat_long) group; species not recorded in that group stay NaN.
for col in df['STOCK'].dropna().unique():
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df[col] = np.where(df['STOCK'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.sample(5))

Unnamed: 0,waterbody_name,year,latitude,longitude,STOCK,lat_long,salvelinus_fontinalis,salmo_trutta,salmo_trutta_x_salvelinus_fontinalis,oncorhynchus_mykiss,salmo_salar
11288,BUTLERS,1995,43.925298,-66.123604,salvelinus_fontinalis,43.925_-66.124,1.0,,,,
1594,COLLEGE,2014,45.230637,-62.795945,salvelinus_fontinalis,45.231_-62.796,1.0,,,,
7080,MACMULLENS,2004,45.777464,-60.315755,salvelinus_fontinalis,45.777_-60.316,1.0,,,,
9192,SHAW,2015,45.543163,-61.008911,salvelinus_fontinalis,45.543_-61.009,1.0,1.0,,,
3011,SHAW LITTLE,1993,44.954582,-63.100637,salvelinus_fontinalis,44.955_-63.101,1.0,,,,


In [36]:
# Largest value in the 'year' column (presumably checked to label the output file).
df['year'].max()

2016

In [37]:
# Row count before de-duplication.
print(len(df))
# Without the per-record species column, rows that agree on every remaining
# value are exact duplicates and are removed.
df = df.drop(columns=['STOCK']).drop_duplicates()
# Row count after de-duplication.
print(len(df))

display(df.sample(5))
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/hatchery_stocking_1976_2016_occurence.csv',
    index=False,
)

12435
8105


Unnamed: 0,waterbody_name,year,latitude,longitude,lat_long,salvelinus_fontinalis,salmo_trutta,salmo_trutta_x_salvelinus_fontinalis,oncorhynchus_mykiss,salmo_salar
10500,FOX POINT,1977,44.606851,-64.084503,44.607_-64.085,1.0,,,,
6439,FIVE RIVERS (highway),1976,43.980938,-64.766229,43.981_-64.766,1.0,,,,
5306,SAWLER,1993,44.646609,-64.068938,44.647_-64.069,1.0,,,,
9428,ARMSTRONG,1982,44.774284,-64.740427,44.774_-64.74,1.0,,,,
6963,FORREST,2006,45.487959,-60.946897,45.488_-60.947,1.0,,,,


# Nova Scotia Freshwater Species distribution

In [38]:
# Nova Scotia Freshwater Species distribution
#
# Convert UTM coordinates, derive the capture year and translate the species
# common name to its scientific name.
df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Used/Nova_Scotia_Freshwater_Fish_Species_Distribution_Records.csv')

# Zone number 20 is hard-coded; the hemisphere is given explicitly because
# 'UTM' is not a latitude-band letter.
df['lat'], df['long'] = utm.to_latlon(df['EASTING'], df['NORTHING'], 20, northern=True)
df['year'] = pd.to_datetime(df['CAPTURE DATE']).dt.year

# .copy() so the assignments below act on an independent frame.
df = df[['NAME', 'year', 'lat', 'long', 'SPECIES COMMON NAME']].copy()
# Normalise the common name, then map it through the species dictionary.
# Only the species column is translated, so waterbody names cannot be
# rewritten by a dictionary key. Names missing from the dictionary are left
# unchanged.
df['SPECIES COMMON NAME'] = (
    df['SPECIES COMMON NAME']
    .str.lower()
    .str.rstrip()
    .str.replace(' ', '_', regex=False)
    .replace(dict_species_name)
)
# Site key: latitude and longitude rounded to 3 decimals, joined by '_'.
df['lat_long'] = df['lat'].round(3).astype(str) + '_' + df['long'].round(3).astype(str)
df = df.rename(columns={'lat': 'latitude', 'long': 'longitude', 'NAME': 'waterbody_name'})

df.head()

Unnamed: 0,waterbody_name,year,latitude,longitude,SPECIES COMMON NAME,lat_long
0,ALMA,1984,44.61747,-65.113344,salvelinus_fontinalis,44.617_-65.113
1,ALMA,1984,44.61747,-65.113344,morone_americana,44.617_-65.113
2,ALMA,1984,44.61747,-65.113344,perca_flavescens,44.617_-65.113
3,ALMA,1984,44.61747,-65.113344,ameiurus_nebulosus,44.617_-65.113
4,ALMA,1984,44.61747,-65.113344,anguilla_rostrata,44.617_-65.113


In [39]:
# One presence column per species. Each row is flagged 1 for its own species
# and then inherits 1 for every species recorded in the same (year, lat_long)
# group; species not recorded in that group stay NaN.
for col in df['SPECIES COMMON NAME'].dropna().unique():
    # Match on the species column itself rather than scanning every column
    # of the frame (also avoids DataFrame.any's keyword-only `axis`).
    df[col] = np.where(df['SPECIES COMMON NAME'] == col, 1, np.nan)
    # The group mean ignores NaN, so it is 1 wherever the species occurs in the group.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.sample(5))

Unnamed: 0,waterbody_name,year,latitude,longitude,SPECIES COMMON NAME,lat_long,salvelinus_fontinalis,morone_americana,perca_flavescens,ameiurus_nebulosus,...,esox_niger,apeltes_quadracus,microgadus_tomcod,rhinichthys_atratulus,alosa_sapidissima,carassius_auratus,chrosomus_eos,semotilus_corporalis,coregonus_huntsmani,salvelinus_namaycush
3341,DAUPHINEES MILL,1986,44.661949,-64.112296,catostomus_commersonii,44.662_-64.112,1.0,,,,...,,,,,,,,,,
1779,BEAR,1963,44.944748,-62.811221,ameiurus_nebulosus,44.945_-62.811,1.0,,,1.0,...,,,,,,,,,,
3268,CARD,1990,44.739218,-64.281361,couesius_plumbeus,44.739_-64.281,1.0,,,,...,,,,,,,,,,
4053,DUNN (EAST),1964,44.265511,-65.283957,notemigonus_crysoleucas,44.266_-65.284,1.0,1.0,1.0,1.0,...,,,,,,,,,,
2680,TEN MILE,1991,45.141837,-62.702266,anguilla_rostrata,45.142_-62.702,1.0,1.0,1.0,,...,,,,,,,,,,


In [40]:
# Row count before de-duplication.
print(len(df))
# Without the per-record species column, rows that agree on every remaining
# value are exact duplicates and are removed.
df = df.drop(columns=['SPECIES COMMON NAME']).drop_duplicates()
# Row count after de-duplication.
print(len(df))

display(df.sample(5))
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/freshwater_fish_distribution_1942_2018_occurence.csv',
    index=False,
)

5110
1344


Unnamed: 0,waterbody_name,year,latitude,longitude,lat_long,salvelinus_fontinalis,morone_americana,perca_flavescens,ameiurus_nebulosus,anguilla_rostrata,...,esox_niger,apeltes_quadracus,microgadus_tomcod,rhinichthys_atratulus,alosa_sapidissima,carassius_auratus,chrosomus_eos,semotilus_corporalis,coregonus_huntsmani,salvelinus_namaycush
3401,HANNIGAR,1985,44.628883,-64.333367,44.629_-64.333,1.0,,1.0,,,...,,,,,,,,,,
3689,SEVEN MILE,1984,44.444324,-64.759264,44.444_-64.759,,1.0,1.0,1.0,,...,,,,,,,,,,
5012,RODNEYS,1981,43.914103,-66.1379,43.914_-66.138,,1.0,,1.0,,...,,,,,,,,,,
1918,DOLLAR,1975,44.915278,-63.323914,44.915_-63.324,1.0,,,,1.0,...,,,,,,,,,,
3860,FISH,1995,45.357277,-62.681438,45.357_-62.681,1.0,,,1.0,,...,,,,,,,,,,


# Salmonid Cape Breton

In [41]:
# Site coordinates for the Cape Breton salmonid survey: read the UTM start
# point of every site and convert it to latitude/longitude.
df_location = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/Cape_Breton_Highlands_NP_Freshwater_Salmonid_Distribution_2012-2016_data_2.csv')
df_location = (
    df_location[['River Name', 'Site Identification Name', 'Year', 'UTM at start', 'UTM at end']]
    .dropna(subset=['UTM at start'])
    .copy()  # work on an independent frame, not a view of the raw table
)

# 'UTM at start' holds "<easting><sep><northing>" where the separator is either
# a space or a newline; splitting on any whitespace handles both layouts.
utm_tokens = df_location['UTM at start'].str.split()
df_location['easting'] = utm_tokens.str.get(0).astype(int)
df_location['northing'] = utm_tokens.str.get(1).astype(int)

# Zone 20, latitude band T (40-48 degrees N). Any band letter from N upward
# selects the northern hemisphere, which is all the conversion needs.
df_location['lat'], df_location['long'] = utm.to_latlon(
    df_location['easting'], df_location['northing'], 20, 'T'
)
df_location['location'] = df_location['River Name'] + '_' + df_location['Site Identification Name']

# Keep one row per distinct (location, lat, long) combination.
# The 'site_description' label is attached to the merged frame (df_2) once the
# species records have been joined, so it is not set here.
df_location = df_location[['location', 'lat', 'long']].drop_duplicates()

df_location

Unnamed: 0,location,lat,long
0,Cheticamp_Robert's Brook,46.644882,-60.947702
1,Cheticamp_Fairbault Brook,46.630563,-60.924754
2,Cheticamp_behind warden station,46.64558,-60.952524
3,Clyburn_by bridge,46.660404,-60.409785
4,Clyburn_CLY04,46.656369,-60.458574
5,Clyburn_CLY05,46.65445,-60.415938
6,Clyburn_CLY04,46.656553,-60.458749
7,Clyburn_Slatey,46.660337,-60.49691
8,Dundas_Dundas Brook,46.706748,-60.421906
9,Neil Brook_1,46.811357,-60.333841


In [43]:
# Species records for the Cape Breton salmonid survey, joined to the site
# coordinates held in df_location.
df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/To_look_at/Cape_Breton_Highlands_NP_Freshwater_Salmonid_Distribution_2012-2016_data_1.csv')

df['location'] = df['River Name'] + '_' + df['Site Identification Name']
df = df[['location', 'Year', 'Species']].copy()

# Normalise the species labels, then translate common names to scientific
# names. Each mapping is applied only to the column it is meant for.
df['Species'] = df['Species'].str.lower().str.replace(' ', '_')
df['Species'] = df['Species'].replace(dict_species_name)
# Presumably these align the site labels with the spelling used in the
# coordinates table -- TODO confirm against df_location.
df['location'] = df['location'].replace({'Clyburn_CLY5': 'Clyburn_CLY05', 'Clyburn_CLY4': 'Clyburn_CLY04',
                                         'Warren_Warren Brook': 'Warren Brook_Warren Brook'})

df = df.dropna()
# 'fish' is a generic label, not a species.
df = df[~df['Species'].isin(['fish'])]
df = df.drop_duplicates()

# Join on the site label explicitly instead of relying on whichever column
# names the two frames happen to share.
# NOTE(review): df_location lists 'Clyburn_CLY04' twice with slightly different
# coordinates, so records for that site are duplicated by this join.
df_2 = pd.merge(df, df_location, how='left', on='location')
df_2 = df_2.dropna(subset=['lat'])  # sites without coordinates cannot be placed

df_2['lat_long'] = df_2['lat'].round(3).astype(str) + '_' + df_2['long'].round(3).astype(str)
df_2 = df_2.rename(columns={'location': 'waterbody_name', 'lat': 'latitude', 'long': 'longitude', 'Year': 'year'})
df_2['site_description'] = 'river'
df_2.head()

Unnamed: 0,waterbody_name,year,Species,latitude,longitude,lat_long,site_description
0,Neil Brook_1,2012,anguilla_rostrata,46.811357,-60.333841,46.811_-60.334,river
1,Neil Brook_1,2012,salvelinus_fontinalis,46.811357,-60.333841,46.811_-60.334,river
2,Clyburn_by bridge,2012,anguilla_rostrata,46.660404,-60.409785,46.66_-60.41,river
3,Clyburn_by bridge,2012,salmo_salar,46.660404,-60.409785,46.66_-60.41,river
4,Cheticamp_behind warden station,2013,anguilla_rostrata,46.64558,-60.952524,46.646_-60.953,river


In [44]:
# Build one presence column per species: 1.0 where the species was recorded at
# that year/location, NaN otherwise.
species = df_2['Species']
site_year = [df_2['year'], df_2['lat_long']]

# Missing species labels are skipped so they cannot produce a column named NaN.
for col in species.dropna().unique():
    # Compare against the species column only. Scanning the whole frame with
    # df_2.isin() re-reads every flag column already added, and the positional
    # axis form any(1) is rejected by recent pandas releases.
    own = pd.Series(np.where(species == col, 1, np.nan), index=df_2.index)
    # The group mean of a 1/NaN column is 1.0 as soon as the group holds one
    # hit, so this spreads the flag to every record of the same year/location.
    df_2[col] = own.fillna(own.groupby(site_year).transform('mean'))

display(df_2.sample(5))

Unnamed: 0,waterbody_name,year,Species,latitude,longitude,lat_long,site_description,anguilla_rostrata,salvelinus_fontinalis,salmo_salar,salmo_trutta,gasterosteus_aculeatus,pungitius_pungitius
18,Cheticamp_behind warden station,2015,salmo_salar,46.64558,-60.952524,46.646_-60.953,river,,,1.0,1.0,,
28,Neil Brook_1,2016,anguilla_rostrata,46.811357,-60.333841,46.811_-60.334,river,1.0,1.0,,,,
21,Clyburn_CLY04,2015,salmo_salar,46.656553,-60.458749,46.657_-60.459,river,1.0,,1.0,,,
32,North Aspy_NA1,2016,salmo_salar,46.813747,-60.627412,46.814_-60.627,river,1.0,,1.0,,1.0,
40,Dundas_Dundas Brook,2016,salmo_salar,46.706748,-60.421906,46.707_-60.422,river,,1.0,1.0,,,


In [45]:
# Drop the per-record species label, remove the rows that are now identical,
# and report the row count before and after.
rows_before = len(df_2)
print(rows_before)

df_2.drop(columns='Species', inplace=True)
df_2.drop_duplicates(inplace=True)

rows_after = len(df_2)
print(rows_after)

display(df_2.sample(5))

# Write the occurrence table without the pandas index.
output_path = '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Nova_Scotia/Processed/salmonid_cape_breton_2012_2016_occurence.csv'
df_2.to_csv(output_path, index=False)

43
22


Unnamed: 0,waterbody_name,year,latitude,longitude,lat_long,site_description,anguilla_rostrata,salvelinus_fontinalis,salmo_salar,salmo_trutta,gasterosteus_aculeatus,pungitius_pungitius
4,Cheticamp_behind warden station,2013,46.64558,-60.952524,46.646_-60.953,river,1.0,,1.0,,,
39,Dundas_Dundas Brook,2016,46.706748,-60.421906,46.707_-60.422,river,,1.0,1.0,,,
18,Cheticamp_behind warden station,2015,46.64558,-60.952524,46.646_-60.953,river,,,1.0,1.0,,
35,Cheticamp_Fairbault Brook,2016,46.630563,-60.924754,46.631_-60.925,river,,,1.0,,,
24,Clyburn_CLY05,2015,46.65445,-60.415938,46.654_-60.416,river,1.0,1.0,1.0,,,


# acer temporal patterns


In [46]:
# Load the acer temporal-patterns records and keep only the rows whose class
# is Actinopterygii.
df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/To_look_at/acer_temporal_patterns.csv')

df = df[['scientificName', 'year', 'decimalLongitude', 'decimalLatitude', 'waterBody', 'class']]
# Chaining returns a fresh frame, so the column assignments below do not act on
# a slice of the raw table.
df = (
    df[df['class'].isin(['Actinopterygii'])]
    .drop(columns=['class'])
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude',
                     'waterBody': 'waterbody_name'})
)

# Normalise names to lower_snake_case and build the year/location grouping key.
df['scientificName'] = df['scientificName'].str.lower().str.replace(' ', '_')
df['lat_long'] = df['latitude'].round(3).astype(str) + '_' + df['longitude'].round(3).astype(str)

df.head()

Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long
0,microgadus_tomcod,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106
1,alosa_sapidissima,2013,-64.174962,45.182148,Minas Basin>Bay of Fundy,45.182_-64.175
2,scomber_scombrus,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106
3,alosa,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106
4,alosa_sapidissima,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106


In [47]:
# Build one presence column per species: 1.0 where the species was recorded at
# that year/location, NaN otherwise.
species = df['scientificName']
site_year = [df['year'], df['lat_long']]

# Missing species labels are skipped so they cannot produce a column named NaN.
for col in species.dropna().unique():
    # Compare against the species column only. Scanning the whole frame with
    # df.isin() gets slower with every flag column added, and the positional
    # axis form any(1) is rejected by recent pandas releases.
    own = pd.Series(np.where(species == col, 1, np.nan), index=df.index)
    # The group mean of a 1/NaN column is 1.0 as soon as the group holds one
    # hit, so this spreads the flag to every record of the same year/location.
    df[col] = own.fillna(own.groupby(site_year).transform('mean'))

display(df.head())

Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long,microgadus_tomcod,alosa_sapidissima,scomber_scombrus,alosa,...,morone_americana,alosa_pseudoharengus,salvelinus_fontinalis,pleuronectes_putnami,lophius_americanus,anguilla_rostrata,urophycis_chuss,syngnathus_fuscus,tautogolabrus_adspersus,salmo_trutta
0,microgadus_tomcod,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
1,alosa_sapidissima,2013,-64.174962,45.182148,Minas Basin>Bay of Fundy,45.182_-64.175,1.0,1.0,1.0,1.0,...,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,
2,scomber_scombrus,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
3,alosa,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
4,alosa_sapidissima,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0


In [48]:
# Drop the per-record species label, remove the rows that are now identical,
# and report the row count before and after.
rows_before = len(df)
print(rows_before)

df.drop(columns='scientificName', inplace=True)
df.drop_duplicates(inplace=True)

rows_after = len(df)
print(rows_after)

display(df.head())

# This export keeps the pandas index as the first CSV column.
output_path = '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Processed/acer_temporal_2013_occurence.csv'
df.to_csv(output_path, index=True)

398
2


Unnamed: 0,year,longitude,latitude,waterbody_name,lat_long,microgadus_tomcod,alosa_sapidissima,scomber_scombrus,alosa,osmerus_mordax_mordax,...,morone_americana,alosa_pseudoharengus,salvelinus_fontinalis,pleuronectes_putnami,lophius_americanus,anguilla_rostrata,urophycis_chuss,syngnathus_fuscus,tautogolabrus_adspersus,salmo_trutta
0,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
1,2013,-64.174962,45.182148,Minas Basin>Bay of Fundy,45.182_-64.175,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,


# Atlantic Museum

In [49]:
# Atlantic museum

# Read only the columns used below. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
museum_cols = ['scientificName', 'year', 'decimalLongitude', 'decimalLatitude', 'class']
df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/To_look_at/atlantic_museum.csv',
                 usecols=museum_cols, low_memory=False)[museum_cols]

# Keep only the rows whose class is Actinopterygii. Chaining returns a fresh
# frame, so the column assignments below do not act on a slice of the raw table.
df = (
    df[df['class'].isin(['Actinopterygii'])]
    .drop(columns=['class'])
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude'})
)

# Normalise names to lower_snake_case and build the year/location grouping key.
df['scientificName'] = df['scientificName'].str.lower().str.replace(' ', '_')
df['lat_long'] = df['latitude'].round(3).astype(str) + '_' + df['longitude'].round(3).astype(str)

print(len(df))
# Records identified only to class level carry no species information.
df = df[~df['scientificName'].isin(['actinopterygii'])]
df = df.drop_duplicates()
print(len(df))

df.head()

  interactivity=interactivity, compiler=compiler, result=result)


106331
76788


Unnamed: 0,scientificName,year,longitude,latitude,lat_long
1,teleostei,1987.0,-66.4186,43.9175,43.918_-66.419
2,lycodes,1960.0,-55.0333,54.866667,54.867_-55.033
3,hippoglossoides_platessoides,1985.0,-65.8283,42.751945,42.752_-65.828
4,scomber_scombrus,1983.0,-66.68,42.496944,42.497_-66.68
5,clupea_harengus,1978.0,-66.2333,43.638332,43.638_-66.233


In [None]:
%%time
print(df['year'].min(), df['year'].max())
for col in tqdm(df['scientificName'].unique()):
    df[col] = np.where(df.isin([col]).any(1), 1, np.nan)
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))
    
display(df.head())

  0%|          | 2/748 [00:00<01:01, 12.22it/s]

1929.0 2015.0


 10%|█         | 78/748 [00:44<13:30,  1.21s/it]

In [None]:
# Drop the per-record species label, remove the rows that are now identical,
# and report the row count before and after.
rows_before = len(df)
print(rows_before)

df.drop(columns='scientificName', inplace=True)
df.drop_duplicates(inplace=True)

rows_after = len(df)
print(rows_after)

display(df.head())
# df.to_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Processed/atlantic_museum_1929_2015_occurence.csv', index=True)