In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as graph
import seaborn as sns

import cartopy.crs as ccrs

import statsmodels

import os
from tqdm import tqdm, trange

from convertbng.util import convert_bng, convert_lonlat
import utm

In [4]:
# Species-code -> common-name lookup, read from the Banff data dictionary CSV.

code_df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Alberta/To_look_at/Banff_NP_Freshwater_Lake_Fish_Index_2017_data_dictionary.csv')
    # .iloc[:-6]  # optional trim of the last six rows, currently disabled
    .rename(columns={'Data_Value_Valeur_de_la_donnée': 'code_name',
                     'Value_Description_EN_Description_de_la_valeur': 'common_name'})
    .loc[:, ['code_name', 'common_name']]
    # Normalise common names to lower snake_case, e.g. "Brook Stickleback" -> "brook_stickleback".
    .assign(common_name=lambda frame: frame['common_name'].str.lower().str.replace(' ', '_'))
)
display(code_df.sample(5))

# Plain dict keyed by species code; the value is the normalised common name.
dict_code_name = code_df.set_index('code_name')['common_name'].to_dict()

Unnamed: 0,code_name,common_name
21,BRMN,brassy_minnow
29,FTMN,fathead_minnow
22,BRST,brook_stickleback
1,ATLS,atlantic_salmon
13,RNTR,rainbow_trout


In [5]:
# Additional code -> common-name pairs merged into the lookup built from the data dictionary.
# Entries here overwrite any existing entry with the same code.
new_codes = {
    'LNDC': 'longnose_dace',
    'BNTR': 'brown_trout',
    'RNTR1': 'rainbow_trout',
    'CTTR1': 'cutthroat_trout',
    'WHSC': 'white_sucker',
    'BLTRCTTR(N&I)': 'bull_trout_x_cutthroat_trout',
    'CTTR(hybrids)': 'cutthroat_trout',
    'CTTRhybrids': 'cutthroat_trout',
    'SUCK': 'white_sucker',
}

dict_code_name.update(new_codes)

In [6]:
# Species dictionary: common name -> scientific name, read from the project-wide CSV.

species_name_df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/species_name_dictionary.csv')

# Index by common name and take the scientific-name column as a plain dict.
dict_species_name = species_name_df.set_index('common_name')['scientific_name'].to_dict()

In [5]:
# Extra common-name -> scientific-name pairs merged into the species dictionary.
new_names = {
    'nine-spine_stickleback': 'pungitius_pungitius',
}

dict_species_name.update(new_names)

# Trailing expression: the notebook renders the full, updated dictionary.
dict_species_name

{'lake_sturgeon': 'acipenser_fulvescens',
 'green_sturgeon': 'acipenser_medirostris',
 'atlantic_sturgeon': 'acipenser_oxyrinchus',
 'white_sturgeon': 'acipenser_transmontanus',
 'sturgeon': 'acipenseridae_spp',
 'sturgeons': 'acipenseridae_spp',
 'chiselmouth': 'acrocheilus_alutaceus',
 'poachers': 'agonidae_spp',
 'poacher_spp': 'agonidae_spp',
 'northern_spearnose_poacher': 'agonopsis_vulsa',
 'northern_spparnose_poacher': 'agonopsis_vulsa',
 'blueback_herring': 'alosa_aestivalis',
 'skipjack_herring': 'alosa_chrysochloris',
 'alewife': 'alosa_pseudoharengus',
 'gaspereau': 'alosa_pseudoharengus',
 'alewife_(gaspereau)': 'alosa_pseudoharengus',
 'american_shad': 'alosa_sapidissima',
 'allis_shad': 'alosa_spp',
 'rock_bass': 'ambloplites_rupestris',
 'tiger_salamander': 'ambystoma_tigrinum',
 'black_bullhead': 'ameiurus_melas',
 'black_catfish': 'ameiurus_melas',
 'bullhead': 'ameiurus_melas',
 'black_bullhead_x_brown_bullhead': 'ameiurus_melas_x_ameiurus_nebulosus',
 'yellow_bullhea

In [6]:
# Rebuild the two-column species table from the (possibly extended) dictionary,
# one row per (common_name, scientific_name) pair in dictionary order.
species_name_df = pd.DataFrame(
    list(dict_species_name.items()),
    columns=['common_name', 'scientific_name'],
)
# Uncomment to write the extended dictionary back to disk:
# species_name_df.to_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/species_name_dictionary.csv', index=False)

# Compiling occurrence data

### Individual Species

In [7]:
path = '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Individual_species/'
indiv_df = []

# Read every file in the folder, keep ray-finned fish records only, and
# harmonise column names before stacking everything into one frame.
for file in os.listdir(path):
    raw = pd.read_csv(path + file)
    # Echo the first scientific name in the file as a progress marker.
    print(raw['scientificName'].iloc[0])

    is_fish = raw['class'].eq('Actinopterygii')
    records = (
        raw.loc[is_fish, ['scientificName', 'year', 'decimalLongitude', 'decimalLatitude', 'waterBody']]
        .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude', 'date_year': 'year',
                         'waterBody': 'waterbody_name'})
        .assign(
            # Species names become lower snake_case so they can serve as column names later.
            scientificName=lambda frame: frame['scientificName'].str.lower().str.replace(' ', '_'),
            # Site key: latitude and longitude rounded to 3 decimals, joined as "lat_long".
            lat_long=lambda frame: frame['latitude'].round(3).astype(str) + '_'
                                   + frame['longitude'].round(3).astype(str),
        )
    )

    indiv_df.append(records)

df = pd.concat(indiv_df)

print(len(df))
# Drop class-level records, exact duplicate rows, and rows without a year.
df = (
    df.loc[df['scientificName'].ne('actinopterygii')]
    .drop_duplicates()
    .dropna(subset=['year'])
)
print(len(df))

df.head()

Gadus morhua
Lepomis gibbosus
Moxostoma macrolepidotum


  interactivity=interactivity, compiler=compiler, result=result)


Morone americana
Lepisosteus osseus
Ameiurus nebulosus
Carassius auratus
Gadus morhua


  interactivity=interactivity, compiler=compiler, result=result)


Tagged
Xiphias gladius
Anguilla rostrata
Cyprinus carpio
261783
9524


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long
0,gadus_morhua,2003.0,-67.34,62.85,Ogac Lake,62.85_-67.34
1,gadus_morhua,2004.0,-67.34,62.85,Ogac Lake,62.85_-67.34
2,gadus_morhua,2003.0,-67.47,65.54,Tariujuarusiq Lake,65.54_-67.47
13,gadus_morhua,2003.0,-68.2,65.78,Qasigialiminiq Lake,65.78_-68.2
17,gadus_morhua,2005.0,-67.34,62.85,Ogac Lake,62.85_-67.34


In [8]:
# Add one presence column per species: 1.0 when the species was recorded at the
# row's (year, lat_long) site-year, NaN otherwise.
#
# The species is matched against the 'scientificName' column only. The previous
# `df.isin([col]).any(1)` scanned every column of a frame that grows on each
# iteration, and passed `axis` positionally, which pandas 2.0 rejects.
# Missing species names are skipped so no NaN-named column is created.
species = df['scientificName']
for col in species.dropna().unique():
    # Mark the rows that are themselves a record of this species.
    df[col] = np.where(species.eq(col), 1.0, np.nan)
    # Spread the flag to every other row of the same site-year: the group mean
    # is 1.0 if any row in the group is flagged and NaN if none is.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.head())

Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long,gadus_morhua,lepomis_gibbosus,moxostoma_macrolepidotum,morone_americana,lepisosteus_osseus,ameiurus_nebulosus,carassius_auratus,xiphias_gladius,cyprinus_carpio
0,gadus_morhua,2003.0,-67.34,62.85,Ogac Lake,62.85_-67.34,1.0,,,,,,,,
1,gadus_morhua,2004.0,-67.34,62.85,Ogac Lake,62.85_-67.34,1.0,,,,,,,,
2,gadus_morhua,2003.0,-67.47,65.54,Tariujuarusiq Lake,65.54_-67.47,1.0,,,,,,,,
13,gadus_morhua,2003.0,-68.2,65.78,Qasigialiminiq Lake,65.78_-68.2,1.0,,,,,,,,
17,gadus_morhua,2005.0,-67.34,62.85,Ogac Lake,62.85_-67.34,1.0,,,,,,,,


In [9]:
# Collapse to one row per distinct record once the per-row species label is
# removed, printing the row count before and after.
print(len(df))
df = df.drop(columns=['scientificName']).drop_duplicates()
print(len(df))

display(df.sample(5))
# The original row labels are written out as the first CSV column (index=True).
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Processed/indiv_occurence.csv',
    index=True,
)

9524
8979


Unnamed: 0,year,longitude,latitude,waterbody_name,lat_long,gadus_morhua,lepomis_gibbosus,moxostoma_macrolepidotum,morone_americana,lepisosteus_osseus,ameiurus_nebulosus,carassius_auratus,xiphias_gladius,cyprinus_carpio
155764,1964.0,-50.5,45.25,Atlantic Ocean,45.25_-50.5,1.0,,,,,,,,
34707,1980.0,-51.5,49.75,Atlantic Ocean,49.75_-51.5,1.0,,,,,,,,
4635,1967.0,13.78,45.64861,,45.649_13.78,,,,,,,,,1.0
5477,1956.0,-52.5,47.75,Atlantic Ocean,47.75_-52.5,1.0,,,,,,,,
26714,1982.0,-50.5,52.25,Atlantic Ocean,52.25_-50.5,1.0,,,,,,,,


### Acer Fish

In [10]:
# Acer temporal-patterns dataset: keep ray-finned fish records and harmonise the columns.

df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/To_look_at/acer_temporal_patterns.csv')

df = (
    df.loc[df['class'].eq('Actinopterygii'),
           ['scientificName', 'year', 'decimalLongitude', 'decimalLatitude', 'waterBody']]
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude', 'date_year': 'year',
                     'waterBody': 'waterbody_name'})
    .assign(
        # Lower snake_case species names (the normalisation is idempotent, so one pass suffices).
        scientificName=lambda frame: frame['scientificName'].str.lower().str.replace(' ', '_'),
        # Site key: latitude and longitude rounded to 3 decimals, joined as "lat_long".
        lat_long=lambda frame: frame['latitude'].round(3).astype(str) + '_'
                               + frame['longitude'].round(3).astype(str),
    )
)
# df['waterBody'] = 'Bay_of_Fundy'

df.head()

Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long
0,microgadus_tomcod,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106
1,alosa_sapidissima,2013,-64.174962,45.182148,Minas Basin>Bay of Fundy,45.182_-64.175
2,scomber_scombrus,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106
3,alosa,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106
4,alosa_sapidissima,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106


In [11]:
# Add one presence column per species: 1.0 when the species was recorded at the
# row's (year, lat_long) site-year, NaN otherwise.
#
# The species is matched against the 'scientificName' column only. The previous
# `df.isin([col]).any(1)` scanned every column of a frame that grows on each
# iteration, and passed `axis` positionally, which pandas 2.0 rejects.
# Missing species names are skipped so no NaN-named column is created.
species = df['scientificName']
for col in species.dropna().unique():
    # Mark the rows that are themselves a record of this species.
    df[col] = np.where(species.eq(col), 1.0, np.nan)
    # Spread the flag to every other row of the same site-year: the group mean
    # is 1.0 if any row in the group is flagged and NaN if none is.
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.head())

Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long,microgadus_tomcod,alosa_sapidissima,scomber_scombrus,alosa,...,morone_americana,alosa_pseudoharengus,salvelinus_fontinalis,pleuronectes_putnami,lophius_americanus,anguilla_rostrata,urophycis_chuss,syngnathus_fuscus,tautogolabrus_adspersus,salmo_trutta
0,microgadus_tomcod,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
1,alosa_sapidissima,2013,-64.174962,45.182148,Minas Basin>Bay of Fundy,45.182_-64.175,1.0,1.0,1.0,1.0,...,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,
2,scomber_scombrus,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
3,alosa,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
4,alosa_sapidissima,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0


In [12]:
# Collapse to one row per distinct record once the per-row species label is
# removed, printing the row count before and after.
print(len(df))
df = df.drop(columns=['scientificName']).drop_duplicates()
print(len(df))

display(df.head())
# The original row labels are written out as the first CSV column (index=True).
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Processed/acer_temporal_2013_occurence.csv',
    index=True,
)

398
2


Unnamed: 0,year,longitude,latitude,waterbody_name,lat_long,microgadus_tomcod,alosa_sapidissima,scomber_scombrus,alosa,osmerus_mordax_mordax,...,morone_americana,alosa_pseudoharengus,salvelinus_fontinalis,pleuronectes_putnami,lophius_americanus,anguilla_rostrata,urophycis_chuss,syngnathus_fuscus,tautogolabrus_adspersus,salmo_trutta
0,2013,-64.10588,45.39864,Minas Basin>Bay of Fundy,45.399_-64.106,1.0,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,,1.0,1.0,,1.0
1,2013,-64.174962,45.182148,Minas Basin>Bay of Fundy,45.182_-64.175,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,


### Atlantic Museum

In [None]:
# Atlantic museum dataset: keep ray-finned fish records and harmonise the columns.

df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/To_look_at/atlantic_museum.csv')

df = (
    df.loc[df['class'].eq('Actinopterygii'),
           ['scientificName', 'year', 'decimalLongitude', 'decimalLatitude']]
    .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude', 'date_year': 'year'})
    .assign(
        # Lower snake_case species names (the normalisation is idempotent, so one pass suffices).
        scientificName=lambda frame: frame['scientificName'].str.lower().str.replace(' ', '_'),
        # Site key: latitude and longitude rounded to 3 decimals, joined as "lat_long".
        lat_long=lambda frame: frame['latitude'].round(3).astype(str) + '_'
                               + frame['longitude'].round(3).astype(str),
    )
)

print(len(df))
# Drop class-level records and exact duplicate rows.
df = df.loc[df['scientificName'].ne('actinopterygii')].drop_duplicates()
print(len(df))

df.head()

In [None]:
# Earliest and latest year present in the current frame.
years = df['year']
print(years.min(), years.max())

In [None]:
%%time
for col in tqdm(df['scientificName'].unique()):
    df[col] = np.where(df.isin([col]).any(1), 1, np.nan)
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))
    
display(df.head())

In [None]:
# Collapse to one row per distinct record once the per-row species label is
# removed, printing the row count before and after.
print(len(df))
df = df.drop(columns=['scientificName']).drop_duplicates()
print(len(df))

display(df.head())
# Export is currently disabled:
# df.to_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Processed/atlantic_museum_1929_2015_occurence.csv', 
#           index=True)

### DFO

In [None]:
path = '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/DFO/'
indiv_df = []

# Read every DFO export in the folder, keep ray-finned fish records only, and
# harmonise column names before stacking everything into one frame.
for file in tqdm(os.listdir(path)):
    raw = pd.read_csv(f'{path}{file}')
    # Raw record count of the file, before any filtering.
    print(len(raw))

    # Filter and select in one .loc call, then copy, so the column assignments
    # below write to an independent frame instead of a slice of `raw`.
    records = (
        raw.loc[raw['class'].isin(['Actinopterygii']),
                ['scientificName', 'year', 'decimalLongitude', 'decimalLatitude']]
        .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude'})
        .copy()
    )

    # Lower snake_case species names so they can serve as column names later.
    records['scientificName'] = records['scientificName'].str.lower().str.replace(' ', '_')
    # Site key: latitude and longitude rounded to 3 decimals, joined as "lat_long".
    records['lat_long'] = (records['latitude'].round(3).astype(str) + '_'
                           + records['longitude'].round(3).astype(str))

    indiv_df.append(records)

df = pd.concat(indiv_df)

# Fixed: the comma between the label and len(df) was missing, a SyntaxError.
print('ALL: ', len(df))
# Drop class-level records, exact duplicate rows, and rows without a year.
df = df[~df['scientificName'].isin(['actinopterygii'])]
df = df.drop_duplicates().dropna(subset=['year'])
print(len(df))

df.head()

In [None]:
%%time
for col in tqdm(df['scientificName'].unique()):
    df[col] = np.where(df.isin([col]).any(1), 1, np.nan)
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))
    
display(df.head())

In [None]:
# Collapse to one row per distinct record once the per-row species label is
# removed, printing the row count before and after.
print(len(df))
df = df.drop(columns=['scientificName']).drop_duplicates()
print(len(df))

display(df.head())
# The original row labels are written out as the first CSV column (index=True).
df.to_csv(
    '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Processed/dfo_data_1953_2019_occurence.csv',
    index=True,
)

### Combining the remaining datasets in one pass

In [7]:
path = '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/OBIS/Give_up/'

indiv_df = []
# Read every file in the folder, keep ray-finned fish records only, and
# harmonise column names before stacking everything into one frame.
for file in tqdm(os.listdir(path)):
    raw = pd.read_csv(path + file)
    # Dataset name (file name up to ".csv") and its raw record count.
    print(f"{file.split('.csv')[0]} and # Records: {len(raw)}")

    is_fish = raw['class'].eq('Actinopterygii')
    records = (
        raw.loc[is_fish, ['scientificName', 'year', 'decimalLongitude', 'decimalLatitude', 'waterBody']]
        .rename(columns={'decimalLongitude': 'longitude', 'decimalLatitude': 'latitude', 'date_year': 'year',
                         'waterBody': 'waterbody_name'})
        .assign(
            # Species names become lower snake_case so they can serve as column names later.
            scientificName=lambda frame: frame['scientificName'].str.lower().str.replace(' ', '_'),
            # Site key: latitude and longitude rounded to 3 decimals, joined as "lat_long".
            lat_long=lambda frame: frame['latitude'].round(3).astype(str) + '_'
                                   + frame['longitude'].round(3).astype(str),
        )
    )

    indiv_df.append(records)

df = pd.concat(indiv_df)

print('ALL: ', len(df))
# Drop class-level records, exact duplicate rows, and rows without a year.
df = (
    df.loc[df['scientificName'].ne('actinopterygii')]
    .drop_duplicates()
    .dropna(subset=['year'])
)
print(len(df))

df.head()

  interactivity=interactivity, compiler=compiler, result=result)


atlantic_museum and # Records: 106331


  interactivity=interactivity, compiler=compiler, result=result)


DFO_gulf_region_groundfish and # Records: 74792


  interactivity=interactivity, compiler=compiler, result=result)


DFO_maritimes_groundfish_tagging and # Records: 271456


  interactivity=interactivity, compiler=compiler, result=result)


DFO_maritimes_research_vessel and # Records: 157303


  interactivity=interactivity, compiler=compiler, result=result)


DFO_NL_trawl and # Records: 540576


 33%|███▎      | 6/18 [01:27<02:53, 14.44s/it]

DFO_quebec_region_multispecies and # Records: 34401


 39%|███▉      | 7/18 [01:29<01:58, 10.73s/it]

DFO_stock_assessment and # Records: 38720


  interactivity=interactivity, compiler=compiler, result=result)


ECNASAP and # Records: 456665


 56%|█████▌    | 10/18 [01:52<01:19,  9.96s/it]

ichthyoplankton and # Records: 85
NAFO_cod_fisheries and # Records: 706


 61%|██████    | 11/18 [01:52<00:49,  7.02s/it]

NAFO_environmental_surveys and # Records: 1642


 67%|██████▋   | 12/18 [01:52<00:29,  5.00s/it]

NAFO_historical_annual_fisheries and # Records: 614
Northern_Gulf_st_lawrence and # Records: 26811


 83%|████████▎ | 15/18 [01:54<00:08,  2.80s/it]

NSIS_malpeque_bay and # Records: 32
NSIS_salt_marsh and # Records: 5
nuseds_salmon and # Records: 162707


  interactivity=interactivity, compiler=compiler, result=result)


pacific_species and # Records: 95450


100%|██████████| 18/18 [02:08<00:00,  7.13s/it]


shark and # Records: 1540
ALL:  1637740
1315020


Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long
1,teleostei,1987.0,-66.4186,43.9175,,43.918_-66.419
2,lycodes,1960.0,-55.0333,54.866667,,54.867_-55.033
3,hippoglossoides_platessoides,1985.0,-65.8283,42.751945,,42.752_-65.828
4,scomber_scombrus,1983.0,-66.68,42.496944,,42.497_-66.68
5,clupea_harengus,1978.0,-66.2333,43.638332,,43.638_-66.233


In [8]:
# Replace any value in 'scientificName' that is a key of dict_species_name
# with its mapped value; values that are not keys are left unchanged.
#
# The result is assigned back to the column. Calling
# `df['scientificName'].replace(..., inplace=True)` mutates a temporary
# selection; pandas warns about this chained inplace pattern and it stops
# updating `df` once Copy-on-Write is enabled.
df['scientificName'] = df['scientificName'].replace(dict_species_name)
display(df.head())

Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long
1,teleostei,1987.0,-66.4186,43.9175,,43.918_-66.419
2,lycodes,1960.0,-55.0333,54.866667,,54.867_-55.033
3,hippoglossoides_platessoides,1985.0,-65.8283,42.751945,,42.752_-65.828
4,scomber_scombrus,1983.0,-66.68,42.496944,,42.497_-66.68
5,clupea_harengus,1978.0,-66.2333,43.638332,,43.638_-66.233


In [9]:
# One-off export of the distinct names found in the data (kept for reference):
# df_species_namessss = pd.Series(sorted(list(df['scientificName'].unique())))
# df_species_namessss.to_csv('/mnt/c/Users/imrit/Desktop/obis_names.csv', index=False)

# Load the name table and build a dict from its 'obis_used_name' column to its
# 'obis_dict_names' column.
df_species_namessss = pd.read_csv('/mnt/c/Users/imrit/Desktop/obis_names.csv')
dict_obis_names = {
    used_name: dict_name
    for used_name, dict_name in zip(df_species_namessss['obis_used_name'],
                                    df_species_namessss['obis_dict_names'])
}

In [10]:
# Replace any value in 'scientificName' that is a key of dict_obis_names with
# its mapped value; values that are not keys are left unchanged.
#
# The result is assigned back to the column. Calling
# `df['scientificName'].replace(..., inplace=True)` mutates a temporary
# selection; pandas warns about this chained inplace pattern and it stops
# updating `df` once Copy-on-Write is enabled.
df['scientificName'] = df['scientificName'].replace(dict_obis_names)
df.head()

Unnamed: 0,scientificName,year,longitude,latitude,waterbody_name,lat_long
1,teleostei_spp,1987.0,-66.4186,43.9175,,43.918_-66.419
2,lycodes_spp,1960.0,-55.0333,54.866667,,54.867_-55.033
3,hippoglossoides_platessoides,1985.0,-65.8283,42.751945,,42.752_-65.828
4,scomber_scombrus,1983.0,-66.68,42.496944,,42.497_-66.68
5,clupea_harengus,1978.0,-66.2333,43.638332,,43.638_-66.233


In [12]:
# Number of distinct species names and waterbody names; a missing value counts
# as one distinct entry (dropna=False), matching len(unique()).
print(df['scientificName'].nunique(dropna=False))
print(df['waterbody_name'].nunique(dropna=False))

1124
8


In [13]:
# Write the current frame to CSV without the row index.
df.to_csv(
    path_or_buf='/mnt/c/Users/imrit/Desktop/rest_of_OBIS.csv',
    index=False,
)

In [14]:
%%time
for col in tqdm(df['scientificName'].unique()):
    df[col] = np.where(df.isin([col]).any(1), 1, np.nan)
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))
    
display(df.head())

  0%|          | 3/1182 [00:07<45:58,  2.34s/it]


KeyboardInterrupt: 

In [21]:
# Keep only five columns of the processed FISS file (watershed code, waterbody
# type, name, latitude, longitude) and drop exact duplicate rows.
df = (
    pd.read_csv('/mnt/c/Users/imrit/Downloads/FISS_OBSPT_processed_occurence_1894_2020.csv')
    .loc[:, ['NEW_WS_CD', 'WTRBDY_TYE', 'GZTTD_NM', 'latitude', 'longitude']]
    .drop_duplicates()
)
df

Unnamed: 0,NEW_WS_CD,WTRBDY_TYE,GZTTD_NM,latitude,longitude
0,390-137600-40800-00000-0000-0000-000-000-000-0...,Lake,THREE ISLAND LAKE,51.008779,-116.785807
1,128-994100-53400-00000-0000-0000-000-000-000-0...,Lake,MONTE LAKE,50.489991,-119.834209
2,349-434900-13700-77200-0000-0000-000-000-000-0...,Lake,HAHAS LAKE,49.749064,-115.816648
3,230-906800-97600-99700-9080-0000-000-000-000-0...,Lake,BOOT LAKE,54.312938,-122.737491
4,120-246600-51000-34100-0230-0000-000-000-000-0...,Lake,ALLEYNE LAKE,49.926224,-120.568237
...,...,...,...,...,...
50585,300-614500-00000-00000-0000-0000-000-000-000-0...,Lake,VIOLIN LAKE,49.028154,-117.709484
50588,920-384400-18800-00000-0000-0000-000-000-000-0...,Lake,OVERTON LAKE,49.097839,-123.949140
50591,100-385000-98600-22500-0000-0000-000-000-000-0...,Lake,JONES LAKE,52.080607,-121.894670
50735,905-124500-86600-00000-0000-0000-000-000-000-0...,Lake,GRAHAM LAKE,49.510726,-124.750104


In [22]:
# Write the de-duplicated table to CSV without the row index.
df.to_csv(
    path_or_buf='/mnt/c/Users/imrit/Desktop/BC_lake_data.csv',
    index=False,
)

In [18]:
# Number of distinct watershed codes; a missing code counts as one entry.
df['NEW_WS_CD'].nunique(dropna=False)

2899