In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as graph
import seaborn as sns

import cartopy.crs as ccrs

import statsmodels

import os
from tqdm import tqdm, trange

from convertbng.util import convert_bng, convert_lonlat
import utm

In [None]:
import re

def dms2dd(s):
    # example: s = """0°51'56.29"S"""
    degrees, minutes, seconds, direction = re.split('[°\'"]+', s)
    dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60);
    if direction in ('S','W'):
        dd*= -1
    return dd

In [None]:
# code and species dictionary

code_df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Alberta/To_look_at/Banff_NP_Freshwater_Lake_Fish_Index_2017_data_dictionary.csv')
# code_df = code_df.iloc[:-6]
code_df.rename(columns={'Data_Value_Valeur_de_la_donnée': 'code_name', 
                        'Value_Description_EN_Description_de_la_valeur': 'common_name'}, inplace=True)
code_df = code_df[['code_name', 'common_name']]
code_df['common_name'] = code_df['common_name'].str.lower().str.replace(' ', '_')
display(code_df.sample(5))

dict_code_name = code_df.set_index('code_name').to_dict()['common_name']

In [None]:
new_codes = {'LNDC': 'longnose_dace', 'BNTR': 'brown_trout', 'RNTR1': 'rainbow_trout', 'CTTR1': 'cutthroat_trout', 
             'WHSC': 'white_sucker', 'BLTRCTTR(N&I)': 'bull_trout_x_cutthroat_trout', 'CTTR(hybrids)': 'cutthroat_trout', 
             'CTTRhybrids': 'cutthroat_trout', 'SUCK': 'white_sucker'}

dict_code_name.update(new_codes)

In [None]:
# Species dictionay

species_name_df = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/species_name_dictionary.csv')

dict_species_name = species_name_df.set_index('common_name').to_dict()['scientific_name']

In [None]:
species_name_df = pd.DataFrame.from_dict({'common_name': list(dict_species_name.keys()), 
                                        'scientific_name' : list(dict_species_name.values())})
# species_name_df.to_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/species_name_dictionary.csv', index=False)

### Column names dict

In [16]:
# df_col_names.to_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/merged_col_names_dict.csv', index=False)
df_col_names = pd.read_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/merged_col_names_dict.csv')
dict_col_names = dict(zip(df_col_names['used_col_names'], df_col_names['to_use_col_names']))

# Merge Data

In [9]:
path = '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Master_compilation/All_files_with_year_lat_long/'

df_list = []
for file in tqdm(os.listdir(path)):
    df_temp = pd.read_csv(f'{path}{file}')
    df_temp.rename(columns=dict_col_names, inplace=True)
    df_temp['latitude'] = df_temp['latitude'].astype(float)
    df_temp['longitude'] = df_temp['longitude'].astype(float)
    df_temp = df_temp.sum(axis=1, level=0)
    df_temp = df_temp.T.drop_duplicates()
    df_temp = df_temp.T

    df_list.append(df_temp)

100%|██████████| 67/67 [26:57<00:00, 24.15s/it]   


In [10]:
%%time
df = pd.concat(df_list, sort=False)

CPU times: user 1min 29s, sys: 52.1 s, total: 2min 21s
Wall time: 2min 32s


In [11]:
# df.to_csv('/mnt/c/Users/imrit/Downloads/Sharma_fish_project/merged_dfs_jan_14.csv', index=False)

In [None]:
df['remove_please']

In [None]:
print(len(df))
df.drop(columns=['remove_please'], inplace=True)
df = df.loc[(df['latitude'] > 12) & (df['longitude'] < -30)]
df.drop_duplicates(inplace=True)
print(len(df))

display(df.head(5))

In [None]:
display(df.head(5))

In [12]:
df_col_names = pd.Series(sorted(df.columns))
df_col_names.to_csv('/mnt/c/Users/imrit/Desktop/merged_cols_check_fish_df.csv', index=False)

  


In [None]:
%%time
print(df['year'].min(), df['year'].max())
for col in tqdm(df['SPECIES_NAME'].unique()):
    df[col] = np.where(df.isin([col]).any(1), 1, np.nan)
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

display(df.sample(5))  

In [7]:
df_morphologies_list = pd.read_csv('/mnt/c/Users/imrit/Desktop/merged_cols_morphologies.csv', header=None)
df_morphologies_list[0].to_list()

['AirTemp_8110',
 'Area_ha',
 'Chlorophyll-a (mgÂ·m-3)',
 'Conductivity(uScms)',
 'DD5_8110',
 'Depth_Max',
 'Depth_Mn',
 'Dissolved Oxygen (mg/L)',
 'SDF',
 'Sal',
 'Secchi_Su',
 'TDS (mg/L)',
 'Temp',
 'Thermo_Obs',
 'Thermo_Pred',
 'Tide',
 'TotalPhosphorus(ugL)',
 'TrueColour_pctl',
 'Water Bottom Depth (m)',
 'Water Conductivity (ï¿½S/cm)',
 'Water Temperature (C)',
 'Water pH',
 'air_temperature',
 'area_(ha)',
 'conductivity_(us/cm)',
 'island_perimeter_(km)',
 'lake_elevation_(m)',
 'lat_long',
 'latitude',
 'longitude',
 'maximum_depth_(m)',
 'mean_depth',
 'mei',
 'o2',
 'pH',
 'pH_pctl',
 'pLittoral',
 'perimeter_(km)',
 'ph',
 'primary_bassin',
 'secchi_depth_(m)',
 'secondary_bassin',
 'site_description',
 'surface_area_(ha)',
 'surface_water_temp',
 'tds_(mg/l)',
 'tide_stg',
 'waterbody_name',
 'year']

In [None]:
['AirTemp_8110', 'Area_ha', 'Chlorophyll-a (mgÂ·m-3)', 'Conductivity(uScms)', 'DD5_8110', 'Depth_Max', 'Depth_Mn',
 'Dissolved Oxygen (mg/L)', 'SDF', 'Sal', 'Secchi_Su', 'TDS (mg/L)', 'Temp', 'Thermo_Obs', 'Thermo_Pred', 'Tide', 
 'TotalPhosphorus(ugL)', 'TrueColour_pctl', 'Water Bottom Depth (m)', 'Water Conductivity (ï¿½S/cm)', 'Water Temperature (C)', 
 'Water pH', 'air_temperature', 'area_(ha)', 'conductivity_(us/cm)', 'island_perimeter_(km)', 'lake_elevation_(m)', 'lat_long', 
 'latitude', 'longitude', 'maximum_depth_(m)', 'mean_depth', 'mei', 'o2', 'pH', 'pH_pctl', 'pLittoral', 'perimeter_(km)', 'ph', 
 'primary_bassin', 'secchi_depth_(m)', 'secondary_bassin', 'site_description', 'surface_area_(ha)', 'surface_water_temp', 
 'tds_(mg/l)', 'tide_stg', 'waterbody_name', 'year']

## Ontario

In [39]:
path = '/mnt/c/Users/imrit/Downloads/Sharma_fish_project/Master_compilation/Ontario/'

df_list = []
for file in os.listdir(path):
    df = pd.read_csv(f'{path}{file}')
    df_list.append(df)

df = pd.concat(df_list)
df.rename(columns=dict_col_names, inplace=True)
df = df.sum(axis=1, level=0)

display(df.sample(5))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,AirTemp_8110,Area_ha,Conductivity(uScms),DD5_8110,Depth_Max,Depth_Mn,SDF,Secchi_Su,TDS (mg/L),Thermo_Obs,...,semotius_spp,site_description,squalius_cephalus,stizostedion_spp,surface_area_(ha),surface_water_temp,tds_(mg/l),umbra_limi,waterbody_name,year
5175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2CD04,0.0,0.0,19.1,19.0,29.0,0.0,Three Loon L.,1984
2143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4JF26,0.0,0.0,718.3,16.0,68.0,0.0,Fullerton L.,1972
792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5QE01,0.0,0.0,1742.3,24.0,71.0,0.0,Goshawk L.,1981
647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5QA05,0.0,0.0,11.3,19.8,21.0,0.0,Savitsky L.,1970
1160,5.9,1500.0,83.0,1959.0,42.7,15.8,5.303481,4.4,55.278,9.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,Black Donald L.,2009


In [42]:
morphologies_cols = ['AirTemp_8110', 'Area_ha', 'Chlorophyll-a (mgÂ·m-3)', 'Conductivity(uScms)', 'DD5_8110', 'Depth_Max', 
                     'Depth_Mn', 'Dissolved Oxygen (mg/L)', 'SDF', 'Sal', 'Secchi_Su', 'TDS (mg/L)', 'Temp', 'Thermo_Obs', 
                     'Thermo_Pred', 'Tide',  'TotalPhosphorus(ugL)', 'TrueColour_pctl', 'Water Bottom Depth (m)', 
                     'Water Conductivity (ï¿½S/cm)', 'Water Temperature (C)',  'Water pH', 'air_temperature', 'area_(ha)', 
                     'conductivity_(us/cm)', 'island_perimeter_(km)', 'lake_elevation_(m)', 'lat_long',  'latitude', 
                     'longitude', 'maximum_depth_(m)', 'mean_depth', 'mei', 'o2', 'pH', 'pH_pctl', 'pLittoral', 
                     'perimeter_(km)', 'ph',  'primary_bassin', 'secchi_depth_(m)', 'secondary_bassin', 'site_description', 
                     'surface_area_(ha)', 'surface_water_temp', 'tds_(mg/l)', 'tide_stg', 'waterbody_name', 'year', 
                     'Chlorophyll-a (mg·m-3)', 'Water Conductivity (�S/cm)', 'growing_degree_days']

species_cols = [col for col in df.columns if col not in morphologies_cols]

In [43]:
for col in tqdm(species_cols):
    df[col] = df[col].replace(0, np.nan)
    df[col] = np.where(df[col].notnull(), 1, np.nan)
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

100%|██████████| 152/152 [00:01<00:00, 77.73it/s]


In [44]:
col_vals = [col for col in df.columns if col not in ['secondary_bassin', 'site_description', 'waterbody_name', 'year', 
                                                     'lat_long', 'primary_bassin']]

In [45]:
for col in tqdm(col_vals):
    df[col] = df[col].fillna(df.groupby(['year', 'lat_long'])[col].transform('mean'))

100%|██████████| 184/184 [00:02<00:00, 78.61it/s]


In [46]:
print(len(df))
df = df[df['year'] != 0]
df.drop_duplicates(subset=df.columns.difference(['secondary_bassin', 'site_description', 'waterbody_name', 'primary_bassin']), 
                   inplace=True)
print(len(df))

11290
11281


In [49]:
df.to_csv('/mnt/c/Users/imrit/Desktop/ontario_1957_2017.csv', index=False)

In [48]:
df['year'].max()

2017