## Transformations used to convert the original datasets to match the SOPHY schema

In [1]:
import sqlite3
import warnings

import cartopy.crs as ccrs
import geopandas as gpd
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from pandas import DataFrame

import geolabel
import sophysql
import sophytaxa

In [20]:
# Open the SQLite database file and run the schema script against it.
con = sqlite3.connect("test.db")
with open('schema.sql', 'r') as schema_file:
    schema_script = schema_file.read()
con.executescript(schema_script)
con.commit()

In [32]:
# Display the current contents of the `sample` table.
pd.read_sql("select * from sample", con)

Unnamed: 0,id,source_name,cruise,latitude,longitude,timestamp,front_zone,sector,percent_phaeo,percent_diatom,...,chl_a,salinity,temperature,mld,par,nitrate,nitrite,phosphate,silicate,extra_json
0,1,,PD91-09,-64.83333,-64.05167,1991-11-07 00:36:00,SIZ,,0.000000,0.000603,...,0.397000,33.791,-1.7618,,,,,,,"{""cryptophytes"":0.0,""diatoms"":0.000602825,""hap..."
1,2,,PD91-09,-64.83333,-64.05167,1991-11-07 00:36:00,SIZ,,,,...,0.298000,33.798,-1.7445,,,,,,,"{""notes"":""**HPLC in datazoo file listed as ug\..."
2,3,,PD91-09,-64.83333,-64.05167,1991-11-07 00:36:00,SIZ,,0.000000,0.000000,...,0.265000,33.820,-1.6974,,,,,,,"{""cryptophytes"":0.0,""diatoms"":0.0,""haptophytes..."
3,4,,PD91-09,-64.83333,-64.05167,1991-11-07 00:36:00,SIZ,,,,...,,33.846,-1.6188,,,,,,,"{""part_org_carbon"":123.16,""tot_nitrogen"":17.0}"
4,5,,PD91-09,-64.83333,-64.05167,1991-11-07 00:36:00,SIZ,,0.073917,0.388492,...,0.284000,33.879,-1.4717,,,,,,,"{""cryptophytes"":0.001472699,""diatoms"":0.388492..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49220,49221,,PAL2021,-64.81500,-64.04050,2021-05-04 00:00:00,SIZ,,,,...,0.533521,,,,,,,,,"{""phaeopigments"":0.113480187}"
49221,49222,,PAL2021,-64.81500,-64.04050,2021-05-04 00:00:00,SIZ,,,,...,0.488506,,,,,,,,,"{""phaeopigments"":0.1093088}"
49222,49223,,PAL2021,-64.81500,-64.04050,2021-05-04 00:00:00,SIZ,,,,...,0.430152,,,,,,,,,"{""phaeopigments"":0.190364512}"
49223,49224,,PAL2021,-64.81500,-64.04050,2021-05-04 00:00:00,SIZ,,,,...,0.355125,,,,,,,,,"{""phaeopigments"":0.081884662}"


1) Palmer LTER dataset
- Link to source and info

In [3]:
# Load the two Palmer LTER exports (cruise file and station file).
lter1 = pd.read_csv("../data/in/datasets/unmodified/AntarcticaLTERcompiledData_Cruise_forEDI.csv")
lter2 = pd.read_csv("../data/in/datasets/unmodified/AntarcticaLTERcompiledData_Station_forEDI.csv")

# Source column name -> column name used in this notebook / the database.
lter_sql: dict = {"DatetimeGMT": "timestamp", "Latitude": "latitude", "Longitude": "longitude",
                  "Depth": "depth", "Temperature": "temperature", "Salinity": "salinity", "Density": "density",
                  "Chlorophyll": "chl_a", "Fluorescence": "fluorescence", "Phaeopigment": "phaeopigments",
                  "PrimaryProduction": "primary_prod", "studyName": "cruise", "PAR": "par",
                  "Prasinophytes": "prasinophytes", "Cryptophytes": "cryptophytes",
                  "MixedFlagellates": "mixed_flagellates", "Diatoms": "diatoms", "Haptophytes": "haptophytes",
                  "NO3": "nitrate", "NO2": "nitrite", "DIC1": "diss_inorg_carbon", "DOC": "diss_org_carbon",
                  "POC": "part_org_carbon", "SiO4": "silicate", "N": "tot_nitrogen",
                  "PO4": "phosphate", "Notes1": "notes"}

# Keep only the mapped columns and rename them.
lter1 = lter1[list(lter_sql)].rename(columns=lter_sql)
lter2 = lter2[list(lter_sql)].rename(columns=lter_sql)
# ignore_index gives every row a unique label; without it the two files' row
# numbers collide, which makes the index-aligned assignments below fragile.
lter = pd.concat([lter1, lter2], ignore_index=True)

# Drop rows that cannot be placed in time/space or that fall outside the
# accepted longitude range / north of 30 degrees S, and report how many were removed.
rows_before = len(lter)
lter = lter.dropna(subset=['timestamp', 'longitude', 'latitude'])
lter = lter[lter['longitude'].between(-180, 180)]
lter = lter[lter['latitude'] <= -30]
rows_dropped = rows_before - len(lter)
if rows_dropped:
    warnings.warn(f"Palmer LTER: dropped {rows_dropped} of {rows_before} rows with missing or "
                  f"out-of-range timestamp/longitude/latitude")

# Group chemtax into three main categories
lter['percent_phaeo'] = lter['haptophytes']
lter['percent_diatom'] = lter['diatoms']
# NaN in any of the three inputs makes the sum NaN for that row.
lter['percent_other'] = lter['prasinophytes'] + lter['mixed_flagellates'] + lter['cryptophytes']

# Project the sample points and the zone polygons to the same CRS before joining.
data_gdf = GeoDataFrame(lter, geometry=gpd.points_from_xy(lter['longitude'], lter['latitude']), crs='EPSG:4326')
data_gdf = data_gdf.to_crs(crs=ccrs.SouthPolarStereo())
zones_gdf = gpd.read_file(geolabel.zones_shapefile).to_crs(ccrs.SouthPolarStereo())
# Spatially join data points with zones (polygons) to get labelled data
lter = DataFrame(data_gdf.sjoin(zones_gdf, how='left').drop(columns=['geometry', 'index_right']))
# A left join repeats a point once per matching polygon; keep one row per sample
# (same intent as the drop_duplicates on id in the Joy-Warren cell).
lter = lter[~lter.index.duplicated(keep='first')]

# Columns the `sample` table does not have are packed into one JSON string per
# row (missing values omitted) and then removed from the frame.
extra = lter.columns.difference(sophysql.get_table_cols("sample"))
lter["extra_json"] = lter[extra].agg(lambda r: r[r.notna()].to_json(), axis=1)
lter = lter.drop(columns=extra)
lter

Unnamed: 0,timestamp,latitude,longitude,depth,temperature,salinity,chl_a,cruise,par,nitrate,nitrite,silicate,phosphate,percent_phaeo,percent_diatom,percent_other,front_zone,extra_json
0,1991-11-07 00:36:00,-64.83333,-64.05167,2.14998,-1.7618,33.791,0.397000,PD91-09,,,,,,0.000000,0.000603,0.999397,SIZ,"{""cryptophytes"":0.0,""diatoms"":0.000602825,""hap..."
1,1991-11-07 00:36:00,-64.83333,-64.05167,10.48500,-1.7445,33.798,0.298000,PD91-09,,,,,,,,,SIZ,"{""notes"":""**HPLC in datazoo file listed as ug\..."
2,1991-11-07 00:36:00,-64.83333,-64.05167,20.44900,-1.6974,33.820,0.265000,PD91-09,,,,,,0.000000,0.000000,1.000000,SIZ,"{""cryptophytes"":0.0,""diatoms"":0.0,""haptophytes..."
3,1991-11-07 00:36:00,-64.83333,-64.05167,30.60000,-1.6188,33.846,,PD91-09,,,,,,,,,SIZ,"{""part_org_carbon"":123.16,""tot_nitrogen"":17.0}"
4,1991-11-07 00:36:00,-64.83333,-64.05167,40.54660,-1.4717,33.879,0.284000,PD91-09,,,,,,0.073917,0.388492,0.537591,SIZ,"{""cryptophytes"":0.001472699,""diatoms"":0.388492..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19341,2021-05-04 00:00:00,-64.81500,-64.04050,20.00000,,,0.533521,PAL2021,,,,,,,,,SIZ,"{""phaeopigments"":0.113480187}"
19342,2021-05-04 00:00:00,-64.81500,-64.04050,35.00000,,,0.488506,PAL2021,,,,,,,,,SIZ,"{""phaeopigments"":0.1093088}"
19343,2021-05-04 00:00:00,-64.81500,-64.04050,50.00000,,,0.430152,PAL2021,,,,,,,,,SIZ,"{""phaeopigments"":0.190364512}"
19344,2021-05-04 00:00:00,-64.81500,-64.04050,65.00000,,,0.355125,PAL2021,,,,,,,,,SIZ,"{""phaeopigments"":0.081884662}"


In [4]:
# Append the Palmer LTER rows to the `sample` table (the frame index is not written).
lter.to_sql('sample', con, index=False, if_exists='append')

49225

2) Joy-Warren 2019 dataset
- Link to source and info

In [18]:
"""Writes Joy-Warren 2019 dataset to database"""
joyw: DataFrame = pd.read_csv('../data/in/datasets/modified/joy_warren.csv', encoding='utf-8')
# Average rows per depth bin and station. A new bin starts whenever depth changes
# by more than 15% relative to the previous row:
# station1([1.7, 1.8, 2.1][9.7, 9.8, 10.2]...), station2...
joyw = joyw.groupby([joyw["depth"].pct_change().abs().gt(0.15).cumsum(), "station"]).mean(numeric_only=True)
# Discard the bin counter, turn station back into a column, and sort by depth:
# merge_asof needs both inputs ordered by the `on` key.
joyw = joyw.reset_index(level=0, drop=True).reset_index(level=0).sort_values(by='depth')
# --------------------------------------------------
jwchemtax = pd.read_csv('../data/in/datasets/modified/joy_warren_chemtax.csv')
jwchemtax = jwchemtax.dropna().sort_values(by='depth')
# Attach to each chemtax row the nearest averaged row from the same station,
# provided its depth is within 2 units.
joyw: DataFrame = pd.merge_asof(jwchemtax, joyw, by='station', on='depth', direction='nearest',
                                  tolerance=2).sort_values(by='station')

joyw['timestamp'] = pd.to_datetime(joyw['date'], format='%Y%m%d', errors='coerce')
# The original chained .dropna().drop(columns=...) onto the timestamp Series, where
# neither call had any effect; apply them to the frame instead.
# NOTE(review): this now removes rows with no parseable date and keeps date/time
# out of extra_json - confirm that is the intended result.
joyw = joyw.dropna(subset=['timestamp']).drop(columns=['date', 'time'], errors='ignore')
joyw['source_name'] = 'joyw'
joyw['percent_phaeo'] = joyw['haptophytes']
joyw['percent_diatom'] = joyw['diatoms']
joyw['percent_other'] = joyw['chlorophytes'] + joyw['mixed_flagellates'] + joyw['cryptophytes']
# ----------------------------------------------------
joyw = joyw.reset_index(drop=True)
# Continue numbering after the highest id already stored; coalesce covers an
# empty table, where max(id) is NULL.
max_id: int = int(pd.read_sql("select coalesce(max(id), 0) as max_id from sample", con=con)['max_id'][0]) + 1
joyw['id'] = np.arange(max_id, max_id + len(joyw))
# (id, station, depth) lookup used below to link microscopy rows to samples.
jwmkey = joyw[['id', 'station', 'depth']].sort_values(by='depth')


# Columns the `sample` table does not have are packed into one JSON string per
# row (missing values omitted) and then removed from the frame.
extra = joyw.columns.difference(sophysql.get_table_cols("sample"))
joyw["extra_json"] = joyw[extra].agg(lambda r: r[r.notna()].to_json(), axis=1)
joyw = joyw.drop(columns=extra)

microscopy = pd.read_csv('../data/in/datasets/modified/joy_warren_microscopy.csv', encoding='utf-8').dropna()
# Labels in the `taxa` column that get no aphia_id.
replace: tuple = ('centric', 'pennate', 'unknown diatom', 'dinoflagellate', 'ciliate', 'silicoflagellate')
are_taxa = ~microscopy['taxa'].isin(replace)
# The original also called .rename(sophytaxa.worms_sql) here. Without columns= that
# targets the row index, which is overwritten on the next line, so it never took
# effect; the AphiaID column is therefore read under its original name below.
taxa: DataFrame = pd.read_csv("../data/in/worms/joy_warren_worms.csv", encoding='utf-8')
# ----------------------------------
# NOTE(review): assumes the worms file has one row per remaining microscopy row,
# in the same order - a length mismatch raises here.
taxa.index = microscopy[are_taxa].index
microscopy['aphia_id'] = taxa['AphiaID']
microscopy = microscopy.sort_values(by='depth')
# Link each microscopy row to the sample at the same station whose depth is
# nearest, within 1 unit.
microscopy = pd.merge_asof(microscopy, jwmkey, by='station', on='depth', direction='nearest', tolerance=1)
microscopy = microscopy.rename({'id': 'sample_id', 'taxa': 'name', 'group': 'groups'},
                               axis="columns")
# Keep only the columns that exist in the `microscopy` table.
microscopy = microscopy[microscopy.columns.intersection(sophysql.get_table_cols("microscopy"))]

# Project the sample points and the zone polygons to the same CRS before joining.
data_gdf = GeoDataFrame(joyw, geometry=gpd.points_from_xy(joyw['longitude'], joyw['latitude']), crs='EPSG:4326')
data_gdf = data_gdf.to_crs(crs=ccrs.SouthPolarStereo())
zones_gdf = gpd.read_file(geolabel.zones_shapefile).to_crs(ccrs.SouthPolarStereo())
# Spatially join data points with zones (polygons) to get labelled data;
# a point matching several polygons is kept once per id.
joyw = DataFrame(data_gdf.sjoin(zones_gdf, how='left').drop(columns=['geometry', 'index_right'])).drop_duplicates(subset=['id'])
joyw

Unnamed: 0,name,biovolume,aphia_id,sample_id
0,pennate,613332,,49607
1,Proboscia,39235,149167.0,49607
2,Thalassiosira,3187660,148912.0,49607
3,dinoflagellate,455,,49607
4,Banquisia,96678,291402.0,49607
...,...,...,...,...
132,Pseudo-nitzschia,165508,149151.0,49594
133,Phaeocystis antarctica,263589832,341585.0,49594
134,silicoflagellate,304,,49594
135,Guinardia,603307,149111.0,49594


In [21]:
# Append the Joy-Warren samples, then the microscopy rows that point at them via sample_id.
joyw.to_sql('sample', con, index=False, if_exists='append')
microscopy.to_sql('microscopy', con, index=False, if_exists='append')

137

3) Alderkamp dataset
- Link to source and info

4) Garibotti dataset
- Link to source and info