In [1]:
import pandas as pd
import numpy as np

In [2]:
diss = pd.read_csv("../compilation/data/in_situ_rates_compiled_allvar_uncertainty.csv")
diss.columns

Index(['Depth', 'Source', 'Source_abbrev', 'Latitude', 'Longitude', 'Sample',
       'Material', 'Rate_sa', 'Rate_error_sa', 'Organics', 'Device',
       'Deployment_d', 'Biogenic', 'Year', 'Comments', 'Mesh', 'Rate_mass',
       'Month', 'Size', 'Fragmentation_pct', 'Rate_error_mass', 'Temp_CDisk4',
       'pH_CDisk4_T25', 'Salinity_CDisk4', 'TA_CDisk4', 'DIC_CDisk4_calc13',
       'Oca_CDisk4_calc13', 'Oar_CDisk4_calc13', 'Omega_CDisk4_calc13',
       'Pressure', 'Temp_woa', 'Temp_SD_woa', 'Sal_woa', 'Sal_SD_woa',
       'Doxy_woa', 'Doxy_SD_woa', 'PO4_woa', 'SiOH4_woa', 'TA_cb', 'DIC_cb',
       'NO3_cb', 'PO4_cb', 'SiOH4_cb', 'pH_insitu_cb', 'k_calcite_calc23',
       'Oca_calc23', 'k_aragonite_calc23', 'Oar_calc23',
       'total_calcium_calc23', 'CO2_calc23', 'HCO3_calc23', 'CO3_calc23',
       'Omega_calc23', 'Oca_metab_calc12', 'Oar_metab_calc12',
       'Omega_metab_calc12', 'u_TA_woa', 'u_DIC_woa', 'u_pH_insitu_woa',
       'u_SiOH4_woa', 'u_PO4_woa', 'u_TA_woa_cb', 'u_DIC_wo

## encode categorical data

In [3]:
# use correct Omega from CDisk4 cruise
diss["Omega_use"] = np.where(diss["Omega_CDisk4_calc13"].isna(), 
                                             diss["Omega_calc23"], diss["Omega_calc23"])

# only where I have mass normaized data and where I don't have values for Omega
diss = diss.dropna(subset = ['Rate_mass', 'Omega_use'])


# add new column to dataset with categorical data
# 0 is inorganic
# 1 is treated
# 2 is untreated
conditions  = [diss["Biogenic"] == False, (diss["Organics"] == False), diss["Organics"] == True ]
choices     = [ 0, 1, 2 ]
diss["inorg_treated"] = np.select(conditions, choices, default=np.nan)

# add new column for sizes as numbers (otherwise nan is treated as a separate category instead of no data)
conditions  = [diss["Size"] == "XXXS", diss["Size"] == "XXS", diss["Size"] == "XS", diss["Size"] == "S", diss["Size"] == "M", 
               diss["Size"] == "L", diss["Size"] == "XL", diss["Size"] == "XXL"]
choices     = [1,2,3,4,5,6,7,8]
diss["Size_num"] = np.select(conditions, choices, default=np.nan)
# diss = diss.sort_values(by=["Size_numeric"], ascending=True)

# add new column for method how dissolution was determined
# 0 is weight loss
# 1 is d13C
# 2 is surface roughness
conditions  = [diss["Source_abbrev"] == "s22", diss["Source_abbrev"] == "d19", diss["Source_abbrev"] == "n19",
               diss["Source_abbrev"] == "t97"]
choices     = [1,1,1,2]
diss["Method_num"] = np.select(conditions, choices, default=0)

# Calicte is 0
# Aragonite is 1
# Mg-calcite not part
diss = diss[(diss['Material'] == 'Calcite') | (diss['Material'] == 'Aragonite')]
diss['Material_num'] = np.where((diss["Material"] == 'Calcite'), 0, 1)

# Device used is 1
diss['Device_num'] = np.where((diss["Device"]), 1, 0)

# add new column with location
# 0 is Atlantic
# 1 is Pacific
diss["Location"] = np.where((diss["Longitude"] > -80) & (diss["Longitude"] < 0), 0, 1)


## get rid of missing data

In [4]:
# delete rows I am using where were data is missing 
# this deletes all missing rows (because)

diss = diss[['Depth', 'Source_abbrev', 'Latitude', 'Longitude',
       'Sample', 'Material', 'Organics', 'Device', 'PO4_cb',
       'Deployment_d', 'Biogenic', 'Comments', 'Mesh',
       'Rate_mass', 'Size', 'Temp_woa', 'Sal_woa', 
       'TA_cb', 'DIC_cb', 'Doxy_woa', 'Pressure', 'pH_insitu_cb', 
        'Omega_calc23', "Omega_use", "Size_num", "Deployment_d", 'Material_num',
          "Device_num", "inorg_treated", "Mesh", 'Method_num']]


#diss = diss[diss['Rate_mass'] > 0.001]

diss = diss.dropna()
diss.shape

(579, 31)

In [5]:
for s in diss['Source_abbrev'].unique():
    print(s, diss[diss['Source_abbrev'] == s].shape)

M77 (106, 31)
HE78 (29, 31)
T81 (14, 31)
M82 (65, 31)
F08 (298, 31)
N19 (26, 31)
D19 (20, 31)
S22 (21, 31)


## save dataset

In [6]:
diss.to_csv("data/in_situ_data_for_regression.csv", index=False)