# Introduction

This notebook contains the data cleaning and amalgamation for the final plot of the main body of my paper. 
The notebook also explores some other samples that we do not include, and includes brief descriptions of the methodology, limitations and attributes of each study.

# TODO pay attention to RL, CEL corrections and dwarf galaxy categorizations

# Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import vice
from scipy.optimize import curve_fit
from astropy.io import fits
import json
import matplotlib as mpl
import os
import toml

In [None]:
from surp import ViceModel, yields, subgiants, DATA_DIR
from surp import gce_math as gcem

import surp
import arya
from arya import COLORS

In [None]:
surp.set_yields()

In [None]:
import sys; sys.path.append("..")

In [None]:
from singlezone import run_singlezone, exp_sfh

In [None]:
def to_nice(apogee_name):
    """Turn an APOGEE-style column name into a bracket-notation label.

    The name is title-cased and each underscore becomes a slash, so
    ``"FE_H"`` gives ``"[Fe/H]"`` and ``"C_O"`` gives ``"[C/O]"``.
    """
    ratio = "/".join(apogee_name.title().split("_"))
    return f"[{ratio}]"

In [None]:
def plot_abund_errs(df, x="O_H", y="C_O", fmt="o", **kwargs):
    """Draw ``df[y]`` against ``df[x]`` with error bars on the current axes.

    Uncertainties are read from the ``{x}_err`` and ``{y}_err`` columns.
    Rows whose error in either coordinate is NaN or not strictly positive
    are left out of the plot. Axis labels are produced by ``to_nice`` and any
    extra keyword arguments are forwarded to ``plt.errorbar``.
    """
    x_val, y_val = df[x].values, df[y].values
    x_unc, y_unc = df[f"{x}_err"].values, df[f"{y}_err"].values

    # keep only points with a usable (non-NaN, positive) error on both axes
    has_errs = ~(np.isnan(x_unc) | np.isnan(y_unc)) & (x_unc > 0) & (y_unc > 0)

    plt.errorbar(
        x_val[has_errs], y_val[has_errs],
        xerr=x_unc[has_errs], yerr=y_unc[has_errs],
        fmt=fmt, capsize=0, **kwargs,
    )
    plt.xlabel(to_nice(x))
    plt.ylabel(to_nice(y))

In [None]:
def plot_sample(df, **kwargs):
    """Show the standard diagnostic abundance planes for one sample.

    Always draws [O/Fe] vs [Fe/H], [C/O] vs [O/H] and [C/O] vs [O/Fe].
    When the frame has an ``MG_FE`` column, three Mg-based planes are drawn
    as well. Every plane gets its own figure. Keyword arguments are forwarded
    to ``plot_abund_errs``.
    """
    plot_abund_errs(df, x="FE_H", y="O_FE", **kwargs)
    plt.show()

    plot_abund_errs(df, **kwargs)
    plt.show()

    plot_abund_errs(df, x="O_FE", **kwargs)
    plt.show()

    if "MG_FE" in df.columns:
        # One figure per plane: drawing all three before a single plt.show()
        # stacks unrelated quantities on the same axes and leaves only the
        # last pair of axis labels.
        for x, y in [("O_H", "O_MG"), ("MG_H", "C_MG"), ("MG_FE", "C_MG")]:
            plot_abund_errs(df, x=x, y=y, **kwargs)
            plt.show()
                    

In [None]:
def calc_errs(df, idx=None):
    """Build a one-row frame with a representative point and the mean errors.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``O_H``, ``C_O``, ``O_H_err`` and ``C_O_err``.
    idx : int, optional
        Positional row used as the representative point. When omitted, the
        row whose ``O_H`` is closest to the sample mean of ``O_H`` is used.

    Returns
    -------
    pd.DataFrame
        A single row with columns ``O_H_err``, ``C_O_err`` (NaN-ignoring
        means of the input errors), ``O_H`` and ``C_O`` (the chosen point).
    """
    if idx is None:
        o_h_mean = np.mean(df["O_H"])
        # Series.argmin returns a position and skips NaN, matching .iloc below
        idx = int(((df["O_H"] - o_h_mean) ** 2).argmin())

    # The representative point is filled in for an explicit idx as well;
    # previously those columns were simply missing in that case.
    record = {
        "O_H_err": np.nanmean(df["O_H_err"]),
        "C_O_err": np.nanmean(df["C_O_err"]),
        "O_H": df["O_H"].iloc[idx],
        "C_O": df["C_O"].iloc[idx],
    }
    return pd.DataFrame([record])

In [None]:
def plot_sample_err(df, df_err, color=COLORS[0], marker="*", label="", **kwargs):
    """Scatter a sample in the [C/O]-[O/H] plane with one summary error bar.

    ``df`` supplies the points (``O_H``, ``C_O``). ``df_err`` supplies the
    position and size of the error bar (``O_H``, ``C_O``, ``O_H_err``,
    ``C_O_err``), drawn without marker or connecting line in the same
    colour. Extra keyword arguments go to both the scatter and the
    errorbar call.
    """
    plt.scatter(df["O_H"], df["C_O"], color=color, marker=marker, label=label, **kwargs)

    bar_style = dict(marker="none", ls="none", color=color, capsize=0)
    plt.errorbar(
        df_err["O_H"], df_err["C_O"],
        xerr=df_err["O_H_err"], yerr=df_err["C_O_err"],
        **bar_style, **kwargs,
    )

    plt.xlabel("[O/H]")
    plt.ylabel("[C/O]")

In [None]:
# Additive offset (dex) taking log10 of a C/O number ratio to [C/O]: the
# 12.01/15.99 factor weights by the atomic masses of C and O, and the result
# is normalised by the ratio of the vice solar values for C and O.
c_o_correction = np.log10(12.01/15.99) - np.log10(vice.solar_z("c") / vice.solar_z("o"))
# Additive offset (dex) applied to 12 + log10(O/H) values read off figures.
# NOTE(review): no hydrogen term appears here; the cells that use this also
# subtract a further 0.15 / 0.14 by hand, presumably to absorb it. TODO confirm.
eps_o_correction =  - 12 + np.log10(15.99) - np.log10(vice.solar_z("o"))

In [None]:
import toml

In [None]:
def contains_min_cols(d, cols):
    """Report whether mapping ``d`` has every key listed in ``cols``.

    Prints ``missing <col>`` for the first absent key and returns False.
    Returns True when no key is missing.
    """
    nothing_missing = object()
    first_missing = next((col for col in cols if col not in d.keys()), nothing_missing)
    if first_missing is nothing_missing:
        return True

    print("missing", first_missing)
    return False

In [None]:
def toml_to_df(y, cols=None):
    """Stack table-like records into a single DataFrame.

    Parameters
    ----------
    y : iterable or dict
        Each element is anything ``pd.DataFrame`` accepts (for example a dict
        of equal-length lists). A single dict, such as the result of
        ``toml.load``, is treated as one record; iterating it directly would
        hand bare key strings to ``pd.DataFrame`` and fail.
    cols : list of str, optional
        When given, only these columns are returned, in this order. Added so
        that the two-argument call used elsewhere in this notebook is valid.

    Returns
    -------
    pd.DataFrame
        All records concatenated with a fresh ``0..n-1`` index, or an empty
        frame when there are no records.
    """
    records = [y] if isinstance(y, dict) else y
    frames = [pd.DataFrame(row) for row in records]

    if not frames:
        return pd.DataFrame()

    # a single concat instead of growing the frame row by row
    df = pd.concat(frames, ignore_index=True)
    if cols is not None:
        df = df[list(cols)]
    return df

In [None]:
def parsetoml(filename):
    """Read the TOML file at ``filename`` and return whatever ``toml.load`` parses from it."""
    with open(filename, "r") as f:
        return toml.load(f)

# Stars
Samples of MW stars.
See also `catalogues` and `clean_surveys` notebooks.

## Amarsi et al. (2019)
3D NLTE corrected sample

Builds on: 
- Nissen et al. (2014)
- Amarsi et al. (2019a), which in turn builds on Nissen et al. (2007) and contains most of what Fabbian et al. (2009) present.

In [None]:
# The separator is a regex (a tab followed by optional padding spaces), which
# only the python parser supports; naming the engine avoids the fallback
# ParserWarning and matches the bensby read further down.
amarsi19 = pd.read_csv(surp.DATA_DIR + "amarsi_19.tsv", delimiter="\t *", comment="#",
                       skiprows=[62, 63], engine="python")

# Adopted values: [Fe/H] from the "3L" columns, [O/H] and [C/H] from "3N".
amarsi19["FE_H"] = amarsi19["[Fe/H]3L"]
amarsi19["FE_H_err"] = amarsi19["e_[Fe/H]3L"]

amarsi19["O_H"] = amarsi19["[O/H]3N"]
amarsi19["O_H_err"] = amarsi19["e_[O/H]3N"]

amarsi19["C_H"] = amarsi19["[C/H]3N"]
amarsi19["C_H_err"] = amarsi19["e_[C/H]3N"]

# Errors on ratios are the plain sum of the two input errors.
amarsi19["C_O"] = amarsi19["C_H"] - amarsi19["O_H"] 
amarsi19["C_O_err"] = amarsi19["O_H_err"] + amarsi19["C_H_err"] 

amarsi19["O_FE"] = amarsi19["O_H"] - amarsi19["[Fe/H]3L"]
amarsi19["O_FE_err"] = amarsi19["e_[Fe/H]3L"] + amarsi19["O_H_err"]

# Row 151 is set aside and removed from the sample
# (presumably the solar reference entry, going by the variable name; confirm).
amarsi19_sun = amarsi19.iloc[151]
amarsi19 = amarsi19.drop(index=151)

In [None]:
plot_sample(amarsi19)

In [None]:
for pop in amarsi19.Pop.unique():
    filt = amarsi19.Pop == pop
    plot_abund_errs(amarsi19[filt], label=pop)

arya.Legend(-1)


In [None]:
# [C/O] against [O/Fe], one colour per population flag.
for pop in amarsi19.Pop.unique():
    filt = amarsi19.Pop == pop
    plt.scatter(amarsi19[filt].O_FE, amarsi19[filt]["C_O"], label=pop)

arya.Legend(-1)
plt.xlabel("[O/Fe]")
# bracket notation, consistent with the other axis labels in this notebook
plt.ylabel("[C/O]")

### NLTE vs LTE

while e.g. Fe is stable, C requires up to -0.2 dex corrections at low metallicities.

In [None]:
# The plotted difference is [Fe/H]1L - [Fe/H]3L, so the label follows that order.
plt.errorbar(amarsi19.FE_H, amarsi19["[Fe/H]1L"] - amarsi19["[Fe/H]3L"], yerr=amarsi19["e_[Fe/H]3L"], fmt="o", capsize=0)
plt.xlabel("[Fe/H]")
plt.ylabel("1D LTE - 3D LTE")

In [None]:
plt.errorbar(amarsi19.C_H, amarsi19["[C/H]1L"] - amarsi19["[C/H]3N"], yerr=amarsi19["e_[C/H]3N"], fmt="o", capsize=0)
plt.xlabel("[C/H]")
plt.ylabel("1D LTE - 3D NLTE")

In [None]:
plt.errorbar(amarsi19.O_H, amarsi19["[O/H]1L"] - amarsi19["[O/H]3N"], yerr=amarsi19["e_[O/H]3N"], fmt="o", capsize=0)
plt.xlabel("[O/H]")
plt.ylabel("1D LTE - 3D NLTE")

In [None]:
amarsi19.to_csv("amarsi19_cleaned.csv")

## Zhao et al. 2016
https://ui.adsabs.harvard.edu/abs/2016ApJ...833..225Z/abstract


NLTE 1D abundances. MW thin, thick, and halo stars.
- 51 nearby stars
- Lick 3 m telescope, R~60k over 3700-9300Å, S/N > 100 for most stars.
- about 10 C lines, most are around 9000Å.
- NonLTE - LTE for C is ~0 at 5052Å, but are  ~ -0.1dex  at 9111Å. not correlated with Fe.
- Also includes CH and C2 bands


In [None]:
# Regex separator (tab plus optional spaces) needs the python parser; naming
# the engine avoids the fallback ParserWarning.
z16_long = pd.read_csv(DATA_DIR + "zhao+2016.tsv", delimiter="\t[ ]*", comment="#", engine="python")

In [None]:
z16_long

In [None]:
z16 = z16_long.pivot_table(index='Name', columns='Species', 
    values=['[X/H]LTE', 'e_[X/H]LTE', '[X/Fe]LTE', '[X/H]NLTE', 'e_[X/H]NLTE','[X/Fe]NLTE'], aggfunc='first')


In [None]:
# Flatten the (quantity, species) column pairs produced by the pivot: put the
# species in place of the "X" placeholder and strip the "NLTE" tag, so NLTE
# values become the untagged columns (e.g. "[C I/H]") while LTE columns keep
# their "LTE" suffix.
flat_names = []
for quantity, species in z16.columns.values:
    flat_names.append(quantity.replace("X", species.strip()).replace("NLTE", ""))
z16.columns = flat_names

In [None]:
z16 = pd.merge(z16_long[["Name", "Teff", "logg", "[Fe/H]", "xi"]].drop_duplicates(), z16, on='Name')

In [None]:
z16.columns.values

In [None]:
plt.scatter(z16.Teff, z16.logg)
plt.xlabel("Teff")
plt.ylabel("logg")

In [None]:
# Adopt the untagged (NLTE) columns as the working abundances.
z16["C_H"] = z16["[C I/H]"] 
z16["C_H_err"] = z16["e_[C I/H]"] 


z16["C_O"] = z16["[C I/H]"] - z16["[O I/H]"]
# Errors on a difference add. Subtracting them gave zero or negative values,
# which plot_abund_errs then silently drops.
z16["C_O_err"] = z16["e_[C I/H]"] + z16["e_[O I/H]"]

z16["O_H"] = z16["[O I/H]"]
z16["O_H_err"] = z16["e_[O I/H]"]

z16["FE_H"] = z16["[Fe I/H]"]
z16["FE_H_err"] = z16["e_[Fe I/H]"]

z16["O_FE"] = z16["[O I/Fe]"]
z16["O_FE_err"] = z16["e_[O I/H]"] + z16["e_[Fe I/H]"]

In [None]:
# LTE minus NLTE carbon abundance; this sample is described above as 1D NLTE,
# so the label no longer says "3D".
plt.errorbar(z16.C_H, z16["[C I/H]LTE"] - z16.C_H, yerr=z16.C_H_err, fmt="o", capsize=0)
plt.xlabel("[C/H]")
plt.ylabel("1D LTE - 1D NLTE")

In [None]:
plot_sample(z16)

In [None]:
plt.errorbar(z16.C_H, z16["[C I/H]LTE"] - z16.C_H, yerr=z16.C_H_err, fmt="o", capsize=0)
plt.xlabel("[C/H]")
plt.ylabel(" LTE - NLTE")

In [None]:
plt.errorbar(z16.C_H, z16["[CH/Fe]LTE"] - z16.C_H + z16.FE_H, yerr=z16.C_H_err, fmt="o", capsize=0)
plt.xlabel("[C/H]")
plt.ylabel("CH - C I")

In [None]:
# LTE minus NLTE oxygen abundance; this sample is described above as 1D NLTE,
# so the label no longer says "3D".
plt.errorbar(z16.O_H, z16["[O I/H]LTE"] - z16.O_H, yerr=z16.O_H_err, fmt="o", capsize=0)
plt.xlabel("[O/H]")
plt.ylabel("1D LTE - 1D NLTE")

In [None]:
z16.to_csv("zhao+16_cleaned.csv")

## Combined

In [None]:
plot_abund_errs(z16)
plot_abund_errs(amarsi19)

## Extra

### bedell 18
solar twins sample. https://ui.adsabs.harvard.edu/abs/2018ApJ...865...68B/abstract.
Doesn't help with global trends, since all the stars are nearly identical.


In [None]:
# " +" is a regex separator, which only the python parser supports; naming
# the engine avoids the fallback ParserWarning.
b18 = pd.read_csv(DATA_DIR + "bedell18.dat", delimiter=" +", comment="#", engine="python")

In [None]:
plt.scatter(b18["[OI/H]"], b18["[CI/H]"] - b18["[OI/H]"])
plt.xlabel("[O/H]")
plt.ylabel("[C/O]")

### Bensby et al. (2019, 2021)
The galactic bulge sample

In [None]:
bensby21 = pd.read_csv(DATA_DIR + "bensby21.tsv", delimiter="\t *", comment="#",engine='python')
bensby19 = pd.read_csv(DATA_DIR + "bensby19.tsv", delimiter="\t *", comment="#",engine='python')

bensby = bensby21.set_index("Name").join(bensby19.set_index("Name"),rsuffix="_b19", how="left")

bensby["C_O"] = bensby["[C/H]"] - bensby["[O/H]"]
bensby["C_O_err"] = bensby["e_[C/O]"]

bensby["O_FE"] = bensby["[O/H]"] - bensby["[Fe/H]"]
bensby["O_FE_err"] = bensby["e_[O/H]"] + bensby["e_[Fe/H]"]

bensby["O_H"] = bensby["[O/H]"]
bensby["O_H_err"] = bensby["e_[O/H]"]

bensby["FE_H"] = bensby["[Fe/H]"]
bensby["FE_H_err"] = bensby["e_[Fe/H]"]


In [None]:
plot_sample(bensby, alpha=0.2)

### Fabbian et al. 2009
Rehashed in Amarsi

In [None]:
# raw string: "\s" in a normal string literal is an invalid escape sequence
F09 = pd.read_csv(DATA_DIR + "Fabbian09.csv", sep=r"\s+")

# RL

work from
- Esteban et al . (2002, 2009, 2014)
- García-Rojas et al. (2007)
- López-Sánchez et al. (2007)
- Mendez-Delgado et al. 2022
- Skillman et al. (2020)
- Toribio San Cipriano et al. 2016, 2017.
- Peimbert et al. 2005


Other
- Peimbert 2003 does 30 Dor in the LMC (like TSC)
- García-Rojas 2003, 2004, 2006 (galactic regions)
- Tsamis et al. 2003. (galactic and LMC, covered by more modern studies)



## Esteban + 09

In [None]:
# Load the Esteban+09 table and derive bracket abundances from its
# eps_o / eps_c columns; ratio errors are the sum of the input errors.
E09 = pd.read_csv(DATA_DIR +  "nearby_RL/esteban+09.tsv", comment="#", sep=r"\t")
E09["O_H"] = gcem.eps_to_brak(E09["eps_o"], "O")
E09["O_H_err"] = E09["o_err"]
E09["C_H"] = gcem.eps_to_brak(E09["eps_c"], "C")
E09["C_H_err"] = E09["c_err"]
E09["C_O"] = E09["C_H"] - E09["O_H"]
E09["C_O_err"] = E09["C_H_err"] + E09["O_H_err"]
E09["study"] = "esteban+09"
E09

In [None]:
E09.sort_values("eps_o")

VS-24 and VS-38 have large abundance discrepancies from Garnett measurements, so I believe this is why they are excluded?

In [None]:
plot_abund_errs(E09.loc[~np.isin(E09.region, ["VS-24", "VS-38"])])

Reproduced except for NGC 5447 which I believe they take the average of t^2 > 0 and t^2 = 0 values for oxygen whereas I just use t^2 > 0.

## Esteban + 2014

In [None]:
# Load the Esteban+14 table and derive bracket abundances from its
# eps_o / eps_c columns; ratio errors are the sum of the input errors.
E14 = pd.read_csv(DATA_DIR +  "nearby_RL/esteban+14.tsv", comment="#", sep=r"\t")
E14["O_H"] = gcem.eps_to_brak(E14["eps_o"], "O")
E14["O_H_err"] = E14["o_err"]
E14["C_H"] = gcem.eps_to_brak(E14["eps_c"], "C")
E14["C_H_err"] = E14["c_err"]
E14["C_O"] = E14["C_H"] - E14["O_H"]
E14["C_O_err"] = E14["C_H_err"] + E14["O_H_err"]
E14["study"] = "esteban+14"
E14

In [None]:
plot_abund_errs(E14)
plt.scatter(E14.O_H, E14.C_O)

Verified. Note that only the ones with errors match. I believe they also take the average of t^2=0 and t^2>0 values, but all of these detections are very weak and the oxygen abundance is low, so it is reasonable to exclude these objects.

## Mendez-Delgado et al. 2022
MW HII regions

In [None]:
def read_md22():
    """Load the Mendez-Delgado+22 table and return bracket-notation abundances.

    The ``O_H``, ``C_H`` and ``N_H`` columns of the CSV are passed through
    ``gcem.eps_to_brak``; abundance ratios are differences of the converted
    values and their errors are sums of the corresponding input errors.
    The region name and ``R_g`` columns are carried over unchanged.
    """
    raw = pd.read_csv(DATA_DIR +  "nearby_RL/mendez-delgado+22.csv", comment="#")

    out = pd.DataFrame()
    out["O_H"] = gcem.eps_to_brak(raw["O_H"], "o")
    out["[c/h]"] = gcem.eps_to_brak(raw["C_H"], "c")
    out["[n/h]"] = gcem.eps_to_brak(raw["N_H"], "n")

    # ratios from the converted abundances
    out["[c/n]"] = out["[c/h]"] - out["[n/h]"]
    out["C_O"] = out["[c/h]"] - out["O_H"]
    out["[n/o]"] = out["[n/h]"] - out["O_H"]

    # uncertainties: linear sum of the two contributing errors
    out["O_H_err"] = raw["O_H_err"]
    out["C_O_err"] = raw["C_H_err"] + raw["O_H_err"]
    out["[n/o]_err"] = raw["N_H_err"] + raw["O_H_err"]
    out["[c/n]_err"] = raw["C_H_err"] + raw["N_H_err"]

    out["region"] = raw["Region"]
    out["R_g"] = raw["R_g"]

    out.name = "Milkyway"
    return out
md22 = read_md22()
md22a = pd.read_csv(DATA_DIR + "nearby_RL/mendez-delgado+22.csv", comment="#")
md22["study"] = "mendez-delgado+22"
md22["galaxy"] = "MW"

In [None]:
md22

In [None]:
plt.errorbar(md22a.R_g, md22a.C_H, yerr=md22a.C_H_err, fmt="o")
plt.xlabel("R")
plt.ylabel("eps C")

In [None]:
plt.errorbar(md22a.R_g, md22a.O_H, yerr=md22a.O_H_err, fmt="o")
plt.xlabel("R")
plt.ylabel("eps(O)")

Verified!

In [None]:
plot_abund_errs(md22)

## Skillman et al. 2020
M101 data from CHAOS
Recombination lines

LBT
CII λ4267 with ionization corrections

In [None]:
def read_skillman20():
    """Load the Skillman+20 M101 table and return bracket-notation abundances.

    ``O_H`` goes through ``gcem.eps_to_brak`` and the ``C_O`` / ``C_N``
    columns through ``gcem.log_to_brak``; [N/O] is derived as
    [C/O] - [C/N]. Each row is tagged with galaxy "M101" and study
    "skillman+20".
    """
    df = pd.read_csv(DATA_DIR + "nearby_RL/skillman+20.tsv", sep="\t")
    df1 = pd.DataFrame()
    df1["O_H"] = gcem.eps_to_brak(df["O_H"], "o")
    df1["C_O"] = gcem.log_to_brak(df["C_O"], "c", "o")
    df1["[c/n]"] = gcem.log_to_brak(df["C_N"], "c", "n")
    df1["[n/o]"] = df1["C_O"] - df1["[c/n]"]

    df1["O_H_err"] = df["O_H_err"]
    df1["C_O_err"] = df["C_O_err"] 
    # The conversion to bracket notation is applied to a logarithmic ratio,
    # so the error in dex carries over as-is, exactly as for C_O_err above.
    # The previous 12/14 scaling of this error was dropped.
    df1["[c/n]_err"] = df["C_N_err"]
    df1["[n/o]_err"] = df["C_O_err"] + df["C_N_err"]

    df1["galaxy"] ="M101"
    df1["region"] = df.region
    df1["study"] = "skillman+20"
    return df1

skillman20 = read_skillman20()

In [None]:
plot_abund_errs(skillman20)

Verified

##  Toribio San Cipriano et al. (2016)
NGC 300 and M33 abundances using RL on UVES

In [None]:
tsc16 = pd.read_csv(f"{DATA_DIR}/nearby_RL/TSC16.tsv", sep="\t", comment="#")

In [None]:
tsc16.columns

In [None]:
tsc16["O_H"] = gcem.eps_to_brak(tsc16.eps_o, "o")
tsc16["O_H_err"] = tsc16.eps_o_err
tsc16["C_O"] = gcem.log_to_brak(tsc16.log_c_o, "c", "o")
tsc16["C_O_err"] = tsc16.log_c_o_err
tsc16["study"] = "toribo-san-cipriano+16"

In [None]:
tsc16

In [None]:
plot_abund_errs(tsc16)

Verified!

##  Toribio San Cipriano et al. (2017)
LMC and SMC abundances using RL, 5 and 4 HII regions respectively.
Compares to CEL lines
- C II 4267.15
- ~ 7 \[O II\] lines

In [None]:
tsc17 = pd.read_csv(f"{DATA_DIR}/nearby_RL/TSC17.tsv", sep="\t", comment="#")

In [None]:
# Primary values: eps_o and log_c_o converted to bracket notation.
tsc17["O_H"] = gcem.eps_to_brak(tsc17.eps_o, "o")
tsc17["O_H_err"] = tsc17.eps_o_err
tsc17["C_O"] = gcem.log_to_brak(tsc17.log_c_o, "c", "o")
tsc17["C_O_err"] = tsc17.log_c_o_err

# Second C/O estimate from the `log_c_o_cel` column (the CEL-based value, going by the column name).
# NOTE(review): its uncertainty reuses `log_c_o_err` rather than a CEL-specific
# error column. Confirm this is intended.
tsc17["C_O_2"] = gcem.log_to_brak(tsc17.log_c_o_cel, "c", "o")
tsc17["C_O_2_err"] = tsc17.log_c_o_err
tsc17["study"] = "toribo-san-cipriano+17"

In [None]:
tsc17

In [None]:
gcem.log_to_brak(-0.5, "c", "o")

In [None]:
gcem.log_to_abundance(-3.2, "o")


In [None]:
gcem.eps_to_brak(8.2, "o")

In [None]:
plot_abund_errs(tsc17)

Verified!

In [None]:
plt.scatter(tsc17.O_H, tsc17.C_O - tsc17.C_O_2)
plt.xlabel("[O/H]")
plt.ylabel("change in [C/O] from RL-CEL")

## Misc

In [None]:
# Parse every .toml file in the RL directory whose name does not start with
# an underscore, keyed by the file name without its extension.
rl_raw = {}

dirname = "../../data/nearby_RL/"
for filename in os.listdir(dirname):
    if filename.startswith("_") or not filename.endswith(".toml"):
        continue

    print(filename)
    name = os.path.splitext(filename)[0]
    with open(dirname + filename, "r") as f:
        rl_raw[name] = toml.load(f)


In [None]:
rl_raw

In [None]:
# One frame per study (its "regions" table, tagged with the study name),
# concatenated once at the end rather than grown onto an empty frame.
# The `cols` list that used to open this cell named DLA columns and was
# never used here, so it has been removed.
rl_frames = []
for study, attrs in rl_raw.items():
    print(study)
    series = pd.DataFrame(attrs["regions"])
    series["study"] = study
    rl_frames.append(series)

rl = pd.concat(rl_frames, ignore_index=True, axis=0) if rl_frames else pd.DataFrame()


In [None]:
rl

In [None]:
rl["C_O"] = gcem.log_to_brak(rl.eps_C - rl.eps_O, "c", "o")
rl["C_O_err"] = rl.eps_C_err + rl.eps_O_err
rl["O_H"] = gcem.eps_to_brak(rl.eps_O, "o")
rl["O_H_err"] = rl.eps_O_err

In [None]:
df = rl.loc[rl.study == "esteban+02"]
plt.scatter(df.O_H, df.C_O)
plot_abund_errs(df)


verified

In [None]:
for label in rl.study.unique():
    df = rl.loc[rl.study == label]

    plot_abund_errs(df, label=label)
    
arya.Legend(loc=-1)

## All

In [None]:
tsc17["class"] = "magellanic"
tsc16["class"] = "spiral"

md22["class"] = "MW"
skillman20["class"] = "spiral"

In [None]:
RLs = pd.concat([rl, md22, skillman20, tsc16, tsc17, E14, E09])
RL_err = calc_errs(RLs)

In [None]:
skillman20

In [None]:
RL_clean = RLs[~np.isnan(RLs.C_O_err)]

In [None]:
for index, group in RL_clean.groupby(["galaxy", "region"]):
    if len(group) > 1:
        plot_abund_errs(group, label=index, fmt=".-")
        plt.scatter(group.O_H, group.C_O, s=1+ 3*np.arange(len(group)))
        print(index)
        print(group.study)

arya.Legend(-1)

In [None]:
# One curve per galaxy, ordered by [O/H]. Rows without a galaxy name are
# skipped: comparing against NaN selects nothing and .iloc[0] would raise.
for galaxy in RL_clean.galaxy.dropna().unique():
    df = RL_clean.loc[RL_clean.galaxy == galaxy].sort_values("O_H")

    if ((df["class"].iloc[0]) in ["magellanic", "spiral", "MW"]):
        fmt = "o-"
        label=galaxy
    else:
        # unclassified galaxies: black triangles, no legend entry
        fmt = "k^-"
        label=""
        print(df["class"].iloc[0], df["galaxy"].iloc[0], df["C_O_err"].iloc[-1])
    plot_abund_errs(df, fmt=fmt, label=label)
arya.Legend(-1)

In [None]:
for study in RL_clean.study.unique():
    df = RL_clean.loc[RL_clean.study == study]

  
    plot_abund_errs(df,label=study)
arya.Legend(-1)

## All

In [None]:
# RLs already contains skillman20, md22, tsc16 and tsc17 from the concat that
# first built it. Appending them again duplicated those rows in the combined
# CSV and grew the frame on every re-run, so only the index is renumbered.
RLs = RLs.reset_index(drop=True)


In [None]:
plot_sample_err(RLs, RL_err)

In [None]:
RLs.to_csv("RL_combined.csv")

# CELs


- berg 2016, 2019
- Peña-Guerrero et al. 2017 (DUST CORRECTIONS?! also look at GCE discussion)
- Leitherer ? 

## Peña-Guerrero+17

Notes

In [None]:
pg17 = pd.read_csv(DATA_DIR + "nearby_CEL/pena-guerreno+17.csv", sep="\\s+", comment="#")

In [None]:
# Bracket abundances from the eps_o / eps_c columns; the [C/O] error is the
# sum of the two input errors.
pg17["O_H"] = gcem.eps_to_brak(pg17["eps_o"], "O")
pg17["O_H_err"] = pg17["eps_o_err"]
pg17["C_H"] = gcem.eps_to_brak(pg17["eps_c"], "C")
pg17["C_H_err"] = pg17["eps_c_err"]
pg17["C_O"] = pg17["C_H"] - pg17["O_H"]
pg17["C_O_err"] = pg17["C_H_err"] + pg17["O_H_err"]
pg17["study"] = "pena-guerreno+17"

In [None]:
pg17

In [None]:
plot_abund_errs(pg17)

verified

In [None]:
# Parse every .toml file in the CEL directory whose name does not start with
# an underscore, keyed by the file name without its extension.
cel_raw = {}

dirname = "../../data/nearby_CEL/"
for filename in os.listdir(dirname):
    if filename.startswith("_") or not filename.endswith(".toml"):
        continue

    print(filename)
    name = os.path.splitext(filename)[0]
    with open(dirname + filename, "r") as f:
        cel_raw[name] = toml.load(f)


In [None]:
# One frame per study (its "regions" table, tagged with the study name),
# concatenated once at the end rather than grown onto an empty frame.
cel_frames = []
for study, attrs in cel_raw.items():
    print(study)
    series = pd.DataFrame(attrs["regions"])
    series["study"] = study
    cel_frames.append(series)

CEL = pd.concat(cel_frames, ignore_index=True, axis=0) if cel_frames else pd.DataFrame()


In [None]:
CEL.loc[~np.isnan(CEL.log_O_H), "eps_O"] = 12 + CEL.log_O_H
CEL.loc[~np.isnan(CEL.log_O_H), "eps_O_err"] = CEL.log_O_H_err
CEL

In [None]:
CEL["C_O"] = gcem.log_to_brak(CEL.log_C_O, "c", "o")
CEL["C_O_err"] = CEL.log_C_O_err
CEL["O_H"] = gcem.eps_to_brak(CEL.eps_O, "o")
CEL["O_H_err"] = CEL.eps_O_err

In [None]:
CEL

In [None]:
for label in CEL.study.unique():
    df = CEL.loc[CEL.study == label]

    plot_abund_errs(df, label=label)
    
arya.Legend(loc=-1)

In [None]:
for label in CEL.study.unique():
    df = CEL.loc[CEL.study == label]

    plot_abund_errs(df, label=label)
    plt.title(label)
    plt.show()


Verified
- garnett 1999
- garnett 1995
- izotov+thuan
- no plot for senchyna

In [None]:
cols = ["galaxy", "log_C_O", "log_C_O_err", "eps_O", "eps_O_err"]
min_cols = cols

In [None]:
# Collect every study whose table carries all of `min_cols` into one frame.
# NOTE(review): `local_dwarfs_raw` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel.
# NOTE(review): `toml_to_df` is defined above with a single parameter but is
# called here with two arguments.
dwarfs_misc = pd.DataFrame()


for study, attrs in local_dwarfs_raw.items():
    
    if type(attrs) is dict and contains_min_cols(attrs, min_cols):
        series = toml_to_df(attrs, cols)
        series["study"] = study
        dwarfs_misc = pd.concat([dwarfs_misc, series], ignore_index=True, axis=0)
    else:
        print("warning, incomplete data for", study)

In [None]:
dwarfs_misc["C_O"] = gcem.log_to_brak(dwarfs_misc.log_C_O, "c", "o")
dwarfs_misc["O_H"] = gcem.eps_to_brak(dwarfs_misc.eps_O, "o")
dwarfs_misc["C_O_err"] = dwarfs_misc.log_C_O_err
dwarfs_misc["O_H_err"] = dwarfs_misc.eps_O_err

In [None]:
plot_abund_errs(dwarfs_misc)

## Berg et al. 2019

CEL using UV spectroscopy from HST


In [None]:
berg19_oh = [7.738564437848462, 7.6632886767478245, 7.747268460952228, 7.750820263928443, 7.713715855761903, 7.589795836508764, 7.6974589868035785, 7.645978901768882, 7.6295676065942, 7.544970117524367, 7.612033211664193, 7.583534555372829, 7.72890577995267, 7.697318599334163, 7.712185632345273, 7.868184188359874, 7.91372588343829, 7.910117925474309, 8.026962416268903]
berg19_co = [-0.3746933819064967, -0.48816029143898, -0.5211171827565271, -0.5504310868245295, -0.5995992714025501, -0.6620036429872496, -0.7178384942319369, -0.7848573163327262, -0.8006435944140862, -0.8836429872495446, -0.9455737704918032, -1.038554948391014, -1.0339647844565878, -0.7561748633879781, -0.7804857316332725, -0.655397692774742, -0.5993321190042502, -0.7889981785063751, -0.8897632058287797]

In [None]:
surp.set_yields()

In [None]:
berg19 = pd.read_csv(DATA_DIR + "nearby_CEL/berg19.csv")
# Drop the last row of the file. The explicit copy makes the column
# assignments below write to an independent frame, not to a slice
# (avoids SettingWithCopyWarning).
berg19 = berg19.iloc[:-1].copy()
berg19["C_O"] = gcem.log_to_brak(berg19.log_c_o, "c", "o")
berg19["O_H"] = gcem.eps_to_brak(berg19.eps_o, "o")
berg19["C_O_err"] = berg19.log_c_o_err
berg19["O_H_err"] = berg19.eps_o_err

In [None]:
# Overlay the hard-coded berg19_oh / berg19_co lists (shifted by the
# correction constants and an extra 0.15 in x) on the values derived from the table.
berg19_x_check = np.asarray(berg19_oh) + eps_o_correction - 0.15
berg19_y_check = np.asarray(berg19_co) + c_o_correction
plt.scatter(berg19_x_check, berg19_y_check, color=COLORS[1], zorder=3)
plot_abund_errs(berg19)

## Berg 2016

Carbon lines
- C IV 1548.19 (3)
- C IV 1550.77 (3)
- C III] 1906.68 (all - 1)
- C III 1908.73 (all)
about 8 O lines, and 4 H lines. No Fe.

In [None]:
berg16 = pd.read_csv(DATA_DIR + "nearby_CEL/berg16.csv")
berg16["C_O"] = gcem.log_to_brak(berg16.log_c_o, "c", "o")
berg16["O_H"] = gcem.eps_to_brak(berg16.eps_o, "o")
berg16["C_O_err"] = berg16.log_c_o_err
berg16["O_H_err"] = berg16.eps_o_err

In [None]:
# NOTE(review): both lists are reassigned in the next cell before they are
# used, so on a top-to-bottom run these values have no effect.
berg16_eps_o = [7.45492559464121, 7.45492559464121, 7.446516385223216, 7.738760980305644, 7.812141510569171, 7.844879868436886, 7.91093217279692]
berg16_co = [-0.44879174256223436, -0.5934790528233151, -0.6725804493017608, -0.8203400121432909, -0.6636915604128719, -0.6061323618700668, -0.6431086824529447]

In [None]:
berg16_eps_o = [7.3688668986515875, 7.451109177903437, 7.4524684645498045, 7.732209656372336, 7.901859504132231, 7.808971291866029, 7.841028708133972]
berg16_co = [-0.2890780583536705, -0.45288545792930646, -0.7084163112195758, -0.8311306476258066, -0.7682472897913665, -0.6335624126469162, -0.5935121552042524]

In [None]:
plot_abund_errs(berg16)
plt.scatter(berg16_eps_o + eps_o_correction - 0.14, berg16_co + c_o_correction, color=COLORS[1], zorder=3)


## All

In [None]:
berg16["study"] = "berg+16"
berg19["study"] = "berg+19"

In [None]:
dwarfs = pd.concat([CEL, berg16, berg19, pg17, dwarfs_misc], ignore_index=True)
dwarf_err = calc_errs(dwarfs)

In [None]:
dwarfs["region"]

In [None]:
dwarfs.study.unique()

In [None]:
filt = pd.isna(dwarfs.galaxy)
dwarfs.loc[filt, "galaxy"] = dwarfs.loc[filt, "region"]
filt = pd.isna(dwarfs.galaxy)
dwarfs.loc[filt, "galaxy"] = dwarfs.loc[filt, "name"]


In [None]:
dwarfs.loc[dwarfs.galaxy.duplicated(keep=False), ["region", "galaxy", "study", "C_O", "O_H"]]

the only true duplicates are two regions in garnett+1995 covered by izotov+thuan 1999

In [None]:
# NOTE(review): rows are removed by hard-coded index label. The labels follow
# the row order of `dwarfs`, which starts with CEL, itself built from an
# os.listdir() result whose order is not guaranteed. Re-check labels 3 and 5
# against the duplicate listing in the previous cell whenever the data change.
dwarfs_cleaned = dwarfs.drop([3, 5])

WARNING:
the rows above are dropped manually by index (it was easier), so check that they are still the correct ones.

In [None]:
dwarfs_cleaned[dwarfs_cleaned.galaxy.duplicated(keep=False)].study

In [None]:
np.sum(np.isnan(dwarfs_cleaned.C_O_err))

In [None]:
for study in dwarfs_cleaned.study.unique():
    df = dwarfs_cleaned[dwarfs_cleaned.study == study]
    plot_abund_errs(df, label=study)

arya.Legend(-1)

In [None]:
dwarfs_cleaned

In [None]:
RL_clean

In [None]:
plot_sample_err(dwarfs_cleaned, dwarf_err)
plot_sample_err(RL_clean, RL_err, color=arya.COLORS[1])

# TODO: galaxy-by-galaxy comparison with CEL / RL

In [None]:
# NOTE(review): this writes `dwarfs`, which still contains the two rows that
# were removed to make `dwarfs_cleaned`. Confirm the duplicates are meant to
# be in the CSV.
dwarfs.to_csv("dwarfs_combined.csv")

# DLA
- Mostly as compiled in Cook et al. 2017.

In [None]:
import os

In [None]:
# Parse every .toml file in the DLA directory whose name does not start with
# an underscore, keyed by the file name without its extension.
dla_raw = {}

dirname = "../../data/DLA/"
for filename in os.listdir(dirname):
    if filename.startswith("_") or not filename.endswith(".toml"):
        continue

    print(filename)
    name = os.path.splitext(filename)[0]
    with open(dirname + filename, "r") as f:
        dla_raw[name] = toml.load(f)


In [None]:
# Column names used by the DLA tables; the list itself is not referenced in
# the cells that follow.
cols = ["galaxy", "redshift", "log_HI", "log_HI_err", "log_CII", "log_CII_err", "log_OI", "log_OI_err"]

# One frame per study (its "galaxies" table, tagged with the study name),
# concatenated once at the end rather than grown onto an empty frame.
dla_frames = []
for study, attrs in dla_raw.items():
    print(study)
    series = pd.DataFrame(attrs["galaxies"])
    series["study"] = study
    dla_frames.append(series)

dla = pd.concat(dla_frames, ignore_index=True, axis=0) if dla_frames else pd.DataFrame()


In [None]:
dla.columns

In [None]:
# [C/O] from the difference of the log_CII and log_OI columns.
# Errors are combined in quadrature here, whereas the other samples in this
# notebook add them linearly.
dla["C_O"] = gcem.log_to_brak(dla.log_CII - dla.log_OI, "c", "o")
dla["C_O_err"] = np.sqrt(dla.log_OI_err**2 + dla.log_CII_err**2)

# [O/H] from 12 + log_OI - log_HI, again with quadrature errors.
dla["O_H"] = gcem.eps_to_brak(12 + dla.log_OI - dla.log_HI, "o")
dla["O_H_err"] = np.sqrt(dla.log_OI_err**2 + dla.log_HI_err**2)

flagged 
- HS 0105 (C/O); okay?
- Q0913+072	 (C/O + err; good)
- Q1202+3235	(C/o; IC only)

In [None]:
c_o_correction

In [None]:
# Rebuild the comparison table from the studies listed here.
cooke17_rep= dla[np.isin(dla.study, ["omera+01", "ellison+10", "cooke+11", "cooke+14", "cooke+15", "cooke+17", "dutta+14",
                                     "dessauges-zavadsky+03", "pettini+08", "srianand+10", "morrison+16"])]
# Positional order sorted by the first run of digits in each galaxy name, read as an integer.
idx = np.argsort((np.int64(cooke17_rep.galaxy.str.extract("(\\d+)")))[:, 0])
# Leave out these two galaxies.
idx = idx[~np.isin(cooke17_rep.galaxy.iloc[idx], ["Q2059-360", "Q1101-264"])]
idx = idx[1:] # remove duplicate first galaxy
# Apply the ordering and keep only the columns needed for the comparison.
cooke17_rep = cooke17_rep.iloc[idx][["galaxy", "redshift", "C_O", "C_O_err", "O_H", "O_H_err", "study", "log_CII"]]

cooke17_rep

In [None]:

dla.galaxy[dla.galaxy.duplicated()]

In [None]:
# remove duplicates

filt = ~((dla.galaxy == "J0035-0918") & (dla.study != "welsh+20"))
dla = dla[filt]

In [None]:
np.sum(dla.galaxy.duplicated())

In [None]:
np.sort(dla.galaxy)

In [None]:
dla[["galaxy", "study", "C_O", "O_H"]]

The table is verified. The main difference is cooke does apply the ionization correction to morrison+16; and cooke does not include the first two galaxies in 

In [None]:
def format_with_errors(value, error, precision):
    r"""
    Format a value and its error with LaTeX formatting and rounding precision.

    Parameters:
    - value (float): The value to format.
    - error (float): The corresponding error.
    - precision (int): The number of decimal places to round to.

    Returns:
    - str: A string formatted with LaTeX math mode including $\pm$.
    """
    # "\p" is not a valid escape sequence, so the backslash is written out
    # explicitly (and the docstring is raw); the output is unchanged.
    return f"${value:.{precision}f} \\pm {error:.{precision}f}$"

In [None]:
def dataframe_to_latex_with_errors(df, precisions, caption='', label='', index=False):
    """
    Render a DataFrame as a LaTeX table, merging value/error column pairs.

    Every column named ``<name>_err`` whose ``<name>`` column also exists is
    folded into ``<name>`` as a formatted "value plus-minus error" string
    (via ``format_with_errors``) and then removed from the table.

    Parameters:
    - df (pd.DataFrame): The DataFrame to convert; it is not modified.
    - precisions (dict): Decimal places per value column; 2 when not listed.
    - caption (str): Caption for the table; omitted when empty.
    - label (str): LaTeX label for the table; omitted when empty.
    - index (bool): Whether to include the DataFrame's index in the table.

    Returns:
    - str: A ``table`` environment wrapping the tabular, as a string.
    """
    formatted = df.copy()
    err_suffix = '_err'

    for err_col in df.columns:
        if not err_col.endswith(err_suffix):
            continue
        value_col = err_col[:-len(err_suffix)]
        if value_col not in df.columns:
            continue

        ndigits = precisions.get(value_col, 2)
        formatted[value_col] = df.apply(
            lambda row: format_with_errors(row[value_col], row[err_col], ndigits),
            axis=1,
        )
        formatted = formatted.drop(columns=[err_col])

    # escape=False so the math-mode strings reach LaTeX untouched
    tabular = formatted.to_latex(index=index, escape=False)

    pieces = [f"\\begin{{table}}[H]\n\\centering\n{tabular}"]
    if caption:
        pieces.append(f"\\caption{{{caption}}}\n")
    if label:
        pieces.append(f"\\label{{{label}}}\n")
    pieces.append("\\end{table}")

    return "".join(pieces)

In [None]:
lt = dataframe_to_latex_with_errors(dla[["galaxy", "redshift", "C_O_err", "C_O", "O_H_err", "O_H", "study"]], precisions={})

print(lt)

In [None]:
dla[["galaxy", "redshift", "C_O", "C_O_err", "O_H", "O_H_err", "study"]]

In [None]:
ra = cooke17_rep.galaxy.str.extract("(\\d+)")

In [None]:
ra

In [None]:
idx

Verified!!!

In [None]:
plot_abund_errs(cooke17_rep)

1: Cooke et al. (2015); 2: Dutta et al. (2014); 3: Cooke et al. (2014); 4: Ellison et al. (2010); 5: Cooke et al. (2011b); 6: This work; 7: Pettini et al. (2008); 8: Morrison et al. (2016); 9: Srianand et al. (2010); 10: Cooke et al. (2012); 11: Dessauges-Zavadsky et al. (2003).

In [None]:
dla.to_csv("DLA_combined.csv")

In [None]:
dla.study.unique()

# High Z galaxies (TODO)
High z CEL mentioned in berg and jones

In [None]:
datadir = surp.DATA_DIR + "/high_redshift/"

In [None]:
# This cell held an unterminated `toml.load(` call, which is a SyntaxError
# and stops a top-to-bottom run. The high-redshift tables are read with
# `parsetoml` in the cells below.

#### Amorin+2017
Verified

In [None]:
amorin2017_raw = parsetoml(datadir + "amorin+2017")

In [None]:
amorin2017 = toml_to_df(amorin2017_raw)

In [None]:
amorin2017

In [None]:
# x is log10(M*), so the propagated error in dex is sigma_M / (M ln 10);
# the ln 10 factor was missing, which overstated the bars by about 2.3x.
plt.errorbar(np.log10(amorin2017.stellar_mass), amorin2017.eps_o, yerr=amorin2017.eps_o_err, fmt="o",
             xerr=amorin2017.stellar_mass_err / (amorin2017.stellar_mass * np.log(10)))
plt.xlabel("log stellar mass / msun")
plt.ylabel("12 + log (O/H)")

In [None]:
plt.errorbar(amorin2017.eps_o, amorin2017.log_c_o, xerr=amorin2017.eps_o_err, fmt="o", yerr=amorin2017.log_c_o_err)
plt.ylabel("log C/O")
plt.xlabel("12 + log (O/H)")

### Summary

In [None]:
# `high_z` was never defined, so this cell raised NameError. amorin2017 is
# the only high-redshift table loaded in this section and has exactly the
# columns used here, so the combined sample starts from a copy of it.
# NOTE(review): extend this with further high-z studies once they are added.
high_z = amorin2017.copy()

high_z["O_H"] = gcem.eps_to_brak(high_z.eps_o, "o")
high_z["C_O"] = gcem.log_to_brak(high_z.log_c_o, "c", "o")
high_z["O_H_err"] = high_z.eps_o_err
high_z["C_O_err"] = high_z.log_c_o_err

In [None]:
plot_abund_errs(high_z)
plot_abund_errs(berg19)

In [None]:
high_z.to_csv("high_z_cleaned.csv")

# MISC

## FLF

full spectrum fitting to early type galaxies

In [None]:
flf = pd.read_csv(DATA_DIR + "FLF21.csv", comment="#", sep=r"\s+")

In [None]:
# [O/H] = [O/Fe] + [Fe/H] and [C/O] = [C/Fe] - [O/Fe]; each error is the
# sum of the two input errors.
flf["O_H"] = flf["[Fe/H]"] + flf["[O/Fe]"]
flf["O_H_err"] = flf["[Fe/H]_err"] + flf["[O/Fe]_err"]
flf["C_O"] = flf["[C/Fe]"] - flf["[O/Fe]"]
flf["C_O_err"] = flf["[O/Fe]_err"] + flf["[C/Fe]_err"]


In [None]:
flf.name.unique()


In [None]:
for name in flf.name.unique():
    df = flf.loc[flf.name == name]
    plot_abund_errs(df, label=name)

arya.Legend(-1)