In [4]:
# Read data/qc_finalized_data.csv and print its column names.
import pandas as pd
df = pd.read_csv('data/qc_finalized_data.csv')
print(df.columns)

Index(['Line', 'JD Range (days)', 'τJAV (days)', 'τJAV Ref.', 'τcent (days)',
       'τpeak (days)', 'σline (km s-1)', 'FWHM (km s-1)', 'RM Ref(s)',
       'log LAGN,5100 (ergs s-1)', 'L Ref.', 'varname', 'object_name',
       'alternate_names', 'ra', 'dec', 'z', 'dl_mpc', 'da_mpc', 'f_used',
       'mbh_hbeta_only', 'mbh_all_lines', 'mbh_rm_modeling', 'mbh_rm_ref',
       'source_url', 'τJAV (days)_val', 'τJAV (days)_plus',
       'τJAV (days)_minus', 'τcent (days)_val', 'τcent (days)_plus',
       'τcent (days)_minus', 'τpeak (days)_val', 'τpeak (days)_plus',
       'τpeak (days)_minus', 'σline (km s-1)_val', 'σline (km s-1)_plus',
       'σline (km s-1)_minus', 'FWHM (km s-1)_val', 'FWHM (km s-1)_plus',
       'FWHM (km s-1)_minus', 'log LAGN,5100 (ergs s-1)_val',
       'log LAGN,5100 (ergs s-1)_plus', 'log LAGN,5100 (ergs s-1)_minus',
       'mbh_hbeta_only_mantissa', 'mbh_hbeta_only_plus_mant',
       'mbh_hbeta_only_minus_mant', 'mbh_hbeta_only_exp',
       'mbh_hbeta_only_msun_

In [5]:
# Read data/model_dataset_object_level.csv into `df_obj`
# (kept separate from `df`, so the two frames can be compared side by side).
df_obj = pd.read_csv('data/model_dataset_object_level.csv')


In [10]:
# Quick overview of `df`: (rows, columns) and the number of distinct `varname` values.
print(df.shape)
print(df["varname"].nunique(), "\n\n")

# Same for `df_obj`: shape, then the count of missing values per column.
print(df_obj.shape)
print(df_obj.isna().sum(), "\n\n")

# Frequency of each target-source label, followed by the full column list.
print(df_obj["target_hbeta_source"].value_counts())
print(list(df_obj.columns))

(261, 80)
67 


(67, 9)
varname                  0
tau_cent_median          0
sigma_line_median        0
fwhm_median              1
logL_median             20
n_measurements_total     0
n_hbeta_rows             0
target_hbeta_log10       0
target_hbeta_source      0
dtype: int64 


target_hbeta_source
website    67
Name: count, dtype: int64
['varname', 'tau_cent_median', 'sigma_line_median', 'fwhm_median', 'logL_median', 'n_measurements_total', 'n_hbeta_rows', 'target_hbeta_log10', 'target_hbeta_source']


In [11]:
# Re-read the QC file ("utf-8-sig" strips a leading byte-order mark if one is present).
raw = pd.read_csv("data/qc_finalized_data.csv", sep=",", encoding="utf-8-sig")

# Columns needed to compare the original mass string with its parsed
# mantissa / exponent / log10 value and with the modelling target.
cols = [
    "varname", "object_name",
    "mbh_hbeta_only",
    "mbh_hbeta_only_mantissa", "mbh_hbeta_only_exp",
    "mbh_hbeta_only_log10_val",
    "target_hbeta_log10", "target_hbeta_source"
]

# Show the distinct combinations of those columns for the object with varname == 12.
print(raw.loc[raw["varname"] == 12, cols].drop_duplicates())

    varname object_name                                 mbh_hbeta_only  \
46       12     NGC3227  0.484 ( ^+ 0.100 / _- 0.101 ) (10 ^7 M _sun )   

    mbh_hbeta_only_mantissa  mbh_hbeta_only_exp  mbh_hbeta_only_log10_val  \
46                    0.484                 0.0                 -0.315155   

    target_hbeta_log10 target_hbeta_source  
46           -0.315155             website  


In [13]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Input file to patch. NOTE: this rebinds the notebook-level name `df`.
IN_PATH = Path("data/qc_finalized_data.csv")
df = pd.read_csv(IN_PATH, sep=",", encoding="utf-8-sig")

# Column names used by the patch below.
MBH_STR = "mbh_hbeta_only"            # original mass string, e.g. "0.484 ( ... ) (10 ^7 M _sun )"
MANT = "mbh_hbeta_only_mantissa"      # numeric mantissa
EXP  = "mbh_hbeta_only_exp"           # power-of-ten exponent (re-extracted from MBH_STR below)
LOGV = "mbh_hbeta_only_log10_val"     # log10(mantissa) + exponent
TARGET = "target_hbeta_log10"         # target column kept equal to LOGV

# 1) Most reliable: extract exponent specifically from the "(10 ^k ...)" unit block.
#    The caret is optional, so "(10 7" and "(107" are both read as exponent 7.
exp_re_paren = re.compile(r"\(\s*10\s*\^?\s*([+-]?\d+)")

# 2) Backup: standalone token 10, then an optional caret and the exponent.
#    A plain \b10\b is not enough here: "." counts as a word boundary, so it would
#    match the "10" in "1.10" or "0.10" and then read a following number (e.g. an
#    error term "+0.05") as the exponent. The lookbehind rejects a preceding "."
#    as well as any preceding word character.
exp_re_token = re.compile(r"(?<![\w.])10(?!\w)\s*\^?\s*([+-]?\d+)")

def extract_exp(s):
    """Return the power-of-ten exponent found in a mass string, as a float.

    Parameters
    ----------
    s : object
        Typically a string such as "0.484 ( ^+ 0.100 / _- 0.101 ) (10 ^7 M _sun )".
        Non-string values are converted with ``str``; missing values are accepted.

    Returns
    -------
    float
        The exponent (7.0 for the example above), or ``np.nan`` when ``s`` is
        missing or neither pattern matches.
    """
    if pd.isna(s):
        return np.nan
    text = str(s)
    # Prefer the parenthesised unit block; fall back to a bare "10 ^k" token.
    for pattern in (exp_re_paren, exp_re_token):
        m = pattern.search(text)
        if m:
            return float(m.group(1))
    return np.nan

# Update exponent wherever we have an MBH string that contains "(10".
mask_units = df[MBH_STR].notna() & df[MBH_STR].astype(str).str.contains(r"\(\s*10", regex=True, na=False)

# Only overwrite rows where an exponent was actually parsed. Writing the raw result
# back would replace an existing exponent with NaN whenever parsing fails
# (e.g. a "(10 M_sun)" block with no power).
# Assumes df has a unique index (the default RangeIndex from read_csv).
parsed_exp = df.loc[mask_units, MBH_STR].apply(extract_exp).dropna()
if not parsed_exp.empty:
    df.loc[parsed_exp.index, EXP] = parsed_exp.astype(float)

# Recompute log10 mass wherever a positive mantissa and an exponent exist.
# log10 of zero / a negative number gives -inf / NaN (with a RuntimeWarning), which
# would otherwise be written into both the log-mass and the target column.
mantissa = pd.to_numeric(df[MANT], errors="coerce")
exponent = pd.to_numeric(df[EXP], errors="coerce")
mask_recalc = mantissa.gt(0) & exponent.notna()
df.loc[mask_recalc, LOGV] = np.log10(mantissa[mask_recalc]) + exponent[mask_recalc]

# Keep target aligned with corrected Hβ log-mass.
# NOTE(review): this overwrites the target for every recalculated row regardless of
# "target_hbeta_source" — confirm that all targets are meant to come from this value.
df.loc[mask_recalc, TARGET] = df.loc[mask_recalc, LOGV]

# Sanity check for varname=12
print(df.loc[df["varname"] == 12,
             ["varname","object_name",MBH_STR,MANT,EXP,LOGV,TARGET,"target_hbeta_source"]]
      .drop_duplicates())

# Write the patched table to a new file; the input CSV is left untouched.
# NOTE(review): files derived from the unpatched CSV presumably need regenerating — verify.
OUT_PATH = Path("data/qc_finalized_data_patched.csv")
df.to_csv(OUT_PATH, index=False, sep=",", encoding="utf-8-sig")
print("Saved patched file:", OUT_PATH)


    varname object_name                                 mbh_hbeta_only  \
46       12     NGC3227  0.484 ( ^+ 0.100 / _- 0.101 ) (10 ^7 M _sun )   

    mbh_hbeta_only_mantissa  mbh_hbeta_only_exp  mbh_hbeta_only_log10_val  \
46                    0.484                 7.0                  6.684845   

    target_hbeta_log10 target_hbeta_source  
46            6.684845             website  
Saved patched file: data/qc_finalized_data_patched.csv
