In [1]:
# Inventory 2022 auxiliary cruise processor
# Libraries
from core.libs import pd, tqdm, Path, json, re
from core.paths import resolve_inventory_paths

In [None]:
# Inventory 2022 auxiliary cruise processor
from core.libs import pd, tqdm, Path, re

# --- Load the cheatsheet and keep only the rows that point at a usable tree file ---
cheatsheet_path = r"C:\Users\HeyCe\World Tree Technologies Inc\Forest Inventory - Documentos\USA\2022_ForestInventory\8-ForestMetrix_Projects\inventory_us_2022_cheatsheet.csv"
df_cheat = pd.read_csv(cheatsheet_path)

# A row is kept when it has a tree-file path, that path is not the literal
# string "NA", and its `case` value (compared case-insensitively) is not "missing".
has_tree_file = df_cheat["TreeFilePath"].notna()
path_is_not_na_text = df_cheat["TreeFilePath"] != "NA"
case_is_not_missing = df_cheat["case"].str.lower() != "missing"
df_valid = df_cheat.loc[has_tree_file & path_is_not_na_text & case_is_not_missing].copy()

print(f"Archivos válidos a procesar: {len(df_valid)}")

def pick_sheet(excel_file):
    """Return the name of the worksheet to read from an Excel workbook.

    The first sheet whose name, lower-cased and stripped of surrounding
    whitespace, is "input" or "sheet1" wins; when no sheet matches, the
    first sheet of the workbook is returned.

    Parameters
    ----------
    excel_file : anything accepted by ``pd.ExcelFile`` (e.g. a path).

    Returns
    -------
    str
        The sheet name exactly as stored in the workbook (not normalised).

    Raises
    ------
    IndexError
        If the workbook reports no sheets at all.
    """
    # Open the workbook inside a context manager so its file handle is
    # released as soon as the sheet names are read; the caller re-opens the
    # same path right afterwards with pd.read_excel.
    with pd.ExcelFile(excel_file) as workbook:
        sheets = workbook.sheet_names
    for sheet in sheets:
        if sheet.lower().strip() in ("input", "sheet1"):
            return sheet
    return sheets[0]

# Read every valid workbook, tag its rows with where they came from, and
# stack everything into a single DataFrame.
frames = []
progress = tqdm(df_valid.iterrows(), total=len(df_valid), desc="Leyendo archivos")
for _, row in progress:
    # Row lookups stay outside the try block: a missing cheatsheet column
    # should stop the run, not be reported as a per-file read error.
    excel_path = Path(row["TreeFilePath"])
    contract_code = row["contractcode"]
    cruise_date = row["inventory_date"]
    try:
        sheet_df = pd.read_excel(excel_path, sheet_name=pick_sheet(excel_path))
        # Header cells may carry stray whitespace or be non-string values.
        sheet_df.columns = [str(name).strip() for name in sheet_df.columns]
        sheet_df = sheet_df.assign(
            archivo_origen=str(excel_path),
            contractcode=contract_code,
            cruise_date=cruise_date,
        )
        frames.append(sheet_df)
    except Exception as err:
        # Best effort: report the file that failed and keep going.
        print(f"Error en {excel_path}: {err}")

if len(frames) == 0:
    raise Exception("❌ Ningún archivo pudo ser leído correctamente.")

# Stack all files, then discard columns that are empty in every file.
df_all = pd.concat(frames, ignore_index=True).dropna(axis=1, how='all')

print(df_all.head(5))

In [2]:
# --- Export the combined CSV and report its shape ---
output = Path(r"C:\Users\HeyCe\World Tree Technologies Inc\Forest Inventory - Documentos\USA\2022_ForestInventory\inventory_us_2022_concentrado.csv")
df_all.to_csv(output, index=False)

column_names = df_all.columns.tolist()
row_count = df_all.shape[0]
print(f"Columnas: {column_names}")
print(f"Filas: {row_count}")


Columnas: ['Stands::StandNumber', 'Points::PointNumber', 'GradingsSerialNumberAtPoint', 'Tree_ID', 'Points::WT Permanent', 'WT Status', 'WT Species', 'WT Leaf', 'WT Defect', 'WT DHT', 'WT DBH', 'WT THT', 'WT MHT', 'WT Pests', 'jenkinsTotalAgBiomassGREEN', 'GradingsComment', 'Preferences::WorldTreePrefsInventoryType', 'Preferences::WorldTreeUnits', 'Preferences::prefsFileName', 'archivo_origen', 'contractcode', 'cruise_date', 'Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'x', 'x.1', 'x.2', 'x.3', 'x.4', 'x.5', 'x.6', 'x.7', 'x.8', 'x.9', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 41']
Filas: 2850


In [3]:
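# I cleaned the concentrated CSV by hand because it was easier.
# NOTE(review): presumably the hand-cleaned result was saved as
# inventory_us_2022_concentrado2.csv (the file read further below) — confirm;
# this manual step cannot be reproduced from the notebook's code.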

In [5]:
from core.libs import pd
from core.schema_helpers import rename_columns_using_schema
from CruisesProcessor.general_processing import process_inventory_dataframe
from CruisesProcessor.general_importer import prepare_df_for_sql, save_inventory_to_sql
from core.db import get_engine

# 1. Read the clean CSV (text columns only, never *_id columns) and pass it
#    through the schema-based column renaming helper.
df = rename_columns_using_schema(
    pd.read_csv(r"C:\Users\HeyCe\World Tree Technologies Inc\Forest Inventory - Documentos\USA\2022_ForestInventory\inventory_us_2022_concentrado2.csv")
)

# 2. (Optional but recommended) QA: make sure the raw catalog columns exist,
#    adding an empty one for each that is absent.
for col in ("Defect", "Species", "Pests", "Status"):
    if col in df.columns:
        continue
    print(f"⚠️ WARNING: '{col}' no está, se agrega vacía.")
    df[col] = pd.NA

# Remove any *_id columns that are already present in the frame.
stale_id_columns = [
    c for c in ("defect_id", "species_id", "pests_id", "status_id") if c in df.columns
]
for c in stale_id_columns:
    print(f"🧹 Eliminando columna '{c}' previa.")
if stale_id_columns:
    df = df.drop(columns=stale_id_columns)

In [6]:
# Preview the first five rows of the current `df` as plain text.
print(df.head(5))

   stand  plot  tree_number       Tree_ID Points::WT Permanent  Status  \
0    1.0   1.0          1.0  6.220000e+20                  NaN  1 Live   
1    1.0   1.0          2.0  7.190430e+15                  NaN  1 Live   
2    1.0   1.0          3.0  5.470000e+19                  NaN  1 Live   
3    1.0   1.0          4.0  1.280000e+19                  NaN  1 Live   
4    1.0   1.0          5.0  1.660000e+24                  NaN  1 Live   

  Species    WT Leaf Defect  WT DHT  dbh_in  tht_ft  merch_ht_ft Pests  \
0   CL502  4 76-100%    NaN     NaN     NaN     2.0          NaN   NaN   
1   CL502  4 76-100%    NaN     NaN     NaN     1.0          NaN   NaN   
2   CL502  4 76-100%    NaN     NaN     NaN     1.0          NaN   NaN   
3   CL502  4 76-100%    NaN     NaN     NaN     2.0          NaN   NaN   
4   CL502  4 76-100%    NaN     NaN     NaN     0.5          NaN   NaN   

  jenkinsTotalAgBiomassGREEN short_note  \
0                          ?        NaN   
1                       

In [8]:

# 3. Process the WHOLE DataFrame the same way main.py does.
# Database engine and country code that are passed to
# process_inventory_dataframe further below.
engine = get_engine()
country_code = "US"

💻 Conectado a la base de datos helloworldtree


In [13]:

# Columns that the 2022 form may lack: create each missing one empty so the
# processing step does not fail with a KeyError.
for col in ("Disease", "Coppiced", "Permanent Plot"):
    if col not in df.columns:
        print(f"⚠️ El formulario de 2022 no tiene columna '{col}', se crea vacía.")
        df[col] = pd.NA

# Split the inventory into rows that processed cleanly and rows with errors.
df_good, df_bad = process_inventory_dataframe(df, engine, country_code)

print(f"✔️ Filas válidas: {len(df_good)} | ❌ Filas con error: {len(df_bad)}")

# Only write the error workbook when there is something to report.
has_bad_rows = not df_bad.empty
if has_bad_rows:
    df_bad.to_excel("bad_rows_inventory_us_2022.xlsx", index=False)

⚠️ El formulario de 2022 no tiene columna 'Permanent Plot', se crea vacía.
🌳 Doyle calculado

=== 🪵 Resumen de imputación ===
Árboles imputados: 102

📋 Por contrato:
contractcode
US0009    15
US0011     4
US0042    12
US0046     5
US0051    10
US0110    14
US0129    12
US0135    10
US0137    12
US0138     8
Name: count, dtype: int64
✔️ Filas válidas: 3028 | ❌ Filas con error: 0


In [19]:
from core.schema_helpers import rename_columns_using_schema, get_dtypes_for_dataframe, FINAL_ORDER

# 1. Rename columns using the schema (if needed)
df_sql = rename_columns_using_schema(df_good)

# 2. Add every column of the official order that is missing (as empty), then
#    reorder. The explicit copy makes df_sql an independent frame, so the
#    column assignments below do not act on a derived selection.
for col in FINAL_ORDER:
    if col not in df_sql.columns:
        df_sql[col] = pd.NA
df_sql = df_sql[FINAL_ORDER].copy()

# 3. Cast each column to the Python type of its SQL dtype (best effort)
dtypes = get_dtypes_for_dataframe(df_sql)
for col, dtype in dtypes.items():
    if col not in df_sql.columns:
        continue
    try:
        df_sql[col] = df_sql[col].astype(dtype.python_type, errors="ignore")
    except Exception as exc:
        # Still best effort (the column is left as it was), but the failure is
        # reported instead of being silently swallowed.
        print(f"⚠️ No se pudo castear '{col}': {exc}")

# 4. Replace NAs with None for SQL (clean_for_sql can be used instead, if preferred)
df_sql = df_sql.astype(object).where(df_sql.notna(), None)

# 5. Make sure the table exists and insert the data
# NOTE(review): ensure_table is not imported in any cell visible in this
# notebook — confirm it is in scope after Restart Kernel → Run All.
# NOTE(review): with recreate=False and if_exists="append", re-running this
# cell presumably inserts the same rows again — verify before re-executing.
ensure_table(df_sql, engine, "inventory_us_2022", recreate=False)
save_inventory_to_sql(
    df_sql,
    engine,
    "inventory_us_2022",
    if_exists="append",
    dtype=dtypes,
    progress=True
)



=== INICIO DE IMPORTACIÓN ===
💻 Conectado a la base de datos helloworldtree


Insertando → inventory_us_2022: 100%|██████████| 4/4 [00:00<00:00,  5.31filas/s]

✅ Bulk insert completado: 
 'inventory_us_2022' (3028 filas)



