In [6]:
import pandas as pd
import numpy as np

# Input CSV handed to read_any_csv further down in this cell. The path is relative,
# so it is resolved against the kernel's current working directory.
PATH = "dataset_gbif_micorrizas_limpio.csv"

def read_any_csv(path, seps=(",", ";", "\t", "|")):
    """Load a delimited text file, trying several candidate separators in turn.

    Each separator in ``seps`` is tried in order; the first one that yields more
    than one column wins. If none does, the python engine's delimiter sniffing
    (``sep=None``) is used as a last resort.

    Parameters
    ----------
    path : str or path-like
        File to read.
    seps : iterable of str, optional
        Candidate separators in priority order. Defaults to comma, semicolon,
        tab and pipe.

    Returns
    -------
    tuple
        ``(DataFrame, separator)``; the separator is ``"auto"`` when the
        sniffing fallback produced the frame.

    Raises
    ------
    OSError
        If the file cannot be opened. This is raised on the first attempt
        instead of being retried once per separator.

    Notes
    -----
    Every read uses ``on_bad_lines="skip"``, so malformed rows are dropped
    silently; compare the resulting row count with the source if that matters.
    """
    for sep in seps:
        try:
            df = pd.read_csv(path, sep=sep, engine="python", on_bad_lines="skip")
        except OSError:
            # A missing or unreadable file will not be fixed by another separator.
            raise
        except Exception:
            # This separator could not parse the file at all; move on to the next.
            continue
        # A single giant column means the separator did not match.
        if df.shape[1] > 1:
            return df, sep
    # Last resort: let the python engine sniff the delimiter.
    df = pd.read_csv(path, sep=None, engine="python", on_bad_lines="skip")
    return df, "auto"

# Load the dataset, then report which separator matched and the frame's layout.
df, sep_used = read_any_csv(PATH)
print(f"sep usado: {sep_used!r}")
print(f"shape: {df.shape}")
print(df.columns)
# Last expression of the cell: rendered as a preview of the first rows.
df.head()


sep usado: ','
shape: (803, 6)
Index(['familia', 'latitud', 'longitud', 'pais', 'provincia', 'localidad'], dtype='str')


Unnamed: 0,familia,latitud,longitud,pais,provincia,localidad
0,Glomeraceae,-35.9649,-72.245183,CL,Maule,No especificado
1,Glomeraceae,-34.738941,-71.068992,CL,O'Higgins,No especificado
2,Glomeraceae,-37.741717,-72.306583,CL,Biobío,No especificado
3,Glomeraceae,-35.981283,-72.2444,CL,Maule,No especificado
4,Glomeraceae,-38.082175,-72.954658,CL,Araucanía,No especificado


In [7]:
# Normalise column names: trim surrounding whitespace and lower-case them.
df.columns = [c.strip().lower() for c in df.columns]

# Columns expected in the cleaned dataset:
# familia, latitud, longitud, pais, provincia, localidad
# ("localidad" is deliberately left out of the working frame.)
use = ["familia","latitud","longitud","pais","provincia"]
df = df[use].copy()

# Clean the string columns.
# astype(str) can turn a missing value into the literal text "nan" (object-dtype
# columns), which would then survive the dropna below and end up as a bogus class.
# Restore the missing entries after stripping so they are actually dropped.
df["familia"] = df["familia"].astype(str).str.strip().where(df["familia"].notna())
# Missing country/province fall into the shared "OTRO" bucket instead of being dropped.
df["pais"] = df["pais"].fillna("OTRO").astype(str).str.upper().str.strip()
df["provincia"] = df["provincia"].fillna("OTRO").astype(str).str.strip()

# Coordinates: anything that does not parse as a number becomes NaN.
df["latitud"] = pd.to_numeric(df["latitud"], errors="coerce")
df["longitud"] = pd.to_numeric(df["longitud"], errors="coerce")
# Rows without usable coordinates or without a family label cannot be used.
df = df.dropna(subset=["latitud","longitud","familia"])

print(df.shape)
df.head()


(803, 5)


Unnamed: 0,familia,latitud,longitud,pais,provincia
0,Glomeraceae,-35.9649,-72.245183,CL,Maule
1,Glomeraceae,-34.738941,-71.068992,CL,O'Higgins
2,Glomeraceae,-37.741717,-72.306583,CL,Biobío
3,Glomeraceae,-35.981283,-72.2444,CL,Maule
4,Glomeraceae,-38.082175,-72.954658,CL,Araucanía


In [8]:
# Restrict the working frame to the top_k most frequent families.
# NOTE(review): placeholder labels such as "No especificado" are counted like any
# other family here and can end up as a target class - confirm that is intended.
top_k = 8
family_counts = df["familia"].value_counts()
top_fams = family_counts.index[:top_k].tolist()
dfk = df.loc[df["familia"].isin(top_fams)].copy()

print(f"muestras: {len(dfk)}")
print(f"clases: {dfk['familia'].nunique()}")
print(dfk["familia"].value_counts())


muestras: 782
clases: 8
familia
Glomeraceae          291
Acaulosporaceae      175
Entrophosporaceae    138
Gigasporaceae         81
No especificado       34
Ambisporaceae         24
Archaeosporaceae      24
Diversisporaceae      15
Name: count, dtype: int64


In [9]:
def make_encoder(series):
    """Build a value -> integer id mapping for a categorical column.

    The key "OTRO" always maps to 0, whether or not it occurs in ``series``.
    Every other distinct value receives consecutive ids starting at 1, assigned
    in sorted order.

    Parameters
    ----------
    series : object exposing ``tolist()``
        Column of category values (for example a pandas Series).

    Returns
    -------
    dict
        Mapping from category value to integer id.
    """
    known = sorted({value for value in series.tolist() if value != "OTRO"})
    return {"OTRO": 0, **{value: idx for idx, value in enumerate(known, start=1)}}

# Categorical encoders for country and province (id 0 is the "OTRO" bucket).
enc_pais = make_encoder(dfk["pais"])
enc_prov = make_encoder(dfk["provincia"])

# Class ids follow the alphabetical order of the family names.
fam_list = sorted(dfk["familia"].unique().tolist())
id_to_fam = dict(enumerate(fam_list))
fam_to_id = {fam: idx for idx, fam in id_to_fam.items()}

# Values missing from an encoder fall back to id 0.
dfk = dfk.assign(
    pais_id=dfk["pais"].map(enc_pais).fillna(0).astype(np.int32),
    prov_id=dfk["provincia"].map(enc_prov).fillna(0).astype(np.int32),
    y=dfk["familia"].map(fam_to_id).astype(np.int32),
)

# Vocabulary size is the largest id plus one, as required by the embedding layers.
pais_vocab = 1 + max(enc_pais.values())
prov_vocab = 1 + max(enc_prov.values())
num_classes = len(id_to_fam)

print(f"pais_vocab: {pais_vocab} prov_vocab: {prov_vocab} classes: {num_classes}")
print(f"max pais_id: {dfk['pais_id'].max()} max prov_id: {dfk['prov_id'].max()}")


pais_vocab: 2 prov_vocab: 13 classes: 8
max pais_id: 1 max prov_id: 12


In [10]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Feature arrays: coordinates as float32, categorical ids as int32 column vectors.
X_num = dfk[["latitud", "longitud"]].to_numpy(dtype="float32")
X_pais = dfk[["pais_id"]].to_numpy(dtype="int32")
X_prov = dfk[["prov_id"]].to_numpy(dtype="int32")
y = dfk["y"].to_numpy(dtype="int32")

# One stratified 80/20 split over all four arrays so their rows stay aligned.
(
    Xn_tr, Xn_te,
    Xp_tr, Xp_te,
    Xv_tr, Xv_te,
    y_tr, y_te,
) = train_test_split(
    X_num, X_pais, X_prov, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout and batch shuffling are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(42)

# Three inputs: the (lat, lon) pair plus one integer id each for country and province.
num_in  = tf.keras.Input(shape=(2,), dtype=tf.float32, name="num")
pais_in = tf.keras.Input(shape=(1,), dtype=tf.int32, name="pais")
prov_in = tf.keras.Input(shape=(1,), dtype=tf.int32, name="prov")

# Standardise the coordinates with statistics taken from the training split only.
norm = tf.keras.layers.Normalization()
norm.adapt(Xn_tr)
x_num = norm(num_in)

# Each categorical id is embedded into 8 dimensions; Flatten removes the length-1 axis.
x_pais = tf.keras.layers.Embedding(input_dim=pais_vocab, output_dim=8)(pais_in)
x_pais = tf.keras.layers.Flatten()(x_pais)

x_prov = tf.keras.layers.Embedding(input_dim=prov_vocab, output_dim=8)(prov_in)
x_prov = tf.keras.layers.Flatten()(x_prov)

# Small MLP head over the concatenated features.
x = tf.keras.layers.Concatenate()([x_num, x_pais, x_prov])
x = tf.keras.layers.Dense(64, activation="relu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)

# One probability per family class.
out = tf.keras.layers.Dense(num_classes, activation="softmax")(x)

# Input order here (num, pais, prov) is the order expected by fit/evaluate below.
model = tf.keras.Model([num_in, pais_in, prov_in], out)
# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# Train for 20 epochs with 20% of the training rows held out for validation, then
# print the [loss, accuracy] pair measured on the test split.
train_inputs = [Xn_tr, Xp_tr, Xv_tr]
test_inputs = [Xn_te, Xp_te, Xv_te]
model.fit(train_inputs, y_tr, validation_split=0.2, epochs=20, batch_size=32)
test_metrics = model.evaluate(test_inputs, y_te, verbose=0)
print("Eval:", test_metrics)


Epoch 1/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.2780 - loss: 1.9541 - val_accuracy: 0.3360 - val_loss: 1.8761
Epoch 2/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3580 - loss: 1.8101 - val_accuracy: 0.4000 - val_loss: 1.7298
Epoch 3/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3960 - loss: 1.6989 - val_accuracy: 0.4000 - val_loss: 1.6356
Epoch 4/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4080 - loss: 1.6443 - val_accuracy: 0.4000 - val_loss: 1.6069
Epoch 5/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4040 - loss: 1.6373 - val_accuracy: 0.4000 - val_loss: 1.5826
Epoch 6/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4020 - loss: 1.6248 - val_accuracy: 0.4160 - val_loss: 1.5768
Epoch 7/20
[1m16/16[0m [32m━━━━━━━━━

In [11]:
import json, zipfile, os

# Convert the trained Keras model to a TFLite flatbuffer and write it to disk.
# Every file is written inside a `with` block so the handle is flushed and closed
# before the archive step reads it back.
tflite = tf.lite.TFLiteConverter.from_keras_model(model).convert()
with open("modelo_micorrizas.tflite", "wb") as fh:
    fh.write(tflite)

# Class id -> family name. JSON object keys must be strings, so ids are stringified.
with open("label_map_familia.json", "w", encoding="utf-8") as fh:
    json.dump({str(k): v for k, v in id_to_fam.items()}, fh, ensure_ascii=False)

# Category value -> integer id, as used to build the model inputs.
with open("encoder_pais.json", "w", encoding="utf-8") as fh:
    json.dump(enc_pais, fh, ensure_ascii=False)
with open("encoder_provincia.json", "w", encoding="utf-8") as fh:
    json.dump(enc_prov, fh, ensure_ascii=False)

# Bundle the model and its lookup tables into a single archive.
asset_files = [
    "modelo_micorrizas.tflite",
    "label_map_familia.json",
    "encoder_pais.json",
    "encoder_provincia.json",
]
with zipfile.ZipFile("assets_micorrizas.zip","w") as z:
    for f in asset_files:
        z.write(f)

print("ZIP listo:", os.path.abspath("assets_micorrizas.zip"))


INFO:tensorflow:Assets written to: C:\Users\Carlo\AppData\Local\Temp\tmp2p8yc8u2\assets


INFO:tensorflow:Assets written to: C:\Users\Carlo\AppData\Local\Temp\tmp2p8yc8u2\assets


Saved artifact at 'C:\Users\Carlo\AppData\Local\Temp\tmp2p8yc8u2'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 2), dtype=tf.float32, name='num'), TensorSpec(shape=(None, 1), dtype=tf.int32, name='pais'), TensorSpec(shape=(None, 1), dtype=tf.int32, name='prov')]
Output Type:
  TensorSpec(shape=(None, 8), dtype=tf.float32, name=None)
Captures:
  2147943101072: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2147943100880: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2147943097424: TensorSpec(shape=(1, 2), dtype=tf.float32, name=None)
  2147943094160: TensorSpec(shape=(1, 2), dtype=tf.float32, name=None)
  2147943101648: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2147943100112: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2147943102992: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2147943103184: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2147943104144: TensorSpec(shape=

In [13]:
import numpy as np
import tensorflow as tf

# Load the exported model into a TFLite interpreter and allocate its tensors.
itp = tf.lite.Interpreter(model_path="modelo_micorrizas.tflite")
itp.allocate_tensors()

ins, outs = itp.get_input_details(), itp.get_output_details()

# List every input tensor so the name/index mapping can be checked by eye.
print("INPUTS:")
for d in ins:
    print(f" name: {d['name']}  index: {d['index']}  shape: {d['shape']}  dtype: {d['dtype']}")

# Locate interpreter inputs by name (normally: num, pais, prov).
def find_input(substr, details=None):
    """Return the first input-detail dict whose name contains ``substr``.

    Matching is case-insensitive on both sides.

    Parameters
    ----------
    substr : str
        Fragment to look for in each entry's ``"name"``.
    details : iterable of dict, optional
        Entries to search; each must have a ``"name"`` key. When omitted, the
        notebook-level ``ins`` list is searched.

    Returns
    -------
    dict or None
        The first matching entry, or ``None`` when nothing matches.
    """
    candidates = ins if details is None else details
    needle = substr.lower()
    for d in candidates:
        if needle in d["name"].lower():
            return d
    return None

d_num  = find_input("num")
d_pais = find_input("pais")
d_prov = find_input("prov")

# Shape-based fallback for when the tensor names lack the expected fragments.
if d_num is None:
    d_num = next((d for d in ins if tuple(d["shape"]) == (1, 2)), None)
    if d_num is None:
        raise RuntimeError("No se encontró la entrada numérica (shape (1, 2)) en el modelo TFLite")
if d_pais is None or d_prov is None:
    # Only (1, 1) inputs not already matched by name are candidates, so the same
    # tensor can never be assigned to both pais and prov.
    taken = {d["index"] for d in (d_pais, d_prov) if d is not None}
    ones = [d for d in ins if tuple(d["shape"]) == (1, 1) and d["index"] not in taken]
    missing = [label for label, d in (("pais", d_pais), ("prov", d_prov)) if d is None]
    if len(ones) < len(missing):
        raise RuntimeError("No hay suficientes entradas (1, 1) para asignar: " + ", ".join(missing))
    if d_pais is None:
        d_pais = ones.pop(0)
    if d_prov is None:
        d_prov = ones.pop(0)
    # The assignment above is positional and the converter may list inputs in a
    # different order than the Keras model, so make the guess visible.
    print("AVISO: entradas asignadas por posición, revisar ->",
          "pais:", d_pais["name"], "| prov:", d_prov["name"])

# ====== build one sample ======
# Both categorical ids are 0, i.e. the "OTRO" bucket, for this first check.
num  = np.array([[-35.448843, -71.813049]], dtype=np.float32)
pais = np.zeros((1, 1), dtype=np.int32)
prov = np.zeros((1, 1), dtype=np.int32)

# Feed each array into the tensor resolved for it.
for detail, value in ((d_num, num), (d_pais, pais), (d_prov, prov)):
    itp.set_tensor(detail["index"], value)

itp.invoke()
# First (and only) row of the first output: one probability per class.
pred = itp.get_tensor(outs[0]["index"])[0]
# Sanity check: the softmax output should sum to ~1; also show the winning class id.
print(f"OK -> sum: {float(pred.sum())} top: {int(pred.argmax())}")


INPUTS:
 name: serving_default_num:0  index: 0  shape: [1 2]  dtype: <class 'numpy.float32'>
 name: serving_default_prov:0  index: 1  shape: [1 1]  dtype: <class 'numpy.int32'>
 name: serving_default_pais:0  index: 2  shape: [1 1]  dtype: <class 'numpy.int32'>
OK -> sum: 0.9999999403953552 top: 6
