In [None]:

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import pdist, squareform
from sklearn.linear_model import LinearRegression
from datetime import datetime
import matplotlib.pyplot as plt


# Format of the date-like column headers in `df_time_series` (day first).
DATE_FORMAT = "%d/%m/%Y"

# Keep only the columns whose stripped name parses as a DATE_FORMAT date;
# every other column (ID, metadata, ...) is left out of the melt.
columnas_fecha = []
for col in df_time_series.columns:
    if not isinstance(col, str):
        continue  # non-string labels have no .strip() and cannot be date headers
    try:
        datetime.strptime(col.strip(), DATE_FORMAT)
    except ValueError:
        continue
    columnas_fecha.append(col)

# Melt the wide frame so that every (ID, date column) pair becomes one row.
df_prices = df_time_series.melt(
    id_vars=["ID"],  # fixed columns (adjust if more identifiers are needed)
    value_vars=columnas_fecha,
    var_name="Date",
    value_name="Value"
)
# Parse with the same explicit format used for validation above. Without it
# pandas defaults to month-first parsing, which misreads day-first strings
# such as "05/03/2020". The strip mirrors the validation step.
df_prices["Date"] = pd.to_datetime(
    df_prices["Date"].astype(str).str.strip(), format=DATE_FORMAT
)
# ------------------------------
# 1. Numeric variable selection and time-series summary
# ------------------------------

# A) Numeric attributes (ID excluded; ignored if ID is not a numeric column)
df_num = df_attributes.select_dtypes(include=[np.number]).drop(columns=["ID"], errors="ignore")

# B) Crear resumen de precios por ID
def _ols_slope(days, log_prices):
    """Return the least-squares slope of ``log_prices`` against ``days``.

    Pairs where either value is NaN are ignored. When fewer than two distinct
    day values remain, the slope is defined as 0.0.
    """
    valid = ~(np.isnan(days) | np.isnan(log_prices))
    x = days[valid]
    y = log_prices[valid]
    if np.unique(x).size < 2:
        return 0.0
    x_centered = x - x.mean()
    # Closed-form simple linear regression: cov(x, y) / var(x).
    return float(np.dot(x_centered, y - y.mean()) / np.dot(x_centered, x_centered))


def summarize_prices(df_prices):
    """Summarise the log-price series of every ID.

    Args:
        df_prices (pd.DataFrame): Long-format frame with the columns
            ``ID``, ``Date`` and ``Value``. The frame is not modified.

    Returns:
        pd.DataFrame: Indexed by ``ID`` with the columns
            ``price_mean``  - mean of ``log1p(Value)``,
            ``price_std``   - standard deviation of ``log1p(Value)``,
            ``price_trend`` - least-squares slope of ``log1p(Value)`` against
                              the number of days since the earliest date in
                              ``df_prices`` (0.0 when an ID has fewer than two
                              distinct usable days).
    """
    dates = pd.to_datetime(df_prices["Date"])

    # Work on a private frame so the caller's DataFrame gains no extra columns.
    work = df_prices[["ID"]].copy()
    work["log_price"] = np.log1p(df_prices["Value"].astype(float)).to_numpy()
    work["day"] = (dates - dates.min()).dt.days.astype(float).to_numpy()

    summary = (
        work.groupby("ID")["log_price"]
        .agg(["mean", "std"])
        .rename(columns={"mean": "price_mean", "std": "price_std"})
    )

    # Trend: slope of log-price vs. day, keyed by ID so the values are attached
    # by label rather than by the iteration order of the groupby.
    slopes = {
        key: _ols_slope(group["day"].to_numpy(), group["log_price"].to_numpy())
        for key, group in work.groupby("ID")
    }
    summary["price_trend"] = [slopes.get(key, np.nan) for key in summary.index]
    return summary

df_price_summary = summarize_prices(df_prices)

# Attach the per-ID price summary to the attributes. `merge` defaults to an
# inner join, so only IDs present on both sides are kept in `df_merged`.
df_merged = df_attributes.merge(df_price_summary, left_on="ID", right_index=True)

# ------------------------------
# 2. Numeric feature matrix
#    (no low-variance / high-correlation filtering is applied in this cell)
# ------------------------------

# Numeric attributes plus the price summary columns (ID excluded)
df_all_num = df_merged.select_dtypes(include=[np.number]).drop(columns=["ID"], errors="ignore")

# ------------------------------
# 3. Scaling
# ------------------------------

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_all_num)

# ------------------------------
# 4. log1p was already applied to the price inside the summary, so no extra
#    transform is needed here
# ------------------------------

# ------------------------------
# 5. PCA keeping PCA_VARIANCE_TARGET of the variance
# ------------------------------

# Fraction of total variance the retained components must explain. Used for
# the fit, the reference line and its legend label so they cannot drift apart.
PCA_VARIANCE_TARGET = 0.60

pca = PCA(n_components=PCA_VARIANCE_TARGET)
X_pca = pca.fit_transform(X_scaled)

# A second, unrestricted fit is used only to draw the full cumulative curve.
pca_full = PCA().fit(X_scaled)
explained_variance = np.cumsum(pca_full.explained_variance_ratio_)

plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
plt.axhline(y=PCA_VARIANCE_TARGET, color='r', linestyle='--', label=f'{PCA_VARIANCE_TARGET:.0%} varianza')
plt.axvline(x=pca.n_components_, color='g', linestyle='--', label=f'{pca.n_components_} componentes')

plt.title('PCA - Varianza Explicada Acumulada')
plt.xlabel('Número de componentes principales')
plt.ylabel('Varianza explicada acumulada')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
print(f" PCA: {pca.n_components_} componentes conservan {pca.explained_variance_ratio_.sum():.2%} de la varianza")

# ------------------------------
# 6. Binary amenity columns
#    (only selected here; no TruncatedSVD is applied in this cell)
# ------------------------------

# int64 columns with exactly two distinct values, excluding the identifier
binary_columns = df_attributes.columns[
    (df_attributes.dtypes == 'int64') & 
    (df_attributes.nunique() == 2) &
    (df_attributes.columns != 'ID')
]

X_binary = df_attributes[binary_columns]



# ------------------------------
# 7. Build the final point space
# ------------------------------

# The rows of X_pca come from df_merged (inner join of attributes and price
# summary), so the IDs must be read from df_merged as well. Taking them from
# df_attributes breaks as soon as one listing has no price rows.
ids = df_merged["ID"].values
df_spacePCA = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(X_pca.shape[1])], index=ids)

df_spacePCA["ID"] = df_spacePCA.index

# ------------------------------
# 8. Distance matrix
# ------------------------------

# Pairwise Euclidean distances between points in PCA space, labelled by ID
dist_matrixPCA = squareform(pdist(df_spacePCA.drop(columns="ID").values, metric='euclidean'))
df_distPCA = pd.DataFrame(dist_matrixPCA, index=df_spacePCA["ID"], columns=df_spacePCA["ID"])

print(" Matriz de distancias calculada. Espacio de puntos listo para TDA.")


In [None]:
import plotly.express as px
import plotly.colors as pc

def plot_PCA_svd_space(df_space, color_column=None, discrete_colorscale=False, range_color=(0, 180)):
    """
    3D scatter plot of the embedded (PCA) space.

    Args:
        df_space (pd.DataFrame): Frame with the columns 'PC1', 'PC2', 'PC3' and 'ID'.
        color_column (str, optional): Column of the notebook-level `df_attributes`
            used to colour the points; it is joined onto `df_space` by 'ID'.
        discrete_colorscale (bool): If True, `color_column` is treated as an ordered
            categorical and every distinct value gets its own colour sampled
            from the colour scale.
        range_color (tuple, optional): (min, max) of the continuous colour axis.
            Defaults to (0, 180), the value previously hard-coded.

    Returns:
        plotly.graph_objects.Figure: The configured 3D scatter figure.

    Raises:
        ValueError: If `df_space` lacks any of the required columns.
    """
    required = ["PC1", "PC2", "PC3", "ID"]
    missing = [name for name in required if name not in df_space.columns]
    if missing:
        raise ValueError(f"df_space is missing required columns: {missing}")

    df_plot = df_space.copy()
    color_kwargs = {}

    if color_column:
        df_plot = df_plot.merge(df_attributes[['ID', color_column]], on='ID', how='left')
        color_kwargs["color"] = color_column

        if discrete_colorscale:
            unique_vals = sorted(df_plot[color_column].dropna().unique())
            n_colors = len(unique_vals)

            # Evenly spaced sample positions in [0, 1]; a single category has
            # no spacing to compute, so it is pinned to position 0.
            if n_colors > 1:
                positions = [i / (n_colors - 1) for i in range(n_colors)]
            else:
                positions = [0.0] * n_colors

            # Same colour scale as before: Hot_r reversed.
            palette = pc.sample_colorscale(pc.sequential.Hot_r[::-1], positions) if positions else []

            # Plotly Express only treats non-numeric columns as discrete, so the
            # values are turned into string labels and mapped to the sampled
            # colours explicitly. Passing the colour strings as `color=` would
            # only use them as category names with the default palette.
            labels = [str(value) for value in unique_vals]
            df_plot[color_column] = df_plot[color_column].map(lambda value: str(value), na_action="ignore")
            color_kwargs["color_discrete_map"] = dict(zip(labels, palette))
            color_kwargs["category_orders"] = {color_column: labels}

    fig = px.scatter_3d(
        df_plot,
        x="PC1", y="PC2", z="PC3",
        hover_name="ID",
        opacity=0.85,
        color_continuous_scale="hot",
        range_color=range_color,
        title="Espacio Embebido (PCA)",
        template="plotly_dark",
        **color_kwargs
    )

    fig.update_layout(
        margin={"r":0, "t":40, "l":0, "b":0},
        paper_bgcolor="#111111",
        plot_bgcolor="#111111",
        font=dict(color="white"),
        scene=dict(
            xaxis=dict(title="PC1", backgroundcolor="#111111", gridcolor="gray"),
            yaxis=dict(title="PC2", backgroundcolor="#111111", gridcolor="gray"),
            zaxis=dict(title="PC3", backgroundcolor="#111111", gridcolor="gray")
        )
    )
    return fig



# Render the 3D PCA embedding, colouring every point by its "Base fee" attribute.
fig = plot_PCA_svd_space(df_spacePCA,color_column="Base fee")
fig.show()

In [None]:
# NOTE(review): everything between the triple quotes below is a plain string
# literal, not executed code. It holds a disabled routine that renders one PNG
# per `max_edge_length` value and stitches the frames into a GIF.
# Consequences worth knowing:
#   * the `import gudhi as gd` inside the string never runs, so `gd` is not
#     defined by this cell;
#   * the inline remark about "8 frames" from 1 to 2.5 does not match the call
#     next to it, `np.linspace(3, 5, 16)`.
'''
import matplotlib.pyplot as plt
import gudhi as gd
import numpy as np
import imageio

plotfiles = []

distance_matrixPCA = df_distPCA.values
max_lengthsPCA = np.linspace(3, 5, 16)  # 8 frames desde 1 a 2.5

for i, max_len in enumerate(max_lengthsPCA):
    print(f"Generando frame {i+1} con max_edge_length={max_len:.2f}")
    # Crear complejo Rips
    rips_complexPCA = gd.RipsComplex(distance_matrix=distance_matrixPCA, max_edge_length=max_len)
    simplex_treePCA = rips_complexPCA.create_simplex_tree(max_dimension=3)

    coordspca = df_spacePCA[["PC1", "PC2", "PC3"]].values

    fig, axs = plt.subplots(1, 3, figsize=(15, 5), facecolor='black')
    for ax in axs:
        ax.set_facecolor('black')
        ax.tick_params(colors='white')  # Ejes blancos
        ax.xaxis.label.set_color('white')
        ax.yaxis.label.set_color('white')
        ax.title.set_color('white')

    def draw_edges(ax, dim1, dim2):
        # Dibujar nervios
        for simplex, _ in simplex_treePCA.get_skeleton(1):
            if len(simplex) == 2:
                i_, j_ = simplex
                x = [coordspca[i_, dim1], coordspca[j_, dim1]]
                y = [coordspca[i_, dim2], coordspca[j_, dim2]]
                ax.plot(x, y, color='gray', linewidth=1)
        # Dibujar puntos azul claro
        ax.scatter(coordspca[:, dim1], coordspca[:, dim2], c='#add8e6', s=20)
        ax.set_xlabel(f"PC{dim1+1}")
        ax.set_ylabel(f"PC{dim2+1}")

    draw_edges(axs[0], 0, 1)
    axs[0].set_title("PC1 vs PC2")

    draw_edges(axs[1], 0, 2)
    axs[1].set_title("PC1 vs PC3")

    draw_edges(axs[2], 1, 2)
    axs[2].set_title("PC2 vs PC3")

    plt.suptitle(f"Complejo de Vietoris-Rips (max_edge_length={max_len:.2f})", color='white')
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    filename = f"rips_frame_{i}.png"
    plt.savefig(filename, facecolor='black')
    plotfiles.append(filename)
    plt.close()

# Crear gif
images = [imageio.imread(f) for f in plotfiles]
imageio.mimsave('rips_complex_PCA_animation.gif', images, fps=2, loop=0)

print("GIF creado: rips_complex_PCA_animation.gif")
'''

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_rips_complex_PCA_3d(df_space, simplex_tree, color_column=None):
    """
    Draw the Vietoris-Rips complex (points + edges) in 3D PCA space.

    Args:
        df_space (pd.DataFrame): Frame with the columns 'PC1', 'PC2', 'PC3' and 'ID'.
            Its row order must match the vertex numbering of `simplex_tree`,
            because simplex vertices are used as positional row indices.
        simplex_tree: Object exposing `get_skeleton(1)`, which yields
            (simplex, filtration) pairs.
        color_column (str, optional): Column of the notebook-level `df_attributes`
            used to colour the points; it is joined onto `df_space` by 'ID'.

    Returns:
        plotly.graph_objects.Figure: Figure with one marker trace and one line trace.
    """
    df_plot = df_space.copy()

    # 3D coordinates, indexed positionally by simplex vertex number
    coords = df_plot[["PC1", "PC2", "PC3"]].values

    # Optional colouring
    if color_column:
        df_plot = df_plot.merge(df_attributes[['ID', color_column]], on='ID', how='left')
        color_values = df_plot[color_column]
    else:
        color_values = None

    # --- Edges (1-simplices) ---
    # All segments go into ONE trace, separated by None. One trace per edge
    # makes the figure extremely slow once the complex has many edges.
    edge_x, edge_y, edge_z = [], [], []
    for simplex, _ in simplex_tree.get_skeleton(1):
        if len(simplex) != 2:
            continue  # skip vertices (0-simplices)
        start, end = simplex
        edge_x += [coords[start, 0], coords[end, 0], None]
        edge_y += [coords[start, 1], coords[end, 1], None]
        edge_z += [coords[start, 2], coords[end, 2], None]

    edge_trace = go.Scatter3d(
        x=edge_x,
        y=edge_y,
        z=edge_z,
        mode='lines',
        line=dict(color='gray', width=1),
        hoverinfo='none',
        showlegend=False
    )

    # --- Points ---
    scatter = go.Scatter3d(
        x=df_plot["PC1"],
        y=df_plot["PC2"],
        z=df_plot["PC3"],
        mode='markers',
        marker=dict(
            size=4,
            color=color_values if color_column else 'lightblue',
            colorscale="Hot",
            opacity=0.8,
            colorbar=dict(title=color_column) if color_column else None
        ),
        text=df_plot["ID"],
        name="Puntos"
    )

    # --- Final figure ---
    # Title and axis labels name the PCA coordinates that are actually plotted
    # (they previously referred to UMAP).
    fig = go.Figure(data=[scatter, edge_trace])
    fig.update_layout(
        title="Complejo de Vietoris–Rips en espacio PCA",
        template="plotly_dark",
        margin=dict(l=0, r=0, b=0, t=40),
        scene=dict(
            xaxis=dict(title="PC1", backgroundcolor="#111111", gridcolor="gray"),
            yaxis=dict(title="PC2", backgroundcolor="#111111", gridcolor="gray"),
            zaxis=dict(title="PC3", backgroundcolor="#111111", gridcolor="gray")
        )
    )
    return fig
# `gd` is used below but is not imported by any executed line visible in this
# notebook (the only `import gudhi as gd` sits inside a disabled string
# block), so it is imported here to keep the cell runnable on a fresh kernel.
import gudhi as gd

distance_matrixPCA = df_distPCA.values

# Rips complex built from the precomputed distance matrix; edges longer than
# max_edge_length are discarded and simplices go up to dimension 3.
rips_complexPCA = gd.RipsComplex(distance_matrix=distance_matrixPCA, max_edge_length=3)
simplex_treePCA = rips_complexPCA.create_simplex_tree(max_dimension=3)

print(f"✔️ Complejo Rips (por matriz de distancias) creado con {simplex_treePCA.num_simplices()} símplices.")
# List every vertex and edge together with its filtration value.
for simplex, filtration in simplex_treePCA.get_filtration():
    if len(simplex) <= 2:
        print(f"Simplex: {simplex}, Filtration: {filtration}")
# === USAGE ===
fig = plot_rips_complex_PCA_3d(df_spacePCA, simplex_treePCA,"Base fee")
fig.show()


In [None]:
def plot_PCA_rips_projections(df_space, simplex_tree, color_column=None):
    """
    Show the PC1/PC2, PC1/PC3 and PC2/PC3 projections of the Rips complex
    as three side-by-side subplots.

    Args:
        df_space (pd.DataFrame): Frame with the columns 'PC1', 'PC2', 'PC3' and 'ID'.
            Its row order must match the vertex numbering of `simplex_tree`,
            because simplex vertices are used as positional row indices.
        simplex_tree: Object exposing `get_skeleton(1)`, which yields
            (simplex, filtration) pairs.
        color_column (str, optional): Column of the notebook-level `df_attributes`
            used to colour the points; it is joined onto `df_space` by 'ID'.
            Without it, points are drawn in light blue.

    Returns:
        plotly.graph_objects.Figure: Figure with one edge trace and one marker
        trace per subplot.
    """
    df_plot = df_space.copy()
    coords = df_plot[["PC1", "PC2", "PC3"]].values

    if color_column:
        df_plot = df_plot.merge(df_attributes[['ID', color_column]], on='ID', how='left')
        color_values = df_plot[color_column]
    else:
        color_values = 'lightblue'

    # (dim1, dim2) coordinate pairs, in the same order as the subplot titles
    panels = [(0, 1), (0, 2), (1, 2)]

    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=("PC1 vs PC2", "PC1 vs PC3", "PC2 vs PC3"),
        horizontal_spacing=0.05
    )

    # Collect the edges once; every panel projects the same list.
    edges = [tuple(simplex) for simplex, _ in simplex_tree.get_skeleton(1) if len(simplex) == 2]

    for col_idx, (dim1, dim2) in enumerate(panels, start=1):
        # --- Edges: one trace per panel, segments separated by None ---
        edge_x, edge_y = [], []
        for start, end in edges:
            edge_x += [coords[start, dim1], coords[end, dim1], None]
            edge_y += [coords[start, dim2], coords[end, dim2], None]
        fig.add_trace(go.Scatter(
            x=edge_x, y=edge_y,
            mode='lines',
            line=dict(color='gray', width=1),
            hoverinfo='none',
            showlegend=False
        ), row=1, col=col_idx)

        # --- Nodes (points) ---
        marker = dict(size=4, color=color_values, colorscale='Hot', opacity=0.8, showscale=False)
        if color_column and col_idx == len(panels):
            # All panels colour the same values, so one colour bar is enough.
            # Drawing it for every panel stacks three bars on top of each other.
            marker.update(showscale=True, colorbar=dict(title=color_column))

        fig.add_trace(go.Scatter(
            x=df_plot[f"PC{dim1 + 1}"], y=df_plot[f"PC{dim2 + 1}"],
            mode='markers',
            marker=marker,
            text=df_plot["ID"],
            hoverinfo='text',
            showlegend=False  # avoids anonymous "trace N" legend entries
        ), row=1, col=col_idx)

    fig.update_layout(
            height=500,
            width=1200,
            title_text="Proyecciones del complejo de Vietoris–Rips (1-símplices) con PCA",
            template="plotly_dark",
            margin=dict(l=20, r=20, t=60, b=20),
    )
    return fig

# `gd` is used below but is not imported by any executed line visible in this
# notebook (the only `import gudhi as gd` sits inside a disabled string
# block), so it is imported here to keep the cell runnable on a fresh kernel.
import gudhi as gd

distance_matrixPCA = df_distPCA.values

# Rips complex built from the precomputed distance matrix; edges longer than
# max_edge_length are discarded and simplices go up to dimension 3.
rips_complexPCA = gd.RipsComplex(distance_matrix=distance_matrixPCA, max_edge_length=3)
simplex_treePCA = rips_complexPCA.create_simplex_tree(max_dimension=3)

# === USAGE ===
fig = plot_PCA_rips_projections(df_spacePCA, simplex_treePCA,"Base fee")
fig.show()