In [1]:
import dash
import dash_bootstrap_components as dbc
from dash import html, dcc, dash_table
import sqlite3
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import scipy.stats as stats
import plotly.express as px
import re
import random
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
import plotly.figure_factory as ff
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold
from scipy.spatial.distance import pdist, squareform
import umap

import warnings
# Silence these warning categories for the whole session so library chatter
# does not flood the notebook output.
# NOTE(review): this is a blanket filter; it also hides deprecation notices
# that may matter when the dependencies are upgraded.
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the three tables into memory. "ID" is forced to str in every table,
# presumably so the long numeric listing identifiers are kept verbatim and
# compare equal across tables.
_conn = sqlite3.connect("airbnb_cartagena.sqlite")
try:
    df_attr = pd.read_sql_query("SELECT * FROM Attributes", _conn, dtype={"ID": str})
    df_ts = pd.read_sql_query("SELECT * FROM TimeSeriesRaw", _conn, dtype={"ID": str})
    df_ts_interp = pd.read_sql_query("SELECT * FROM TimeSeriesInterpolated", _conn, dtype={"ID": str})
finally:
    # Release the database handle even when one of the queries fails.
    _conn.close()

In [2]:
"""
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

_conn = sqlite3.connect("airbnb_cartagena.sqlite")
df_attr = pd.read_sql_query("SELECT * FROM Attributes", _conn)
df_ts = pd.read_sql_query("SELECT * FROM TimeSeriesRaw", _conn)
df_ts_interp = pd.read_sql_query("SELECT * FROM TimeSeriesInterpolated", _conn)
_conn.close()

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.spatial.distance import pdist, squareform
from sklearn.linear_model import LinearRegression
from datetime import datetime
import matplotlib.pyplot as plt
import umap
import plotly.express as px
import plotly.colors as pc
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import gudhi as gd
"""



In [3]:
import re

# Accent colour (dark red) shared by the figures below.
red = "#7e0d24"

# Column labels of the raw time-series table that look like calendar dates:
# one or two digits / one or two digits / four digits.
dates = list(
    filter(
        lambda label: re.fullmatch(r"\d{1,2}/\d{1,2}/\d{4}", label),
        df_ts.columns,
    )
)

# Keep only the interpolated rows that have a value for every date column,
# then renumber the rows from zero.
# NOTE(review): the date labels come from df_ts but are applied to
# df_ts_interp; this assumes both tables share the same date columns.
df_ts_interp = (
    df_ts_interp
    .dropna(subset=dates, how="any")
    .reset_index(drop=True)
)

In [4]:
#""""
# Restrict the attribute table to the columns listed below, in this order.
_KEPT_ATTRIBUTE_COLUMNS = [
    # listing identity and pricing
    'Name', 'Host', 'Base fee', 'Cleaning fee', 'URL', 'ID',
    # location
    'latitude', 'longitude',
    # property description
    'Property type', 'Person capacity',
    # ratings and reviews
    'accuracy_rating', 'checking_rating', 'cleanliness_rating',
    'communication_rating', 'location_rating', 'value_rating',
    'satisfaction_rating', 'Reviews',
    # rooms
    'Bedrooms', 'Beds', 'Baths',
    # views
    'City skyline view', 'Beach view', 'Sea/Lake view',
    # amenities
    'Hot water', 'Jacuzzi', 'Shared pool', 'Shared gym',
    'Patio or balcony', 'Outdoor furniture', 'Outdoor playground',
    'Elevator', 'Carport', 'Dedicated workspace', 'AC', 'Heating', 'TV',
    'Cable TV', 'Wifi', 'Laundry service', 'Kitchen', 'Dining table',
    'Microwave', 'Dishes and silverware', 'Refrigerator', 'Stove',
    'Washer', 'Pets allowed', 'Crib', 'Security cameras', 'Lock on door',
    'Keypad',
]
df_attr = df_attr[_KEPT_ATTRIBUTE_COLUMNS]
#df_attr["Base fee"]=df_attr["Base fee"].map(lambda x:  min(x//25*25,125) )
#"""

In [5]:
#"""
#df_attr["Base fee"]=df_attr["Base fee"].map(lambda x:  min(x//25*25,125) )
#df_attr = df_attr.iloc[:, :21]
#"""

In [6]:
def build_umap_and_distances(variance_threshold=0.1, corr_threshold=0.9,
                             n_neighbors=30, min_dist=0.1, random_state=69):
    """
    Embed the listings in a 3-D UMAP space and compute pairwise distances.

    Reads the notebook-level ``df_attr``, ``df_ts_interp`` and ``dates``.

    Pipeline:
      1. melt the interpolated price table to long format (ID, Date, Value);
      2. summarise each listing's log1p-price by mean, std and linear trend;
      3. merge those summaries onto the attributes (inner join on "ID");
      4. keep numeric columns whose variance exceeds ``variance_threshold``
         and drop one column of every pair whose absolute correlation
         exceeds ``corr_threshold``;
      5. standardise the remaining features and project them with UMAP.

    Parameters
    ----------
    variance_threshold : float
        Threshold passed to ``VarianceThreshold``.
    corr_threshold : float
        Absolute-correlation cut-off above which a column is dropped.
    n_neighbors, min_dist, random_state
        Forwarded unchanged to ``umap.UMAP``.

    Returns
    -------
    df_space : DataFrame
        Columns ID, Base fee, UMAP1, UMAP2, UMAP3 (one row per listing).
    df_dist : DataFrame
        Square Euclidean distance matrix between the UMAP coordinates,
        with the listing IDs as both index and columns.
    dist_matrix : ndarray
        The same distances as a plain array.
    df_prices : DataFrame
        The long-format price table built in step 1.
    """
    # Melt time series data to long format
    df_prices = (
        df_ts_interp.copy()
        .melt(id_vars="ID", value_vars=dates, var_name="Date", value_name="Value")
        .assign(Date=lambda d: pd.to_datetime(d["Date"], dayfirst=True))
    )

    # Summarize per-listing log-price mean, std, and trend
    def summarize(group):
        y = np.log1p(group["Value"])
        days = (group["Date"] - group["Date"].min()).dt.days.values.reshape(-1, 1)
        # A slope is only defined when the listing has at least two distinct days.
        if len(np.unique(days)) > 1:
            trend = LinearRegression().fit(days, y).coef_[0]
        else:
            trend = 0.0
        return pd.Series({
            "price_mean": y.mean(),
            "price_std":  y.std(),
            "price_trend": trend
        })

    df_price_summary = (
        df_prices
        .groupby("ID", group_keys=False)
        .apply(summarize)
    )
    df_merged = df_attr.merge(df_price_summary, on="ID")

    # Filter out near-constant / low-variance features
    numeric = df_merged.select_dtypes("number")
    selector = VarianceThreshold(threshold=variance_threshold)
    X = selector.fit_transform(numeric)
    to_keep = numeric.columns[selector.get_support()]

    # Drop highly correlated features: scanning the strict upper triangle
    # means the later column of each correlated pair is the one removed.
    df_reduced = pd.DataFrame(X, columns=to_keep)
    corr = df_reduced.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > corr_threshold).any()]
    df_features = df_reduced.drop(columns=to_drop)

    # Scale + UMAP embedding
    X_scaled = StandardScaler().fit_transform(df_features)
    umap_proj = umap.UMAP(
        n_components=3,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
    ).fit_transform(X_scaled)

    # Build distance matrix and UMAP space DataFrame. df_features carries a
    # fresh 0..n-1 index that lines up with df_merged's index after the merge.
    df_space = df_merged.loc[df_features.index, ['ID', 'Base fee']].reset_index(drop=True)
    df_space[['UMAP1', 'UMAP2', 'UMAP3']] = umap_proj
    # Distances are measured in the 3-D embedding, not in the original feature space.
    dist_matrix = squareform(
        pdist(df_space[['UMAP1', 'UMAP2', 'UMAP3']].values, metric="euclidean")
    )
    df_dist = pd.DataFrame(dist_matrix, index=df_space['ID'], columns=df_space['ID'])

    return df_space, df_dist, dist_matrix, df_prices


# Build the embedding once; the four results are kept as notebook-level
# globals and read by the plotting cells below.
df_space, df_dist, dist_matrix, df_prices = build_umap_and_distances()

In [7]:
def umap_space():
    """
    Draw the global ``df_space`` embedding as an interactive 3-D scatter plot.

    Points are coloured by "Base fee" and labelled with the listing ID on
    hover. Returns the Plotly figure.
    """
    axes = dict(x="UMAP1", y="UMAP2", z="UMAP3")
    appearance = dict(
        color='Base fee',
        hover_name="ID",
        color_continuous_scale="amp",
        opacity=0.85,
        title="Embedded UMAP Space",
        template="plotly_dark",
    )
    return px.scatter_3d(df_space, **axes, **appearance)

# Last expression of the cell: the returned figure is shown as the cell output.
umap_space()

In [8]:
import gudhi as gd
# Pairwise UMAP distances as a plain array (the values of df_dist without its ID labels).
distance_matrix = df_dist.values
# Vietoris-Rips complex over the listings, keeping only edges of length <= 0.6.
rips_complex = gd.RipsComplex(distance_matrix=distance_matrix, max_edge_length=0.6)
# Expand the complex into a simplex tree with simplices up to dimension 3;
# the plotting cells below read this global to draw the edges.
simplex_tree = rips_complex.create_simplex_tree(max_dimension=3)

In [9]:
import plotly.graph_objects as go

def vietoris_rips_3d():
    """
    3D visualization of the Vietoris-Rips complex on the UMAP embedding.

    Reads the notebook-level ``df_space`` (point coordinates, IDs, base fee)
    and ``simplex_tree`` (whose 1-skeleton supplies the edges). Returns a
    Plotly figure holding one marker trace for the listings and one line
    trace for all edges.
    """
    coords = df_space[['UMAP1', 'UMAP2', 'UMAP3']].values
    ids = df_space['ID'].astype(str).tolist()

    # Gather every 1-simplex into a single polyline. A ``None`` after each
    # pair of endpoints breaks the line, so all edges fit in ONE trace
    # instead of one trace per edge (which gets very slow with many edges).
    edge_x, edge_y, edge_z = [], [], []
    for simplex, _ in simplex_tree.get_skeleton(1):
        if len(simplex) != 2:
            continue  # skip the vertices (0-simplices)
        i, j = simplex
        x0, y0, z0 = coords[i]
        x1, y1, z1 = coords[j]
        edge_x.extend((x0, x1, None))
        edge_y.extend((y0, y1, None))
        edge_z.extend((z0, z1, None))

    edge_trace = go.Scatter3d(
        x=edge_x,
        y=edge_y,
        z=edge_z,
        mode='lines',
        line=dict(color='white', width=1),
        hoverinfo='none',
        showlegend=False
    )

    # Point cloud trace
    # NOTE(review): the colour range here is 40-120, while rips_projections
    # uses 40-125; confirm which upper bound is intended.
    point_trace = go.Scatter3d(
        x=df_space['UMAP1'],
        y=df_space['UMAP2'],
        z=df_space['UMAP3'],
        mode='markers',
        marker=dict(
            size=4,
            color=df_space['Base fee'],
            opacity=0.85,
            colorscale='amp',
            cmin=40,
            cmax=120,
        ),
        text=ids,
        name='Points'
    )

    # Create the figure
    fig = go.Figure(data=[point_trace, edge_trace])
    fig.update_layout(
        title='Vietoris-Rips Complex on UMAP Embedding',
        template='plotly_dark',
        scene=dict(
            xaxis_title='UMAP1',
            yaxis_title='UMAP2',
            zaxis_title='UMAP3'
        ),
    )
    return fig

# Last expression of the cell: the returned figure is shown as the cell output.
vietoris_rips_3d()


In [10]:
from plotly.subplots import make_subplots

def rips_projections():
    """
    Show XY, XZ and YZ projections of the Vietoris-Rips complex on the UMAP embedding.

    Reads the notebook-level ``df_space`` and ``simplex_tree``. Returns a
    Plotly figure with three side-by-side panels; each panel holds one line
    trace with all edges and one marker trace with the listings.
    """
    coords = df_space[["UMAP1", "UMAP2", "UMAP3"]].values
    ids = df_space["ID"].astype(str).tolist()

    # The edge list does not depend on the projection, so walk the skeleton once.
    edges = [tuple(s) for s, _ in simplex_tree.get_skeleton(1) if len(s) == 2]

    # Prepare subplot grid
    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=("UMAP1 vs UMAP2", "UMAP1 vs UMAP3", "UMAP2 vs UMAP3"),
        horizontal_spacing=0.05
    )

    # The three projection pairs
    dims = [(0, 1), (0, 2), (1, 2)]
    for col, (i, j) in enumerate(dims, start=1):
        # All edges of this projection as ONE polyline; ``None`` after each
        # endpoint pair breaks the line between consecutive edges.
        edge_x, edge_y = [], []
        for a, b in edges:
            edge_x.extend((coords[a, i], coords[b, i], None))
            edge_y.extend((coords[a, j], coords[b, j], None))
        fig.add_trace(
            go.Scatter(
                x=edge_x,
                y=edge_y,
                mode="lines",
                line=dict(color="gray", width=1),
                hoverinfo="none",
                showlegend=False
            ),
            row=1, col=col
        )
        # Scatter points for the current projection, added after the edges so
        # they are drawn on top. All panels share the same colour scale, so
        # only the first one shows the colour bar (three would overlap).
        fig.add_trace(
            go.Scatter(
                x=coords[:, i],
                y=coords[:, j],
                mode="markers",
                marker=dict(size=6, color=df_space["Base fee"], showscale=(col == 1),
                            colorbar=dict(title="Base fee"), colorscale="amp", cmin=40, cmax=125),
                hovertext=ids,
                hoverinfo="text",
                showlegend=False
            ),
            row=1, col=col
        )

    fig.update_layout(
        title="Vietoris-Rips Complex Projections",
        template="plotly_dark",
        height=500, width=1200,
    )
    # Keep every panel's axes independent of the others.
    for idx in range(1, 4):
        fig.update_xaxes(matches=None, row=1, col=idx)
        fig.update_yaxes(matches=None, row=1, col=idx)

    return fig

# Last expression of the cell: the returned figure is shown as the cell output.
rips_projections()

In [11]:
def persistence_and_barcode(max_edge=1.0):
    """
    Compute Vietoris-Rips persistence and plot the persistence diagram and barcode.

    Reads the notebook-level ``dist_matrix``.

    Parameters
    ----------
    max_edge : float
        Maximum edge length of the Rips complex. It is also the value at
        which infinite death times are drawn and the end of the diagonal.

    Returns
    -------
    Plotly figure with the persistence diagram in the left panel and the
    barcode in the right panel.
    """
    # Build the Rips complex with simplices up to dimension 4.
    # NOTE(review): the comment that used to sit here said "persistence up to
    # dimension 2", which does not match max_dimension=4; confirm the
    # intended value. Only H0-H2 have dedicated colours below, any other
    # dimension is drawn in gray.
    rips = gd.RipsComplex(distance_matrix=dist_matrix, max_edge_length=max_edge)
    st = rips.create_simplex_tree(max_dimension=4)
    # persistence() computes the pairs itself, so a separate
    # compute_persistence() call beforehand would only repeat the work.
    pairs = st.persistence()

    # Group (birth, death) by homology dimension; features that never die
    # are clipped to max_edge so they can be drawn.
    dims = {}
    for dim, (b, d) in pairs:
        if d == float('inf'):
            d = max_edge
        dims.setdefault(dim, []).append((b, d))

    colors = {0: red, 1: "#ff657f", 2: "#fac9c9"}
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Persistence Diagram", "Barcode"),
        horizontal_spacing=0.1
    )

    # Persistence diagram
    for dim, bd in dims.items():
        births, deaths = zip(*bd)
        fig.add_trace(
            go.Scatter(
                x=births, y=deaths, mode="markers",
                marker=dict(color=colors.get(dim, "gray"), size=9, opacity=0.85),
                name=f"H{dim}", legendgroup=str(dim)
            ),
            row=1, col=1
        )
    # Diagonal
    fig.add_trace(
        go.Scatter(
            x=[0, max_edge], y=[0, max_edge], mode="lines",
            line=dict(color="white", dash="dash"), showlegend=False
        ),
        row=1, col=1
    )

    # Barcode: one horizontal bar per feature, stacked on consecutive rows.
    # All bars of a dimension share one trace (``None`` separates the bars)
    # instead of one trace per bar, which keeps the figure light.
    y = 0
    for dim, bd in dims.items():
        bar_x, bar_y = [], []
        for b, d in bd:
            bar_x.extend((b, d, None))
            bar_y.extend((y, y, None))
            y += 1
        fig.add_trace(
            go.Scatter(
                x=bar_x, y=bar_y, mode="lines",
                line=dict(color=colors.get(dim, "gray"), width=4),
                name=f"H{dim}", legendgroup=str(dim),
                showlegend=False
            ),
            row=1, col=2
        )

    fig.update_xaxes(title_text="Birth", row=1, col=1)
    fig.update_yaxes(title_text="Death", row=1, col=1)
    fig.update_xaxes(title_text="Filtration", row=1, col=2)
    fig.update_yaxes(visible=False, row=1, col=2)
    fig.update_layout(
        title="Persistent Homology: Diagram & Barcode",
        template="plotly_dark",
        width=1000, height=600,
    )
    return fig

# Last expression of the cell: the returned figure is shown as the cell output.
persistence_and_barcode()

In [12]:
def betti_evolution(edge_lengths=np.linspace(0.1, 1.1, 100), max_dim=2):
    """
    Plot how the Betti numbers H0 ... H{max_dim} change with the Rips scale.

    For every value in ``edge_lengths`` a Vietoris-Rips complex is built on
    the notebook-level ``distance_matrix`` with that maximum edge length,
    and its Betti numbers are read off. Returns a Plotly line figure with
    one line per homology dimension.
    """
    n_groups = max_dim + 1

    def betti_row(eps):
        # One table row: the scale followed by exactly n_groups Betti counts.
        rips = gd.RipsComplex(distance_matrix=distance_matrix, max_edge_length=eps)
        tree = rips.create_simplex_tree(max_dimension=n_groups)
        tree.compute_persistence()
        counts = tree.betti_numbers()
        # Dimensions the library did not report count as zero.
        counts += [0] * (n_groups - len(counts))
        return (eps, *counts[:n_groups])

    wide = pd.DataFrame(
        [betti_row(eps) for eps in edge_lengths],
        columns=["epsilon"] + [f"H{d}" for d in range(n_groups)],
    )
    tidy = wide.melt(id_vars="epsilon", var_name="Homology dim", value_name="Count")

    # One line per homology dimension
    palette = {"H0": red, "H1": "#ff657f", "H2": "#d79c9c"}
    axis_labels = {"epsilon": "Max edge length", "Count": "Betti count"}
    fig = px.line(
        tidy,
        x="epsilon",
        y="Count",
        color="Homology dim",
        title="Evolution of Betti Numbers vs. Rips Scale",
        labels=axis_labels,
        color_discrete_map=palette,
        template="plotly_dark",
    )
    fig.update_layout(
        font=dict(color="white"),
        legend_title_text="k-th homology group",
    )
    return fig

# Last expression of the cell: the returned figure is shown as the cell output.
betti_evolution()


In [13]:
def compute_volatility_features(spike_thresh=0.1):
    """
    Summarise how much each listing's price moves around its own mean.

    Works on the notebook-level ``df_ts_interp`` restricted to the ``dates``
    columns. With dev = (price - row mean) / row mean, each listing gets:
      - std_dev:    standard deviation of dev
      - max_dev:    largest |dev|
      - spike_freq: share of date columns where |dev| > spike_thresh
    Returns a DataFrame indexed by ID, rounded to 3 decimals.
    """
    # Relative deviation of every daily price from the listing's mean price
    prices = df_ts_interp.set_index("ID")[dates].astype(float)
    mean_price = prices.mean(axis=1)
    rel_dev = prices.sub(mean_price, axis=0).div(mean_price, axis=0)
    abs_dev = rel_dev.abs()
    n_days = rel_dev.shape[1]

    summary = pd.DataFrame({
        "std_dev":    rel_dev.std(axis=1),
        "max_dev":    abs_dev.max(axis=1),
        "spike_freq": (abs_dev > spike_thresh).sum(axis=1) / n_days,
    })
    return summary.round(3)

In [14]:
import numpy as np
import pandas as pd
import plotly.express as px

# 3. Multiparameter “persistence” via Betti₀ rank‐invariant
def bipersistence_heatmap(df_summary, n_steps=40):
    """
    Heatmap of a two-threshold count surface over price level and volatility.

    ``df_summary`` must provide the columns "price_mean" and "price_std".
    Both ranges are cut into ``n_steps`` evenly spaced thresholds, and each
    cell (v, p) holds the number of rows with price_std <= v and
    price_mean <= p. Returns the Plotly figure.
    """
    mean_vals = df_summary["price_mean"].values
    std_vals = df_summary["price_std"].values
    p_grid = np.linspace(mean_vals.min(), mean_vals.max(), n_steps)
    v_grid = np.linspace(std_vals.min(), std_vals.max(), n_steps)

    # Row i of `under_v` marks the listings at or below v_grid[i]; likewise
    # `under_p` for p_grid. Their product counts, for every (v, p) pair, the
    # listings that satisfy both conditions.
    under_v = (std_vals[None, :] <= v_grid[:, None]).astype(int)
    under_p = (mean_vals[None, :] <= p_grid[:, None]).astype(int)
    comp_counts = (under_v @ under_p.T).astype(int)

    # Label rows with the volatility thresholds and columns with the price thresholds
    table = pd.DataFrame(
        comp_counts,
        index=np.round(v_grid, 3),
        columns=np.round(p_grid, 3),
    )
    axis_labels = dict(x=f"price_mean ≤ p", y=f"price_std ≤ v", color="β₀ count")
    fig = px.imshow(
        table,
        labels=axis_labels,
        x=table.columns,
        y=table.index,
        title=f"Approximate β₀(p,v) — Price vs Volatility",
        aspect="auto",
        template="plotly_dark",
        color_continuous_scale="amp_r",
    )
    fig.update_xaxes(side="bottom")
    return fig


# Per-listing mean and standard deviation of the raw prices (not log-prices),
# indexed by ID. Named aggregation replaces a per-group apply that built a
# pd.Series for every listing.
price_summary = df_prices.groupby("ID")["Value"].agg(
    price_mean="mean",
    price_std="std",
)
# Attach the volatility features; the merge keeps only the IDs present in both tables.
dfa = price_summary.reset_index().merge(compute_volatility_features(), left_on="ID", right_index=True)
# Last expression of the cell: the returned figure is shown as the cell output.
bipersistence_heatmap(dfa)

In [20]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
import gudhi as gd

def sliding_window_persistence_multi(listing_ids, embedding_dim=3, max_edge_length=1.4):
    """
    Plot sliding-window persistence diagrams, one subplot per listing ID.

    For every ID the price series is read from the notebook-level
    ``df_ts_interp`` (``dates`` columns), standardised, and turned into a
    point cloud of consecutive windows of length ``embedding_dim``. A
    Vietoris-Rips complex with edges up to ``max_edge_length`` is built on
    that cloud and its persistence pairs are drawn.

    Parameters
    ----------
    listing_ids : sequence of str
        IDs to plot, laid out three per row. IDs with fewer than
        ``embedding_dim`` values (including IDs that are not found) leave
        their subplot empty.
    embedding_dim : int
        Window length, i.e. the dimension of the point cloud.
    max_edge_length : float
        Rips scale; also the value at which infinite deaths are drawn.

    Returns
    -------
    Plotly figure.
    """
    n = len(listing_ids)
    cols = 3
    # At least one row, so an empty ID list still yields a valid (empty) grid.
    rows = max(1, int(np.ceil(n / cols)))
    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=listing_ids,
        horizontal_spacing=0.05, vertical_spacing=0.1
    )
    color_map = {0: "#ff4136", 1: "#0074D9", 2: "#2ECC40"}
    # Homology dimensions that already own a legend entry. Tracking this
    # (instead of "first subplot only") keeps the legend complete when the
    # first listing is skipped or lacks a dimension.
    legend_shown = set()

    for idx, lid in enumerate(listing_ids):
        # Calculate row and column for the subplot
        r = idx // cols + 1
        c = idx % cols + 1
        raw = df_ts_interp.loc[df_ts_interp["ID"] == lid, dates] \
                .values.flatten().astype(float)
        if len(raw) < embedding_dim:
            continue

        # Standardise the series and embed it as overlapping windows
        series = StandardScaler().fit_transform(raw.reshape(-1, 1)).flatten()
        N = len(series) - embedding_dim + 1
        cloud = np.stack([series[i:i + embedding_dim] for i in range(N)])
        rips = gd.RipsComplex(points=cloud, max_edge_length=max_edge_length)
        st = rips.create_simplex_tree(max_dimension=3)

        # Collect births/deaths by dimension. Births and deaths are filtration
        # values (edge lengths), so features that never die are clipped to
        # the filtration bound max_edge_length, not to a value of the series.
        # persistence() computes the pairs itself; no separate
        # compute_persistence() call is needed.
        dims = {}
        for d, (b, e) in st.persistence():
            e = e if e != float("inf") else max_edge_length
            dims.setdefault(d, []).append((b, e))

        # One marker trace per homology dimension
        for d, pts in dims.items():
            births, deaths = zip(*pts)
            fig.add_trace(
                go.Scatter(
                    x=births, y=deaths, mode="markers",
                    marker=dict(color=color_map.get(d, "gray"), size=6),
                    name=f"H{d}", legendgroup=f"dim{d}",
                    showlegend=(d not in legend_shown)
                ),
                row=r, col=c
            )
            legend_shown.add(d)

        # Diagonal birth == death over the whole filtration range
        fig.add_trace(
            go.Scatter(
                x=[0, max_edge_length], y=[0, max_edge_length], mode="lines",
                line=dict(color="white", dash="dash"),
                showlegend=False
            ),
            row=r, col=c
        )
        fig.update_xaxes(title_text="Birth", row=r, col=c)
        fig.update_yaxes(title_text="Death", row=r, col=c)

    fig.update_layout(
        title=f"Sliding‐Window Persistence (embed={embedding_dim}, edge≤{max_edge_length})",
        template="plotly_dark",
        height=300*rows, width=300*cols
    )
    return fig

# example usage:
# Nine listing IDs, one per subplot of the 3x3 grid.
# NOTE(review): "1315344706047362032" appears twice (4th and 9th entry), so
# two subplots show the same listing; confirm whether a different ID was meant.
listing_ids = [
    "865417719613815681","1364376551860961934","1316158964673335895",
    "1315344706047362032","1282890977555839180","1038158992459515932",
    "846281101302571252","897405006229683800","1315344706047362032"
]
# Last expression of the cell: the returned figure is shown as the cell output.
sliding_window_persistence_multi(listing_ids)

In [16]:
# Preview the first rows of the long-format price table (columns ID, Date, Value).
df_prices.head()

Unnamed: 0,ID,Date,Value
0,1282890977555839180,2025-05-26,113.17
1,1255908021842197930,2025-05-26,114.5
2,1364376551860961934,2025-05-26,111.0
3,1279218191310817247,2025-05-26,117.5
4,1316158964673335895,2025-05-26,90.5
