In [3]:
import dash
import dash_bootstrap_components as dbc
from dash import html, dcc, dash_table
import sqlite3
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import scipy.stats as stats
import plotly.express as px
import re
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
import plotly.figure_factory as ff

_conn = sqlite3.connect("airbnb_cartagena.sqlite")
df_attr = pd.read_sql_query("SELECT * FROM Attributes", _conn, dtype={"ID": str})
df_ts = pd.read_sql_query("SELECT * FROM TimeSeriesRaw", _conn, dtype={"ID": str})
df_ts_interp = pd.read_sql_query("SELECT * FROM TimeSeriesInterpolated", _conn, dtype={"ID": str})
_conn.close()

In [4]:
red = "#7e0d24"  # dark red color for plots

import re
dates = [col for col in df_ts.columns if re.fullmatch(r"\d{1,2}/\d{1,2}/\d{4}", col)]
df_ts_interp = df_ts_interp.dropna(subset=dates, how="any").reset_index(drop=True)

In [5]:
import numpy as np
import pandas as pd
from ripser import ripser
from persim import plot_diagrams
import matplotlib.pyplot as plt

# assume df_ts_interp is your DataFrame loaded from SQL, and `dates` is your list of date-columns
# df_ts_interp: columns ["ID", "Name", <dates...>]

def sliding_window(series: np.ndarray, window_size: int, delay: int = 1) -> np.ndarray:
    """
    Build a delay-embedding of a 1D series.
    Returns an (N - (window_size-1)*delay) x window_size matrix.
    """
    n_points = len(series)
    m = window_size
    tau = delay
    L = n_points - (m - 1) * tau
    if L <= 0:
        raise ValueError("window_size too large for series length")
    X = np.empty((L, m))
    for i in range(L):
        X[i] = series[i : i + m*tau : tau]
    return X

def compute_listing_persistence(df_interp: pd.DataFrame,
                                dates: list[str],
                                window_size: int = 14,
                                delay: int = 1):
    """
    For each listing, compute its 1D & 2D persistence diagrams
    from a sliding-window embedding of its price series.
    Returns a dict: { listing_id: (dgms1, dgms2) }.
    """
    results = {}
    for _, row in df_interp.iterrows():
        series = row[dates].astype(float).values
        # mask any remaining NaNs by interpolation or dropping:
        if np.any(np.isnan(series)):
            series = pd.Series(series).interpolate(limit_direction='both').fillna(method='bfill').fillna(method='ffill').values
        X = sliding_window(series, window_size, delay)
        dgms = ripser(X, maxdim=2)['dgms']
        results[row["ID"]] = dgms
    return results

def compute_global_persistence(df_interp: pd.DataFrame, dates: list[str]):
    """
    Treat each day as a point in R^n (n = number of listings):
      X[i] = ( price_listing1_on_day_i, ..., price_listingN_on_day_i )
    Compute its persistent homology across time.
    Returns a pair of diagrams [H0, H1, H2].
    """
    # build day × listing matrix
    M = df_interp.set_index("ID")[dates].T.astype(float)
    # fill NaNs (if any)
    M = M.interpolate(axis=0, limit_direction='both').fillna(method='bfill').fillna(method='ffill')
    X = M.values  # shape (n_days, n_listings)
    dgms = ripser(X, maxdim=2)['dgms']
    return dgms

In [6]:
import numpy as np
import pandas as pd
from ripser import ripser
import plotly.graph_objects as go

def sliding_window(series: np.ndarray, window_size: int, delay: int = 1) -> np.ndarray:
    L = len(series) - (window_size - 1) * delay
    X = np.stack([series[i : i + L*delay : delay] for i in range(window_size)], axis=1)
    return X

def compute_listing_persistence(df_interp: pd.DataFrame,
                                dates: list[str],
                                window_size: int = 14,
                                delay: int = 1):
    """
    Returns dict: listing_id → [H0_diagram, H1_diagram, H2_diagram]
    """
    diagrams = {}
    for _, row in df_interp.iterrows():
        s = row[dates].astype(float).values
        # fill any NaNs
        s = pd.Series(s).interpolate().fillna(method='bfill').fillna(method='ffill').values
        X = sliding_window(s, window_size, delay)
        dgms = ripser(X, maxdim=2)['dgms']
        diagrams[row["ID"]] = dgms
    return diagrams

def compute_global_persistence(df_interp: pd.DataFrame, dates: list[str]):
    """
    Treat each day as a point in R^n and compute its persistence.
    """
    M = df_interp.set_index("ID")[dates].T.astype(float)
    M = M.interpolate(axis=0).fillna(method='bfill').fillna(method='ffill')
    dgms = ripser(M.values, maxdim=2)['dgms']
    return dgms

def plot_diagram_plotly(dgm: np.ndarray, dim: int, title: str) -> go.Figure:
    """
    Plot a single H_dim persistence diagram with Plotly.
    """
    births, deaths = dgm[:,0], dgm[:,1]
    # diagonal line for reference
    mn, mx = min(births.min(), deaths.min()), max(births.max(), deaths.max())
    fig = go.Figure([
        go.Scatter(x= births, y= deaths,
                   mode='markers',
                   marker=dict(size=6, color='crimson'),
                   name=f"H{dim}"),
        go.Scatter(x=[mn, mx], y=[mn, mx],
                   mode='lines',
                   line=dict(color='white', dash='dash'),
                   showlegend=False)
    ])
    fig.update_layout(
        template="plotly_dark",
        title=title,
        xaxis_title="Birth",
        yaxis_title="Death",
        xaxis=dict(range=[mn, mx]),
        yaxis=dict(range=[mn, mx]),
        width=500, height=500
    )
    return fig

In [7]:
listing_diagrams = compute_listing_persistence(df_ts_interp, dates, window_size=14, delay=1)
global_diagrams  = compute_global_persistence(df_ts_interp, dates)

# 2) plot example (in Dash you’d wrap these in dcc.Graph(..., figure=...))
#    a) first listing H0 & H1
first_id, (d0, d1, d2) = next(iter(listing_diagrams.items()))
plot_diagram_plotly(d0, 0, f"Listing {first_id} H₀")


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecat

In [14]:
import kmapper as km
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# 4.8.1 Mapper on Price vs. Volatility Filter
def mapper_price_volatility(df_attr, df_vol, filter_key="vol_max",
                            cover_n_cubes=10, cover_overlap=0.3,
                            cluster_eps=0.5, cluster_min_samples=5):
    """
    Runs Mapper on the combined attribute + volatility features,
    using `filter_key` (e.g. 'vol_max' or 'Price_mean') as lens.
    Returns a Plotly Figure of the Mapper graph.
    """
    # 1) build feature matrix X
    X_attr = df_attr.select_dtypes(include="number").copy()
    X_attr = X_attr.drop(columns=["latitude","longitude","Reviews","Base fee"], errors='ignore')
    X_vol = df_vol[["std_dev","max_dev","spike_freq"]]
    X = pd.concat([X_attr, X_vol], axis=1).values
    X = StandardScaler().fit_transform(X)
    # 2) lens = the chosen filter
    lens = df_vol[filter_key].values.reshape(-1,1)
    # 3) init mapper
    mapper = km.KeplerMapper(verbose=0)
    graph = mapper.map(
        lens,
        X,
        cover=km.Cover(n_cubes=cover_n_cubes, perc_overlap=cover_overlap),
        clusterer=DBSCAN(eps=cluster_eps, min_samples=cluster_min_samples)
    )
    # 4) extract nodes & edges
    nodes = graph["nodes"]
    edges = graph["links"]
    # 5) build scatter for nodes
    node_x, node_y = [], []
    for node_id, member_ids in nodes.items():
        # position node at mean of its members' lens value
        vals = lens[member_ids].flatten()
        node_x.append(vals.mean())
        node_y.append(len(member_ids))
    fig = go.Figure()
    # edges
    for u,v in edges:
        fig.add_trace(go.Scatter(
            x=[node_x[u], node_x[v]],
            y=[node_y[u], node_y[v]],
            mode="lines",
            line=dict(color="gray", width=1),
            hoverinfo="none",
            showlegend=False
        ))
    # nodes
    fig.add_trace(go.Scatter(
        x=node_x, y=node_y,
        mode="markers",
        marker=dict(size=[len(nodes[i])*2 for i in nodes],
                    color=[lens[members].mean() for members in nodes.values()],
                    colorscale="Hot",
                    showscale=True,
                    colorbar=dict(title=filter_key)),
        text=[f"Node {i}<br>size={len(nodes[i])}" for i in nodes],
        hoverinfo="text"
    ))
    fig.update_layout(
        title=f"Mapper Graph (filter={filter_key})",
        xaxis_title=filter_key,
        yaxis_title="Node size",
        template="plotly_dark"
    )
    return fig


# 4.8.2 Mapper on Spatial + Price/Amenity Filter
def mapper_spatial_price(df_attr, filter_key="Base fee",
                         cover_n_cubes=10, cover_overlap=0.3,
                         cluster_eps=0.5, cluster_min_samples=5):
    """
    Runs Mapper on (latitude,longitude) + selected price/amenity features,
    using `filter_key` (e.g. 'Base fee') as lens.
    """
    # build X = coords + numeric features
    coords = df_attr[["latitude","longitude"]].values
    feats = df_attr.select_dtypes(include="number").drop(columns=["latitude","longitude"], errors='ignore')
    X = np.hstack([coords, StandardScaler().fit_transform(feats)])
    # lens = the chosen filter (scaled)
    lens = StandardScaler().fit_transform(df_attr[[filter_key]].values)
    mapper = km.KeplerMapper(verbose=0)
    graph = mapper.map(
        lens,
        X,
        cover=km.Cover(n_cubes=cover_n_cubes, perc_overlap=cover_overlap),
        clusterer=DBSCAN(eps=cluster_eps, min_samples=cluster_min_samples)
    )
    nodes, edges = graph["nodes"], graph["links"]
    # position each node by mean latitude/longitude of its members
    node_x, node_y = [], []
    for members in nodes.values():
        latlons = coords[members]
        node_x.append(latlons[:,1].mean())  # lon
        node_y.append(latlons[:,0].mean())  # lat
    fig = go.Figure()
    # draw edges
    for u,v in edges:
        fig.add_trace(go.Scattermapbox(
            lon=[node_x[u], node_x[v]],
            lat=[node_y[u], node_y[v]],
            mode="lines",
            line=dict(color="gray", width=1),
            showlegend=False
        ))
    # draw nodes
    fig.add_trace(go.Scattermapbox(
        lon=node_x, lat=node_y,
        mode="markers",
        marker=dict(
            size=[len(nodes[i])*3 for i in nodes],
            color=[df_attr[filter_key].iloc[list(nodes[i])].mean() for i in nodes],
            colorscale="Hot_r",
            showscale=True,
            colorbar=dict(title=filter_key)
        ),
        text=[f"Node {i}<br>size={len(nodes[i])}" for i in nodes],
        hoverinfo="text"
    ))
    fig.update_layout(
        mapbox_style="carto-darkmatter",
        mapbox_center={"lat": coords[:,0].mean(), "lon": coords[:,1].mean()},
        mapbox_zoom=11,
        title="Spatial + Price/Amenity Mapper",
        #margin=dict(l=0,r=0,t40,b0),
        template="plotly_dark"
    )
    return fig


In [16]:

# 4.3.0 Compute Volatility Features
def compute_volatility_features(spike_thresh=0.1):
    """
    For each listing (row), compute:
      - std_dev:  standard deviation of (price - mean)/mean
      - max_dev:  maximum absolute deviation
      - spike_freq: fraction of days with |dev| > spike_thresh
    Returns a DataFrame indexed by ID.
    """
    # Build deviation matrix
    mat = df_ts_interp.set_index("ID")[dates].astype(float)
    row_means = mat.mean(axis=1)
    dev = mat.sub(row_means, axis=0).div(row_means, axis=0)

    # Extract features
    std_dev    = dev.std(axis=1)
    max_dev    = dev.abs().max(axis=1)
    spike_freq = (dev.abs() > spike_thresh).sum(axis=1) / dev.shape[1]
    
    feats = pd.DataFrame({
        "std_dev":    std_dev,
        "max_dev":    max_dev,
        "spike_freq": spike_freq
    }).round(3)
    return feats

def cluster_volatility(feats, k=4):
    """
    Fit KMeans on the features DataFrame and return
    a new DataFrame with a 'cluster' column (as string).
    """
    km = KMeans(n_clusters=k, random_state=0)
    labels = km.fit_predict(feats)
    df = feats.copy()
    df["cluster"] = labels.astype(str)
    return df

In [17]:
feats = compute_volatility_features()
df_vol = cluster_volatility(feats)
mapper_price_volatility(df_attr, df_vol,"vol_max")

KeyError: 'vol_max'

In [18]:
from tslearn.metrics import cdist_dtw
from ripser import ripser
from persim import plot_diagrams
import plotly.graph_objects as go

# 3.1 Build similarity graph & compute VR persistent homology
def networked_timeseries_persistence(dates, max_dim=1, thresh=None):
    """
    1) Compute pairwise DTW distances between each listing’s interpolated price series.
    2) Build a Vietoris–Rips filtration on that distance matrix (optionally thresholded).
    3) Return and plot the persistence diagrams (H0 & H1).
    """
    # 1) extract the matrix (listings × time)
    X = df_ts_interp[dates].astype(float).values
    # 2) compute DTW distance matrix (slow for N>200)
    D = cdist_dtw(X)  
    # optional threshold to sparsify?
    if thresh is not None:
        D = np.minimum(D, thresh)

    # 3) compute persistent homology
    diagrams = ripser(D, distance_matrix=True, maxdim=max_dim)['dgms']

    # 4) convert to Plotly figure
    fig = go.Figure()
    colors = ["#7e0d24","#dddddd"]
    for dim, dgm in enumerate(diagrams):
        # scatter of (birth, death)
        fig.add_trace(go.Scatter(
            x=dgm[:,0], y=dgm[:,1],
            mode="markers",
            marker=dict(color=colors[dim], size=6, opacity=0.8),
            name=f"H{dim}"
        ))
    # add diagonal reference
    lim = [0, np.nanmax([d.max() for d in diagrams])]
    fig.add_trace(go.Scatter(
        x=lim, y=lim, mode="lines",
        line=dict(color="gray", dash="dash"),
        showlegend=False
    ))
    fig.update_layout(
        title="Vietoris–Rips Persistence (DTW)",
        xaxis_title="Birth", yaxis_title="Death",
        template="plotly_dark",
        width=600, height=600
    )
    return fig, diagrams

# 3.2 Plot the resulting barcodes as well
def networked_timeseries_barcodes(diagrams):
    """
    Given ripser output `diagrams`, show barcodes for H0 and H1.
    """
    fig = go.Figure()
    y_off = 0
    height = 300
    for dim, dgm in enumerate(diagrams):
        for (b, d) in dgm:
            fig.add_trace(go.Bar(
                x=[d - b],
                y=[y_off],
                base=b,
                orientation='h',
                width=0.4,
                marker=dict(color="#7e0d24" if dim==1 else "#dddddd"),
                showlegend=(dim==0 and y_off==0),
                name=f"H{dim}" if y_off==0 else None,
            ))
            y_off += 1
        # leave a gap between dimensions
        y_off += 1
    fig.update_layout(
        title="Persistence Barcodes",
        xaxis_title="Filtration Value",
        yaxis=dict(showticklabels=False),
        template="plotly_dark",
        height=height + 20*y_off, width=800
    )
    return fig


In [19]:
diag_fig, diags = networked_timeseries_persistence(dates)
barcode_fig     = networked_timeseries_barcodes(diags)
diag_fig

In [21]:
diag_fig

In [25]:
def _build_attr_vol_matrix():
    # 1) Volatility metrics
    mat = df_ts_interp.set_index("ID")[dates].astype(float)
    row_means = mat.mean(axis=1)
    dev = mat.sub(row_means, axis=0).div(row_means, axis=0)
    vol_max = dev.abs().max(axis=1).rename("vol_max")
    vol_std = dev.std(axis=1).rename("vol_std")

    # 2) Numeric attributes (drop coords & ID)
    num = df_attr.set_index("ID").select_dtypes(include="number").drop(
        ["latitude", "longitude"], errors="ignore", axis=1
    )

    # 3) Merge
    X = pd.concat([num, vol_max, vol_std], axis=1).dropna()
    return X

In [26]:
from ripser import ripser
import numpy as np
import plotly.graph_objects as go

# 4.x.1 Compute & plot VR persistence on feature‐space
def feature_space_persistence(n_components=1, max_dim=1):
    """
    Build Vietoris–Rips filtration on standardized attribute+volatility features,
    compute persistent homology up to dim=max_dim, and return a Diagram plot.
    """
    # 1) Assemble & scale feature matrix
    X = _build_attr_vol_matrix().values  # uses your helper
    Xs = StandardScaler().fit_transform(X)

    # 2) Compute persistence (distance matrix = Euclidean by default)
    result = ripser(Xs, maxdim=max_dim)
    diagrams = result['dgms']

    # 3) Plot diagram(s)
    fig = go.Figure()
    colors = ["#7e0d24", "#dddddd"]
    for dim, dgm in enumerate(diagrams):
        fig.add_trace(go.Scatter(
            x=dgm[:,0], y=dgm[:,1],
            mode="markers",
            marker=dict(color=colors[dim], size=6, opacity=0.8),
            name=f"H{dim}"
        ))
    # diagonal
    extent = [0, np.nanmax([d.max() for d in diagrams])]
    fig.add_trace(go.Scatter(x=extent, y=extent,
                             mode="lines",
                             line=dict(color="gray", dash="dash"),
                             showlegend=False))
    fig.update_layout(
        title="Vietoris–Rips Persistence on Attribute+Volatility Space",
        xaxis_title="Birth", yaxis_title="Death",
        template="plotly_dark", width=600, height=600
    )
    return fig, diagrams

# 4.x.2 Plot barcodes for feature‐space
def feature_space_barcodes(diagrams):
    """
    Given the diagrams from feature_space_persistence, plot H0/H1 barcodes.
    """
    fig = go.Figure()
    y_off = 0
    for dim, dgm in enumerate(diagrams):
        for b, d in dgm:
            fig.add_trace(go.Bar(
                x=[d - b],
                y=[y_off],
                base=b,
                orientation='h',
                marker=dict(color="#7e0d24" if dim==1 else "#dddddd"),
                showlegend=(y_off==0),
                name=f"H{dim}"
            ))
            y_off += 1
        y_off += 1  # gap between dims
    fig.update_layout(
        title="Feature‐Space Persistence Barcodes",
        xaxis_title="Filtration Value",
        yaxis=dict(showticklabels=False),
        template="plotly_dark",
        height=200 + 20*y_off, width=800
    )
    return fig


In [27]:
diag_fig, diags = feature_space_persistence()
barcode_fig   = feature_space_barcodes(diags)
diag_fig


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul

