In [3]:
import dash
import dash_bootstrap_components as dbc
from dash import html, dcc, dash_table
import sqlite3
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import scipy.stats as stats
import plotly.express as px
import re
import random
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
import plotly.figure_factory as ff
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold
from scipy.spatial.distance import pdist, squareform
import umap

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

_conn = sqlite3.connect("airbnb_cartagena.sqlite")
df_attr = pd.read_sql_query("SELECT * FROM Attributes", _conn, dtype={"ID": str})
df_ts = pd.read_sql_query("SELECT * FROM TimeSeriesRaw", _conn, dtype={"ID": str})
df_ts_interp = pd.read_sql_query("SELECT * FROM TimeSeriesInterpolated", _conn, dtype={"ID": str})
_conn.close()

red = "#7e0d24"  # dark red color for plots
dates = [col for col in df_ts.columns if re.fullmatch(r"\d{1,2}/\d{1,2}/\d{4}", col)]
df_ts_interp = df_ts_interp.dropna(subset=dates, how="any").reset_index(drop=True)

In [5]:
def build_umap_and_distances():
    # Drop some columns that are not needed
    df_temp = df_attr.copy()
    """
    df_temp=df_temp[[
    'Name', 'Host', 'Base fee', 'Cleaning fee', 'URL', 'ID', 'latitude',
    'longitude', 'Property type', 'Person capacity', 'accuracy_rating',
    'checking_rating', 'cleanliness_rating', 'communication_rating',
    'location_rating', 'value_rating', 'satisfaction_rating', 'Reviews',
    'Bedrooms', 'Beds', 'Baths', 'City skyline view', 'Beach view',
    'Sea/Lake view', 'Hot water', 'Jacuzzi', 'Shared pool', 'Shared gym',
    'Patio or balcony', 'Outdoor furniture', 'Outdoor playground',
    'Elevator', 'Carport', 'Dedicated workspace', 'AC', 'Heating', 'TV',
    'Cable TV', 'Wifi', 'Laundry service', 'Kitchen', 'Dining table',
    'Microwave', 'Dishes and silverware', 'Refrigerator', 'Stove', 'Keypad',
    'Washer', 'Pets allowed', 'Crib', 'Security cameras', 'Lock on door']]
    to_keep = [
        "Keypad", "Lock on door", "Smoke detector", "Security cameras", "AC", "Heating", 
        "Patio or balcony", "Stove", "Elevator", "Refrigerator", "Kitchen", "Wifi", 
        "TV", "Jacuzzi", "Carport", "Hot water", 
    ]
    df_temp = df_temp[["ID", "Base fee"] + to_keep]
    """
    df_temp = df_temp.iloc[:, :21]

    # Melt time series data to long format
    df_prices = (
        df_ts_interp.copy()
        .melt(id_vars="ID", value_vars=dates, var_name="Date", value_name="Value")
        .assign(Date=lambda d: pd.to_datetime(d["Date"], dayfirst=True))
    )

    # Summarize per‐listing log‐price mean, std, and trend
    def summarize(group):
        #y = np.log1p(group["Value"].replace(0, np.nan))  # Avoid log(0)
        y = np.log1p(group["Value"])
        #y = group["Value"]
        days = (group["Date"] - group["Date"].min()).dt.days.values.reshape(-1,1)
        lr = LinearRegression().fit(days, y) if len(np.unique(days))>1 else None
        return pd.Series({
            "price_mean": y.mean(),
            "price_std":  y.std(),
            "price_trend": lr.coef_[0] if lr else 0.0
        })
    df_price_summary = (
        df_prices
        .groupby("ID", group_keys=False)
        .apply(summarize)
    )
    df_merged = df_attr.merge(df_price_summary, on="ID")

    # Filter out near‐constant / low‐variance features
    selector = VarianceThreshold(threshold=0.1)
    X = selector.fit_transform(df_merged.select_dtypes("number"))
    to_keep = df_merged.select_dtypes("number").columns[selector.get_support()]

    # Drop highly correlated (>0.9)
    df_reduced = pd.DataFrame(X, columns=to_keep)
    corr = df_reduced.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > 0.9).any()]
    df_space = df_reduced.drop(columns=to_drop)

    # Scale + UMAP embedding
    X_scaled = StandardScaler().fit_transform(df_space)
    umap_proj = umap.UMAP(n_components=3, n_neighbors=30, min_dist=0.1, random_state=69).fit_transform(X_scaled)

    # Build distance matrix and UMAP space DataFrame
    df_space = df_merged.loc[df_space.index, ['ID','Base fee']].reset_index(drop=True)
    df_space[['UMAP1','UMAP2','UMAP3']] = umap_proj
    dist_matrix = squareform(pdist(df_space.drop(columns=["ID", 'Base fee']).values, metric="euclidean"))
    df_dist = pd.DataFrame(dist_matrix, index=df_space['ID'], columns=df_space['ID'])
    
    return df_space, df_dist, dist_matrix, df_prices

In [7]:
df_space, df_dist, dist_matrix, df_prices = build_umap_and_distances()

# Model 1

In [None]:
import sqlite3, re
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import umap, gudhi as gd
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# 1) LOAD DATA FROM SQLITE
_conn = sqlite3.connect("airbnb_cartagena.sqlite")
df_attr      = pd.read_sql("SELECT * FROM Attributes", _conn, dtype={"ID": str})
df_ts_interp = pd.read_sql("SELECT * FROM TimeSeriesInterpolated", _conn, dtype={"ID": str})
_conn.close()
dates = [c for c in df_ts_interp.columns if re.fullmatch(r"\d{1,2}/\d{1,2}/\d{4}", c)]
df_ts_interp = df_ts_interp.dropna(subset=dates).reset_index(drop=True)
df_attr = df_attr.set_index("ID").reindex(df_ts_interp["ID"].values).copy() # reindex attributes to match the time-series rows

# 2) PRICE‐TIER TARGET
df_attr["price_tier"] = pd.qcut(df_attr["Base fee"], 3, labels=["Low","Mid","High"])

# 3) TIME‐SERIES SUMMARY FEATURES
def make_ts(df, dates):
    rows = []
    for vals in df[dates].values.astype(float):
        y = np.log1p(vals)
        d = np.arange(len(y))
        coef = np.polyfit(d, y, 1)[0] if len(y)>1 else 0
        s = pd.Series(y)
        row = {
            "ts_mean":  y.mean(),
            "ts_std":   y.std(ddof=0),
            "ts_trend": coef,
            "ts_spike": (s.diff().abs()>0.1).sum() # type: ignore
        }
        for w in (7,14):
            r = s.rolling(w, min_periods=1).agg(["mean","std"]).iloc[-1]
            row[f"roll{w}_mean"], row[f"roll{w}_std"] = r["mean"], r["std"]
        rows.append(row)
    return pd.DataFrame(rows, index=df["ID"])
df_ts_feats = make_ts(df_ts_interp, dates)

# 4) UMAP “neighborhood” embedding on static + TS features
static_num  = df_attr.select_dtypes("number")
static_bool = df_attr.select_dtypes("bool").astype(int)
X_umap       = pd.concat([static_num, static_bool, df_ts_feats], axis=1).fillna(0)
proj         = umap.UMAP(n_components=2, random_state=42).fit_transform(X_umap)
df_attr["UMAP1"], df_attr["UMAP2"] = proj[:,0], proj[:,1] # type: ignore

# 5) Global TDA summaries on UMAP-distance matrix
umap_coords = df_attr[["UMAP1","UMAP2"]].values
dist_matrix = squareform(pdist(umap_coords, metric="euclidean"))
rips = gd.RipsComplex(distance_matrix=dist_matrix, # type: ignore
                             max_edge_length=dist_matrix.max()) 
st = rips.create_simplex_tree(max_dimension=2)
st.compute_persistence()
pers = st.persistence()

tda_feats = {"H0_sum_life":0.0, "H1_sum_life":0.0}
for dim,(b,d) in pers:
    life = (d if d!=float("inf") else dist_matrix.max()) - b
    if dim==0: tda_feats["H0_sum_life"] += life
    if dim==1: tda_feats["H1_sum_life"] += life

# broadcast across all listings
df_tda_feats = pd.DataFrame([tda_feats]*len(df_attr),
                            index=df_attr.index)

# 6) Assemble everything
df = df_attr.join(df_ts_feats).join(df_tda_feats)
num_cols = df.select_dtypes("number").columns
df[num_cols] = df[num_cols].fillna(0)
y        = df["price_tier"]
X_static = df[["Cleaning fee","Bedrooms","Baths","UMAP1","UMAP2"]]
X_ts     = df_ts_feats
X_tda    = df_tda_feats
X_amen   = df.select_dtypes("bool").astype(int)

# 7) Pipeline + model
pre = ColumnTransformer([
    ("num",  Pipeline([("impute",SimpleImputer()),("scale",StandardScaler())]),
     list(X_static)+list(X_ts)+list(X_tda)),
    ("amen", OneHotEncoder(drop="first"), X_amen.columns.tolist())
])
model = Pipeline([
    ("prep", pre),
    ("clf",  XGBClassifier(
        n_estimators=200, max_depth=4, learning_rate=0.05,
        use_label_encoder=False, eval_metric="mlogloss", random_state=0
    ))
])
le    = LabelEncoder()
y_enc = le.fit_transform(y)

# 8) 5‐fold CV
cv     = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
X_all  = pd.concat([X_static, X_ts, X_tda, X_amen], axis=1)
scores = cross_val_score(model, X_all, y_enc, cv=cv, scoring="accuracy")
print("5-fold CV accuracy:", np.round(scores,3))
print("Mean CV accuracy:", np.round(scores.mean(),3))

# 8) hyperparameter tuning with 5-fold CV
param_grid = {
    "clf__n_estimators":    [100, 200, 300],
    "clf__max_depth":       [3, 4, 6],
    "clf__learning_rate":   [0.01, 0.05, 0.1],
    "clf__subsample":       [0.7, 0.9, 1.0],
    "clf__colsample_bytree":[0.7, 0.9, 1.0],
}
grid = GridSearchCV(
    model,
    param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=69),
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_all, y_enc)

print("Best CV score:", round(grid.best_score_, 3))
print("Best params:    ", grid.best_params_)

# 9) Evaluate tuned model on hold-out split
X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, y_enc, stratify=y_enc, test_size=0.25, random_state=69
)

best_model = grid.best_estimator_
best_model.fit(X_tr, y_tr)
y_pr = best_model.predict(X_te)

print("Hold-out accuracy:", round(accuracy_score(y_te, y_pr), 3))
print(classification_report(
    le.inverse_transform(y_te),
    le.inverse_transform(y_pr),
    target_names=le.classes_
))


5-fold CV accuracy: [0.75  0.625 0.625 0.938 0.533]
Mean CV accuracy: 0.694
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best CV score: 0.812
Best params:     {'clf__colsample_bytree': 0.7, 'clf__learning_rate': 0.05, 'clf__max_depth': 3, 'clf__n_estimators': 100, 'clf__subsample': 0.9}
Hold-out accuracy: 0.95
              precision    recall  f1-score   support

        High       0.88      1.00      0.93         7
         Low       1.00      1.00      1.00         7
         Mid       1.00      0.83      0.91         6

    accuracy                           0.95        20
   macro avg       0.96      0.94      0.95        20
weighted avg       0.96      0.95      0.95        20



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score
)

# 10) Save the model and metrics
y_pred = best_model.predict(X_te)
y_score = best_model.predict_proba(X_te)
report = classification_report(
    le.inverse_transform(y_te),
    le.inverse_transform(y_pred),
    target_names=le.classes_,
    output_dict=True
)
metrics_summary = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Value": [
        round(accuracy_score(y_te, y_pred), 3),
        round(report["weighted avg"]["precision"], 3), # type: ignore
        round(report["weighted avg"]["recall"], 3), # type: ignore
        round(report["weighted avg"]["f1-score"], 3) # type: ignore
    ]
}
metrics_df = pd.DataFrame(metrics_summary)
metrics_df.to_csv("models/model1/price_tier_metrics.csv", index=False)

# 11) Confusion matrix
cm = confusion_matrix(y_te, y_pred)
cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
cm_df.to_csv("models/model1/price_tier_confusion_matrix.csv")

# 12) Precision-Recall curves
pr_colors = ["#d79c9c", red, "#c71a37"]
pr_traces = []
for i, cls in enumerate(le.classes_):
    p, r, _ = precision_recall_curve((y_te==i).astype(int), y_score[:,i])
    ap = average_precision_score((y_te==i).astype(int), y_score[:,i])
    pr_traces.append(
        go.Scatter(
            x=r,
            y=p,
            mode="lines",
            name=f"{cls} (AP={ap:.2f})",
            line=dict(color=pr_colors[i % len(pr_colors)])
        )
    )
pr_layout = go.Layout(
    title="Precision–Recall Curves by Class",
    template="plotly_dark",
    xaxis_title="Recall",
    yaxis_title="Precision",
    legend=dict(bgcolor="rgba(0,0,0,0)")
)
pr_fig = go.Figure(data=pr_traces, layout=pr_layout)
pr_fig.write_json("models/model1/price_tier_pr_curves.json")


# Model 3

In [4]:
import sqlite3, re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import umap, gudhi as gd
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

# 1) LOAD DATA FROM SQLITE
_conn = sqlite3.connect("airbnb_cartagena.sqlite")
df_attr       = pd.read_sql("SELECT * FROM Attributes", _conn, dtype={"ID": str})
df_ts_interp  = pd.read_sql("SELECT * FROM TimeSeriesInterpolated", _conn, dtype={"ID": str})
_conn.close()
dates = [c for c in df_ts_interp.columns if re.fullmatch(r"\d{1,2}/\d{1,2}/\d{4}", c)]
df_ts_interp = df_ts_interp.dropna(subset=dates).reset_index(drop=True)
df_attr      = df_attr.set_index("ID").reindex(df_ts_interp["ID"]).reset_index()

# 2) TARGET: average of all *_rating ≥4.8
rating_cols = [c for c in df_attr.columns if c.endswith("_rating")]
df_attr["is_superhost"] = (df_attr[rating_cols].mean(axis=1) >= 4.8).astype(int)

# 3) TIME‐SERIES SUMMARY FEATURES
def make_ts(df, dates):
    rows = []
    for vals in df[dates].values.astype(float):
        y = np.log1p(vals)
        d = np.arange(len(y))
        coef = np.polyfit(d, y, 1)[0] if len(y)>1 else 0
        s = pd.Series(y)
        r7  = s.rolling(7, 1).agg(["mean","std"]).iloc[-1]
        r14 = s.rolling(14,1).agg(["mean","std"]).iloc[-1]
        rows.append({
            "ts_mean": y.mean(), "ts_std": y.std(ddof=0), "ts_trend": coef,
            "roll7_mean": r7["mean"], "roll7_std": r7["std"],
            "roll14_mean": r14["mean"], "roll14_std": r14["std"],
            "ts_spikes": (s.diff().abs()>0.1).sum()
        })
    return pd.DataFrame(rows, index=df["ID"])
df_ts_feats = make_ts(df_ts_interp, dates)
df_ts_feats.index = df_attr.index

# 4) UMAP on static + ts
static_num  = df_attr.select_dtypes(include="number").drop(columns=["is_superhost"])
static_bool = df_attr.select_dtypes(include="bool").astype(int)
X_umap      = pd.concat([static_num, static_bool, df_ts_feats], axis=1).fillna(0)
proj        = umap.UMAP(n_components=2, random_state=69).fit_transform(X_umap)
df_attr["UMAP1"], df_attr["UMAP2"] = proj[:,0], proj[:,1] # type: ignore

# 5) TDA on price volatility window=14
def make_tda(df, dates, w=14):
    rows=[]
    for vals in df[dates].values.astype(float):
        N = len(vals)-w+1
        cloud = np.stack([vals[i:i+w] for i in range(N)])
        st = gd.RipsComplex(points=cloud, max_edge_length=vals.ptp()) \
               .create_simplex_tree(max_dimension=2)
        st.compute_persistence()
        h0,h1 = [],[]
        for dim,(b,e) in st.persistence():
            life = ((e if e != np.inf else vals.ptp()) - b)
            (h0 if dim==0 else h1).append(life)
        rows.append({
            "H0_max": max(h0, default=0), "H0_sum": sum(h0),
            "H1_max": max(h1, default=0), "H1_sum": sum(h1), "H1_cnt": len(h1)
        })
    return pd.DataFrame(rows, index=df["ID"])

df_tda_feats = make_tda(df_ts_interp, dates)

# 6) ASSEMBLE
df = df_attr.set_index("ID") \
    .join(df_ts_feats).join(df_tda_feats) \
    .fillna(0)
y = df["is_superhost"]
X_num  = df.select_dtypes(include="number").drop(columns=["is_superhost"])
X_amen = df.select_dtypes(include="bool").astype(int)
X_all  = pd.concat([X_num, X_amen], axis=1)

# 7) SPLIT
X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, y, stratify=y, test_size=0.5, random_state=69)

# 8) PREPROCESSOR
pre = ColumnTransformer([
    ("num", Pipeline([("impute",SimpleImputer()),("scale",StandardScaler())]),
     X_num.columns),
    ("amen", OneHotEncoder(drop="first"), X_amen.columns)
])

# 9) Define ensemble
ensemble = VotingClassifier(
    estimators=[
        ("log", Pipeline([("prep", pre),
                          ("clf", LogisticRegression(penalty="l1", solver="saga",
                                                     max_iter=2000))])),
        ("rf",  Pipeline([("prep", pre),
                          ("clf", RandomForestClassifier(random_state=69))]))
    ],
    voting="soft"
)

# 10) Hyperparameter tunning with 5-fold CV
param_grid = {
    "log__clf__C": [0.01, 0.1, 1, 10],
    "rf__clf__n_estimators": [100, 200, 500],
    "rf__clf__max_depth": [None, 5, 10],
}
grid = GridSearchCV(
    ensemble,
    param_grid,
    cv=StratifiedKFold(5, shuffle=True, random_state=69),
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)
grid.fit(X_all, y)
print("Best CV score:", round(grid.best_score_, 3))
print("Best params:   ", grid.best_params_)

# 12) Evaluate on hold-out
best = grid.best_estimator_
best.fit(X_tr, y_tr)
y_pr = best.predict(X_te)
print("\n=== Tuned Ensemble ===")
print("Test accuracy :", round(accuracy_score(y_te, y_pr), 3))
print(classification_report(y_te, y_pr, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_te, y_pr))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best CV score: 0.962
Best params:    {'log__clf__C': 1, 'rf__clf__max_depth': None, 'rf__clf__n_estimators': 100}

=== Tuned Ensemble ===
Test accuracy : 0.95
              precision    recall  f1-score   support

           0      0.909     0.909     0.909        11
           1      0.966     0.966     0.966        29

    accuracy                          0.950        40
   macro avg      0.937     0.937     0.937        40
weighted avg      0.950     0.950     0.950        40

Confusion Matrix:
 [[10  1]
 [ 1 28]]


In [27]:
import pandas as pd
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc
import plotly.graph_objs as go

# 13) Save the model and metrics
y_pred = best.predict(X_te)
acc    = accuracy_score(y_te, y_pred)
class_names = ["Not Superhost", "Superhost"]
report = classification_report(
    y_te,
    y_pred,
    target_names=class_names,
    output_dict=True
)
metrics_summary = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Value": [
        round(acc, 3),
        round(report["weighted avg"]["precision"], 3), # type: ignore
        round(report["weighted avg"]["recall"],    3), # type: ignore
        round(report["weighted avg"]["f1-score"],  3), # type: ignore
    ]
}
metrics_df = pd.DataFrame(metrics_summary)
metrics_df.to_csv("models/model2/metrics_summary.csv", index=False)

# 14) Confusion matrix
cm    = confusion_matrix(y_te, y_pred)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
cm_df.to_csv("models/model2/confusion_matrix.csv")

# 15) Feature importances (from RF inside the ensemble)
rf_pipeline = best.named_estimators_['rf']
importances = rf_pipeline.named_steps['clf'].feature_importances_
num_feats  = list(X_num.columns)
amen_feats = list(X_amen.columns)
all_feats  = num_feats + amen_feats
fi_df = (
    pd.DataFrame({"feature": all_feats, "importance": importances})
      .sort_values("importance", ascending=False)
)
#fi_df.to_csv("models/model2/feature_importances.csv", index=False)
# 16) Plot top 20 feature importances
top_n = fi_df.head(20)
fig   = px.bar(
    top_n,
    x="importance",
    y="feature",
    orientation="h",
    title="Top 20 Feature Importances (RF)",
    labels={"importance":"Importance", "feature":""},
    template="plotly_dark",
    color_discrete_sequence=[red]
)
fig.update_layout(yaxis=dict(autorange="reversed"), height=600)
fig.write_json("models/model2/feature_importances.json")

# 17) ROC curve for Superhost classification
y_score = best.predict_proba(X_te)[:, 1]   # probability of class “1”
y_true  = y_te                            # 0/1 labels
fpr, tpr, _ = roc_curve(y_true, y_score)
roc_auc     = auc(fpr, tpr)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=fpr,
    y=tpr,
    mode="lines",
    name=f"Superhost (AUC={roc_auc:.2f})",
    line=dict(color=red, width=3),
    showlegend=False
))
fig.add_trace(go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode="lines",
    line=dict(color="white", width=1, dash="dash"),
    showlegend=False
))
fig.update_layout(
    title=f"ROC Curve (AUC = {roc_auc:.2f})",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    template="plotly_dark",
    legend=dict(bgcolor="rgba(0,0,0,0)"),
    height=400
)
fig.write_json("models/model2/roc_curve.json")

In [34]:
df_attr.select_dtypes("number").drop(columns=["Base fee", "latitude", "longitude"]).iloc[:, 13:]

Unnamed: 0,City skyline view,Beach view,Sea/Lake view,Hot water,Jacuzzi,Shared pool,Shared gym,Patio or balcony,Outdoor furniture,Outdoor playground,...,Room-darkening shades,Window guards,Security cameras,Lock on door,Keypad,Fire extinguisher,First aid kit,Smoke detector,Luggage dropoff,Self check-in
0,1,1,1,1,1,1,1,1,1,1,...,1,0,1,0,0,1,1,1,1,1
1,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,1,1,1,1,1,1,1,...,1,0,1,0,0,0,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,0,1,0,0,1,1,1,1,1
4,0,1,1,1,1,1,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,1,0,1,1,1,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,1
75,0,0,0,1,1,1,1,1,1,0,...,1,0,0,1,0,1,0,1,1,1
76,0,0,0,1,1,1,1,0,0,0,...,1,0,0,0,0,1,1,1,0,1
77,0,0,0,1,1,1,1,0,1,0,...,1,0,0,0,0,1,0,1,0,1


# Model 6

In [None]:
import sqlite3, re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cluster import KMeans

# 1) Load & align
_conn = sqlite3.connect("airbnb_cartagena.sqlite")
df_attr      = pd.read_sql("SELECT * FROM Attributes", _conn, dtype={"ID": str})
df_ts_interp = pd.read_sql("SELECT * FROM TimeSeriesInterpolated", _conn, dtype={"ID": str})
_conn.close()
dates = [c for c in df_ts_interp.columns if re.fullmatch(r"\d{1,2}/\d{1,2}/\d{4}", c)]
df_ts_interp = df_ts_interp.dropna(subset=dates).reset_index(drop=True)
#df_attr = df_attr.drop(columns=["Reviews"])
df_attr = df_attr.set_index("ID").reindex(df_ts_interp["ID"]).reset_index()

# 2) UNSUPERVISED SEGMENT LABELS
def cluster_by_amenities(k=4):
    """
    to_keep = [
        # list of amenity columns you want
        "Keypad","Lock on door","Smoke detector","Security cameras","AC","Heating",
        "Patio or balcony","Stove","Elevator","Refrigerator","Kitchen","Wifi",
        "TV","Jacuzzi","Carport","Hot water",
    ]
    """
    #am = df_attr[to_keep].astype(int).set_index(df_attr["ID"])
    am = df_attr.select_dtypes("number").set_index(df_attr["ID"])
    km = KMeans(n_clusters=k, random_state=0)
    labels = pd.Series(km.fit_predict(am), index=am.index)
    return labels
y = cluster_by_amenities(k=4).astype(int)

"""
# 3) Time‐series summary features (mean, std, trend, spikes)
def make_ts_feats(df, dates):
    out = []
    for vals in df[dates].values.astype(float):
        y = np.log1p(vals)
        days = np.arange(len(y))
        trend = np.polyfit(days, y, 1)[0] if len(y) > 1 else 0
        spikes = (pd.Series(y).diff().abs() > 0.1).sum() # type: ignore
        out.append({
            "ts_mean":    y.mean(),
            "ts_std":     y.std(ddof=0),
            "ts_trend":   trend,
            "ts_spikes":  spikes
        })
    return pd.DataFrame(out, index=df["ID"])
df_ts = make_ts_feats(df_ts_interp, dates)
df_ts.index = df_attr["ID"] # type: ignore

# 4) Build feature matrix
X_all_num = (df_attr.select_dtypes("number").drop(columns=["Base fee", "latitude", "longitude"])
             .set_index(df_attr["ID"]))
X = pd.concat([X_all_num, df_ts], axis=1).fillna(0)
"""
X = (df_attr.select_dtypes("number").drop(columns=["Base fee", "latitude", "longitude"]
    + [col for col in df_attr.columns if col.endswith("_rating")])
             .set_index(df_attr["ID"]))
# 5) Split and preprocess
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
preprocessor = ColumnTransformer([
    ("scale", StandardScaler(), X.columns.tolist()),
])

# 7) Define models
models = {
    "MultinomialLogistic": Pipeline([
        ("prep", preprocessor),
        ("clf", LogisticRegression(multi_class="multinomial", solver="saga", C=1.0, max_iter=1000))
    ]),
    "GradientBoosting": Pipeline([
        ("prep", preprocessor),
        ("clf", GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=0))
    ]),
    "DecisionTree": Pipeline([
        ("prep", preprocessor),
        ("clf", DecisionTreeClassifier(max_depth=5, random_state=0))
    ]),
}

# 8) Build a soft‐voting ensemble of GradientBoosting + DecisionTree
ensemble = VotingClassifier(
    estimators=[
        ("gb", models["GradientBoosting"]),
        ("dt", models["DecisionTree"])
    ],
    voting="soft"
)

# 9) Hyperparameter tuning with 5-fold CV
param_grid = {
    # GradientBoosting params
    "gb__clf__n_estimators":   [100, 200, 300],
    "gb__clf__learning_rate":  [0.01, 0.05, 0.1],
    # DecisionTree params
    "dt__clf__max_depth":      [3, 5, 7, None],
    "dt__clf__min_samples_leaf":[1, 3, 5],
}
grid = GridSearchCV(
    estimator=ensemble,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)
grid.fit(X, y)
print("Best CV accuracy:", round(grid.best_score_, 3))
print("Best params:       ", grid.best_params_)

# 10) Evaluate the tuned ensemble on the hold-out split
best_ens = grid.best_estimator_
best_ens.fit(X_train, y_train)
y_pred = best_ens.predict(X_test)
print("\n=== Tuned Ensemble Evaluation ===")
print("Test accuracy:", round(accuracy_score(y_test, y_pred), 3))
print(classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best CV accuracy: 0.368
Best params:        {'dt__clf__max_depth': 3, 'dt__clf__min_samples_leaf': 5, 'gb__clf__learning_rate': 0.1, 'gb__clf__n_estimators': 300}

=== Tuned Ensemble Evaluation ===
Test accuracy: 0.375
              precision    recall  f1-score   support

           0      0.444     0.667     0.533         6
           1      0.250     0.250     0.250         4
           2      0.333     0.167     0.222         6

    accuracy                          0.375        16
   macro avg      0.343     0.361     0.335        16
weighted avg      0.354     0.375     0.346        16

Confusion Matrix:
 [[4 0 2]
 [3 1 0]
 [2 3 1]]


In [82]:
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.decomposition import PCA

# assume best_ens, X_test, y_test are already in scope

# 11) Predictions & raw metrics
y_pred = best_ens.predict(X_test)
acc    = accuracy_score(y_test, y_pred)
prec   = precision_score(y_test, y_pred, average="weighted", zero_division=0)
rec    = recall_score(y_test, y_pred, average="weighted", zero_division=0)
f1     = f1_score(y_test, y_pred, average="weighted", zero_division=0)

metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Value":  [round(acc,  3), # type: ignore
               round(prec, 3), # type: ignore
               round(rec,  3), # type: ignore
               round(f1,   3)] # type: ignore
})
os.makedirs("models/model3", exist_ok=True)
metrics_df.to_csv("models/model3/metrics_summary.csv", index=False)

# 12) Confusion matrix CSV
cm = confusion_matrix(y_test, y_pred)
n_classes = cm.shape[0]
idx = [f"True {i}" for i in range(n_classes)]
cols = [f"Pred {i}" for i in range(n_classes)]
cm_df = pd.DataFrame(cm, index=idx, columns=cols)
cm_df.to_csv("models/model3/confusion_matrix.csv")

# 13) Top-10 Feature Importances from GradientBoosting
gb_pipeline = best_ens.named_estimators_["gb"]        # the GB member
gb_model    = gb_pipeline.named_steps["clf"]
importances = gb_model.feature_importances_
feat_names  = X.columns.to_list()

fi_df = (
    pd.DataFrame({"feature": feat_names, "importance": importances})
      .sort_values("importance", ascending=False)
)
fi_df.head(10).to_csv("models/model3/feature_importances_top10.csv", index=False)

# 14) Plot Top-10 importances (vertical bars)
top10 = fi_df.head(10).sort_values("importance")[::-1]
fig_fi = px.bar(
    top10,
    x="feature",
    y="importance",
    orientation="v",
    title="Top 10 Feature Importances (GB)",
    labels={"feature": "Feature", "importance": "Importance"},
    template="plotly_dark",
    color_discrete_sequence=[red]
)
fig_fi.update_layout(xaxis_tickangle=-45, height=500)
fig_fi.write_json("models/model3/feature_importances.json")

# 15) Scatter of correctly vs incorrectly classified
#    Reduce X_test to 2D via PCA for visualization:
pca = PCA(n_components=2, random_state=0)
coords = pca.fit_transform(X_test)

viz_df = pd.DataFrame({
    "PC1": coords[:,0],
    "PC2": coords[:,1],
    "y_true": y_test.values,
    "y_pred": y_pred,
})
viz_df["correct"] = viz_df["y_true"] == viz_df["y_pred"]
viz_df["color"]   = viz_df["correct"].map({True: "Correct", False: "Wrong"})

fig_err = px.scatter(
    viz_df,
    x="PC1",
    y="PC2",
    color="color",
    symbol="color",
    title="2D PCA of Test Points: green=correct, red=misclassified",
    template="plotly_dark",
    color_discrete_map={"Correct":red, "Wrong":"#d79c9c"},
    symbol_map={"Correct":"circle","Wrong":"x"},
)
fig_err.update_traces(marker=dict(size=20, opacity=0.8))
fig_err.write_json("models/model3/classification_scatter.json")


In [83]:
import plotly.io as pio

def model3_feature_importances():
    """
    Load the precomputed feature importances bar chart for Model 3
    and apply the standard dark theme and sizing.
    """
    fig = pio.read_json("models/model3/feature_importances.json")
    fig.update_layout(
        title="Top 10 Feature Importances (Gradient Boosting)",
        template="plotly_dark",
        height=500,
        margin=dict(l=40, r=40, t=60, b=40)
    )
    # reverse y‐axis if horizontal bar
    if any(trace.orientation == "h" for trace in fig.data):
        fig.update_layout(yaxis=dict(autorange="reversed"))
    return fig

model3_feature_importances()

In [73]:
import plotly.io as pio

def model3_classification_scatter():
    """
    Load and return the 2D PCA scatter of test points,
    colored by correct vs. misclassified (from model3).
    """
    # Read the serialized figure
    fig = pio.read_json("models/model3/classification_scatter.json")
    
    # Tweak layout if needed
    fig.update_layout(
        title="2D PCA of Test Points: Correct vs. Misclassified",
        legend_title="Prediction",
        template="plotly_dark",
    )
    fig.update_traces(marker=dict(size=8, opacity=0.8))
    
    return fig

model3_classification_scatter()

# Model 7

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, hamming_loss
import tensorflow as tf
from tensorflow.keras import layers, models

# 1) BUILD FEATURE MATRIX X and MULTI-LABEL TARGET Y -----------------------

# static features from Attributes
static_feats = df_attr.set_index("ID")[
    ["Base fee", "Cleaning fee", "Bedrooms", "Beds", "Baths"] +
    [c for c in df_attr.columns if c.endswith("_rating")]
]

# volatility features
vol_feats = compute_volatility_features()

# UMAP coords
df_space, _, _, _ = build_umap_and_distances()
umap_feats = df_space.set_index("ID")[["UMAP1", "UMAP2", "UMAP3"]]

# price_time_series summaries
# (already computed in build_umap_and_distances as price_mean, price_std, price_trend)
# but if not, recompute quickly:
price_summary = (
    df_prices
    .groupby("ID")['Value']
    .agg(price_mean=lambda x: np.log1p(x).mean(),
         price_std = lambda x: np.log1p(x).std(),
         price_trend=lambda x: LinearRegression()
                              .fit(
                                  (pd.to_datetime(df_prices.loc[x.index,"Date"]) 
                                   - pd.to_datetime(df_prices.loc[x.index,"Date"]).min()
                                  ).dt.days.values.reshape(-1,1),
                                  np.log1p(x)
                              ).coef_[0]
        )
)

# combine all numeric features
X = (
    static_feats
    .join(vol_feats, how="inner")
    .join(umap_feats, how="inner")
    .join(price_summary, how="inner")
    .dropna()
)

# multi-label target: selected amenities one-hot
amenities = [
    "Wifi","Kitchen","Washer","Dryer","Air conditioning","Heating",
    "TV","Jacuzzi","Elevator","Patio or balcony"
]
Y = df_attr.set_index("ID")[amenities].loc[X.index].astype(int)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)


# 2) MODEL A: MultiOutput Random Forest --------------------------------------

rf = MultiOutputClassifier(
    Pipeline([
        ('scale', StandardScaler()),
        ('rf', RandomForestClassifier(n_estimators=200, random_state=0))
    ])
)
rf.fit(X_train, y_train)
y_pred_rf = pd.DataFrame(rf.predict(X_test), index=y_test.index, columns=y_test.columns)

print("== Random Forest MultiOutput ==")
print("Hamming loss:", hamming_loss(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, zero_division=0))


# 3) MODEL B: One-vs-Rest Logistic Regression -------------------------------

ovr = OneVsRestClassifier(
    Pipeline([
        ('scale', StandardScaler()),
        ('lr', LogisticRegression(solver='liblinear'))
    ])
)
ovr.fit(X_train, y_train)
y_pred_ovr = pd.DataFrame(ovr.predict(X_test), index=y_test.index, columns=y_test.columns)

print("== One-vs-Rest Logistic ==")
print("Hamming loss:", hamming_loss(y_test, y_pred_ovr))
print(classification_report(y_test, y_pred_ovr, zero_division=0))


# 4) MODEL C: Deep MLP with Sigmoid Outputs ---------------------------------

# build a simple MLP
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

mlp = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(output_dim, activation='sigmoid')
])
mlp.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')]
)

# scale data
scaler = StandardScaler().fit(X_train)
Xtr = scaler.transform(X_train)
Xte = scaler.transform(X_test)

# train
mlp.fit(Xtr, y_train.values, 
        validation_split=0.1, epochs=20, batch_size=32, verbose=1)

# predict (threshold at 0.5)
y_prob = mlp.predict(Xte)
y_pred_mlp = (y_prob > 0.5).astype(int)
y_pred_mlp = pd.DataFrame(y_pred_mlp, index=y_test.index, columns=y_test.columns)

print("== Deep MLP ==")
print("Hamming loss:", hamming_loss(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp, zero_division=0))


# Extra

In [89]:
from datetime import datetime

conn = sqlite3.connect("airbnb_cartagena.sqlite")
df_attributes = pd.read_sql_query("SELECT * FROM Attributes", conn)
df_time_series = pd.read_sql_query("SELECT * FROM TimeSeriesInterpolated", conn)
conn.close()

to_keep = [
    "Keypad", "Lock on door", "Smoke detector", "Security cameras", "AC", "Heating", 
    "Patio or balcony", "Stove", "Elevator", "Refrigerator", "Kitchen", "Wifi", 
    "TV", "Jacuzzi", "Carport", "Hot water", 
]
df_attributes = df_attributes[to_keep + ["ID"]]


columnas_fecha = []
for col in df_time_series.columns:
    try:
        datetime.strptime(col.strip(), "%d/%m/%Y")
        columnas_fecha.append(col)
    except ValueError:
        continue


df_prices = df_time_series.melt(
    id_vars=["ID"],
    value_vars=columnas_fecha,
    var_name="Date",
    value_name="Value"
)
df_prices["Date"] = pd.to_datetime(df_prices["Date"])
ids_con_nan = df_prices[df_prices["Value"].isna()]["ID"].unique()

df_prices = df_prices[~df_prices["ID"].isin(ids_con_nan)]
df_attributes = df_attributes[~df_attributes["ID"].isin(ids_con_nan)]







df_num = df_attributes.select_dtypes(include=[np.number]).drop(columns=["ID"])

def summarize_prices(df_prices):
    df_prices["log_price"] = np.log1p(df_prices["Value"])
    summary = df_prices.groupby("ID")["log_price"].agg([
        'mean', 'std'
    ]).rename(columns={"mean": "price_mean", "std": "price_std"})

    df_prices['day'] = (df_prices['Date'] - df_prices['Date'].min()).dt.days
    trends = []
    for _, group in df_prices.groupby('ID'):
        X = group['day'].values.reshape(-1, 1)
        y = np.log1p(group['Value'].values)
        slope = LinearRegression().fit(X, y).coef_[0] if len(np.unique(X)) > 1 else 0.0
        trends.append(slope)
    summary['price_trend'] = trends
    return summary

df_price_summary = summarize_prices(df_prices)
df_merged = df_attributes.merge(df_price_summary, left_on="ID", right_index=True)


df_final_num = df_merged.select_dtypes(include=[np.number]).drop(columns=["ID"])



scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_final_num)

umap_model = umap.UMAP(n_components=3, n_neighbors=30, min_dist=0.1, random_state=69)  
X_umap = umap_model.fit_transform(X_scaled)
print(f"UMAP completado")

ids = df_attributes["ID"].values
df_space = pd.DataFrame(X_umap, columns=[f"UMAP{i+1}" for i in range(X_umap.shape[1])], index=ids)

df_space["ID"] = df_space.index

dist_matrix = squareform(pdist(df_space.drop(columns="ID").values, metric='euclidean'))
df_dist = pd.DataFrame(dist_matrix, index=df_space["ID"], columns=df_space["ID"])
print(" Matriz de distancias calculada")


# Crear DataFrame con columnas originales estandarizadas
df_scaled = pd.DataFrame(X_scaled, columns=df_final_num.columns, index=df_space['ID'])

# Concatenar con componentes UMAP
df_umap_all = pd.concat([df_scaled, df_space.drop(columns="ID")], axis=1)

# Calcular correlación
correlation = df_umap_all.corr().loc[df_final_num.columns, ['UMAP1', 'UMAP2', 'UMAP3']]

# Filtrar las 10 variables con mayor correlación absoluta
correlation['max_corr'] = correlation.abs().max(axis=1)
top10 = correlation.sort_values('max_corr', ascending=False).drop(columns='max_corr').head(10)

# Convertir a formato largo para plotly
correlation_melted = top10.reset_index().melt(id_vars='index', var_name='UMAP', value_name='Correlation')
correlation_melted.rename(columns={'index': 'Variable'}, inplace=True)

# Crear heatmap interactivo con Plotly
fig = px.imshow(
    top10.values,
    x=top10.columns,
    y=top10.index,
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    text_auto=".2f",
    labels=dict(x="UMAP", y="Variable", color="Correlación")
)

fig.update_layout(
    title="Top 10 variables más correlacionadas con UMAP1, UMAP2 y UMAP3",
    xaxis_title="Componente UMAP",
    yaxis_title="Variable original",
    height=400 + 20 * len(top10)
)

fig.show()

UMAP completado
 Matriz de distancias calculada
