In [None]:

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Project folder ("dallas airbnb") on the mounted Google Drive.
project_path = "/content/drive/MyDrive/dallas airbnb"

# Work from inside the project folder so relative paths used below
# ("listings.csv" here, "models/" later) resolve against it.
os.chdir(project_path)
print("Now in:", project_path)

# List the folder once and reuse the result for the printout and the check.
project_files = os.listdir(project_path)
print("Files here:", project_files)

# Fail fast when the input file is missing: every later cell needs `listings`,
# and a warning-only path would surface as an unrelated NameError downstream.
if "listings.csv" not in project_files:
    raise FileNotFoundError(f"listings.csv not found in {project_path}")

listings = pd.read_csv("listings.csv")
print("✅ Listings loaded:", listings.shape)
display(listings.head())


In [None]:
# Remove the columns that are not needed for the analysis.
# errors="ignore" keeps the call from failing when a listed column is absent.
listings_clean = listings.drop(
    ["name", "host_id", "host_name", "neighbourhood_group", "last_review", "license"],
    axis="columns",
    errors="ignore",
)

print("Remaining columns:", list(listings_clean.columns))
print("Shape:", listings_clean.shape)
listings_clean.head()


In [30]:
# Work on a copy so `listings_clean` stays untouched and this cell can be re-run.
df = listings_clean.copy()

# Categorical columns: replace missing values with an explicit "Unknown" label.
df["neighbourhood"] = df["neighbourhood"].fillna("Unknown")
df["room_type"] = df["room_type"].fillna("Unknown")

# Numeric columns where a missing value is replaced with 0.
numeric_fill_zero = ["availability_365", "number_of_reviews", "number_of_reviews_ltm", "reviews_per_month"]
for col in numeric_fill_zero:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Numeric columns where a missing value is replaced with the column median.
# NOTE(review): "price" is used as a model target further down; imputing a
# target with its median may bias training — confirm this is intended.
numeric_fill_median = ["minimum_nights", "calculated_host_listings_count", "price"]
for col in numeric_fill_median:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Drop rows with missing latitude/longitude (critical for mapping).
df = df.dropna(subset=["latitude", "longitude"])


# === DATA QUALITY REPORT (missing & duplicates & basic ranges) ===
# The report is printed once; an identical copy-pasted second report was removed.
print("Missing values per column:")
print(df.isnull().sum().sort_values(ascending=False))

print("\nDuplicate rows (full-row duplicates):", df.duplicated().sum())
if "id" in df.columns:
    print("Duplicate IDs:", df.duplicated(subset=["id"]).sum())

# Basic range statistics for the columns that are range-filtered later on.
num_cols = ["price", "minimum_nights", "availability_365", "reviews_per_month"]
for c in num_cols:
    if c in df.columns:
        print(f"\n{c} -> min: {df[c].min()}, max: {df[c].max()}, mean: {df[c].mean():.2f}, median: {df[c].median():.2f}")


# Summary of remaining missing values (only columns that still have any).
missing_summary = df.isna().sum()
print("Remaining missing values per column:\n", missing_summary[missing_summary > 0])


Missing values per column:
id                                0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
number_of_reviews_ltm             0
dtype: int64

Duplicate rows (full-row duplicates): 0
Duplicate IDs: 0

price -> min: 9.0, max: 50000.0, mean: 510.25, median: 125.00

minimum_nights -> min: 1, max: 365, mean: 8.12, median: 2.00

availability_365 -> min: 0, max: 365, mean: 249.85, median: 299.00

reviews_per_month -> min: 0.0, max: 25.27, mean: 1.72, median: 1.01
Missing values per column:
id                                0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
p

In [13]:
# Strip any leading apostrophe from the latitude values, then cast the column to float.
df["latitude"] = (
    df["latitude"]
    .astype(str)
    .str.lstrip("'")
    .astype(float)
)

In [None]:
before = len(df)

# Build one boolean mask covering every validity rule:
#   1. price: strictly positive and at most 1000 (drops unrealistic values)
#   2. minimum_nights: 1–365
#   3. availability_365: valid days in a year, 0–365
#   4. reviews_per_month: non-negative
valid_rows = (
    (df["price"] > 0)
    & (df["price"] <= 1000)
    & df["minimum_nights"].between(1, 365)
    & df["availability_365"].between(0, 365)
    & (df["reviews_per_month"] >= 0)
)

# Take an explicit copy: later cells add columns to `df`, and assigning into a
# boolean-mask slice would otherwise raise pandas' SettingWithCopyWarning.
df = df[valid_rows].copy()

after = len(df)

print(f"Rows before filtering: {before}")
print(f"Rows after filtering:  {after}")
print(f"Dropped rows: {before - after}")

In [16]:
# Binary flag: 1 when the listing's room_type is "Entire home/apt", otherwise 0.
entire_home_mask = df["room_type"] == "Entire home/apt"
df["is_entire_home"] = entire_home_mask.astype(int)

# To inspect the class balance, uncomment:
#print(df["is_entire_home"].value_counts())

In [31]:
from sklearn.preprocessing import LabelEncoder

# Map each neighbourhood name to an integer code.
# The fitted encoder is kept in `encoder` so the mapping stays available.
encoder = LabelEncoder()
encoder.fit(df["neighbourhood"])
df["neighbourhood_encoded"] = encoder.transform(df["neighbourhood"])

# Show the original label next to its integer code for the first ten rows.
print(df[["neighbourhood", "neighbourhood_encoded"]].head(n=10))

  neighbourhood  neighbourhood_encoded
0    District 2                      6
1   District 14                      5
2    District 9                     13
3   District 14                      5
4   District 11                      2
5   District 14                      5
6   District 14                      5
7   District 14                      5
8    District 2                      6
9   District 14                      5


In [32]:
# Plain-text preview of the first five rows of the cleaned frame.
print(df.head(n=5))

        id neighbourhood   latitude  longitude        room_type  price  \
0    61878    District 2  32.816900 -96.820180  Entire home/apt  111.0   
1   795703   District 14  32.803270 -96.809760  Entire home/apt  243.0   
2   860248    District 9  32.855090 -96.706250     Private room   44.0   
3  1154424   District 14  32.818684 -96.790154  Entire home/apt   92.0   
4  1277933   District 11  32.924490 -96.790620     Private room   55.0   

   minimum_nights  number_of_reviews  reviews_per_month  \
0              30                 57               0.32   
1              30                 71               0.47   
2               7                 72               0.67   
3               3                 97               0.66   
4               1                 30               0.30   

   calculated_host_listings_count  availability_365  number_of_reviews_ltm  \
0                               1               288                      2   
1                               1           

In [27]:
# === Multi-output regression (Price + Reviews_per_month) ===

import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- 1) Features and targets ---
# Input columns, in the order they are handed to the pipeline.
features = [
    "latitude",
    "longitude",
    "minimum_nights",
    "number_of_reviews",
    "availability_365",
    "calculated_host_listings_count",
    "neighbourhood",
    "is_entire_home",
]

X = df[features].copy()
# Two regression targets, predicted jointly: nightly price and reviews per month.
y = df[["price", "reviews_per_month"]].astype(float).copy()

# --- 2) Preprocessing ---
# Only "neighbourhood" is categorical; every other feature passes through untouched.
categorical = ["neighbourhood"]
numeric = [name for name in features if name not in categorical]

preprocessor = ColumnTransformer(
    [
        # handle_unknown="ignore": categories unseen during fit encode as all zeros.
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical),
        ("num", "passthrough", numeric),
    ]
)

# --- 3) Regressor and pipeline ---
regressor = RandomForestRegressor(
    n_estimators=300, min_samples_leaf=2, random_state=42, n_jobs=-1
)
# MultiOutputRegressor fits one clone of `regressor` per target column.
multi_model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", MultiOutputRegressor(regressor)),
    ]
)

# --- 4) Train/test split (80/20, fixed seed for reproducibility) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 5) Fit ---
multi_model.fit(X_train, y_train)

# --- 6) Evaluate each target separately ---
y_pred = multi_model.predict(X_test)

# Column i of y_pred corresponds to column i of y_test.
for i, target in enumerate(["price", "reviews_per_month"]):
    mae = mean_absolute_error(y_test[target], y_pred[:, i])
    rmse = np.sqrt(mean_squared_error(y_test[target], y_pred[:, i]))
    r2 = r2_score(y_test[target], y_pred[:, i])
    print(f"--- {target.upper()} ---")
    print(f"MAE : {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R^2 : {r2:,.3f}\n")

# --- 7) Save the fitted pipeline ---
os.makedirs("models", exist_ok=True)
joblib.dump(multi_model, "models/multi_price_review.pkl")
print("✅ Saved multi-output model to models/multi_price_review.pkl")
def predict_from_coords_multi(lat, lon, neighbourhood=None, is_entire_home=1):
    """
    Predict price and reviews per month for a single location.

    Builds a one-row input frame from the given coordinates and runs it through
    the module-level multi-output pipeline ``multi_model``. Features the caller
    does not supply are filled from the module-level ``df``.

    Parameters
    ----------
    lat, lon :
        Coordinates of the listing; each is converted with ``float()``.
    neighbourhood : optional
        Neighbourhood label. When ``None``, the most frequent value of
        ``df["neighbourhood"]`` is used.
    is_entire_home : optional
        1 for an entire home/apartment, 0 otherwise. ``None`` is treated as 1.

    Returns
    -------
    tuple of (float, float)
        Predicted price rounded to 2 decimals and predicted reviews per month
        rounded to 3 decimals.
    """
    # Column order used when the model was trained.
    _features_multi = [
        "latitude","longitude","minimum_nights","number_of_reviews",
        "availability_365","calculated_host_listings_count",
        "neighbourhood","is_entire_home"
    ]

    # Numeric features the caller does not provide default to the median of `df`.
    median_columns = (
        "minimum_nights",
        "number_of_reviews",
        "availability_365",
        "calculated_host_listings_count",
    )
    numeric_defaults = {column: df[column].median() for column in median_columns}

    # Fall back to the most common neighbourhood / an entire home when unspecified.
    chosen_neighbourhood = (
        df["neighbourhood"].mode().iloc[0] if neighbourhood is None else neighbourhood
    )
    entire_home_flag = 1 if is_entire_home is None else is_entire_home

    # Assemble the single input row.
    row = {"latitude": float(lat), "longitude": float(lon)}
    row.update(numeric_defaults)
    row["neighbourhood"] = chosen_neighbourhood
    row["is_entire_home"] = entire_home_flag
    X_one = pd.DataFrame([row], columns=_features_multi)

    # The model returns one row holding [price, reviews_per_month].
    prediction = multi_model.predict(X_one)[0]
    price_pred, reviews_pm_pred = prediction
    return round(float(price_pred), 2), round(float(reviews_pm_pred), 3)


--- PRICE ---
MAE : 62.67
RMSE: 107.35
R^2 : 0.349

--- REVIEWS_PER_MONTH ---
MAE : 0.68
RMSE: 1.12
R^2 : 0.659

✅ Saved multi-output model to models/multi_price_review.pkl


In [29]:
# Example call: only coordinates are supplied; every other feature uses its default.
price_est, rpm_est = predict_from_coords_multi(lat=32.7767, lon=-96.7970)

for label, value in (
    ("Predicted price:", price_est),
    ("Predicted reviews/month:", rpm_est),
):
    print(label, value)

Predicted price: 151.24
Predicted reviews/month: 2.075
