In [None]:
# --- STEP 1: INSTALL & SETUP ---
# Install kagglehub, which is used below to download the dataset.
# NOTE(review): the leading `!` is notebook shell-escape syntax, not plain
# Python — this line presumably only works when run inside IPython/Jupyter/Colab.
!pip install kagglehub --quiet

import kagglehub
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# --- STEP 2: DOWNLOAD SEATTLE DATA ---
print("🌲 Downloading Seattle Airbnb Data...")
path = kagglehub.dataset_download("airbnb/seattle")
print(f"Dataset downloaded to: {path}")

# Locate listings.csv inside the downloaded dataset directory and fail early,
# with a clear message, if the dataset layout is not what we expect.
csv_file = os.path.join(path, "listings.csv")
if not os.path.isfile(csv_file):
    raise FileNotFoundError(
        f"Expected 'listings.csv' in the downloaded dataset at: {path}"
    )

# --- STEP 3: LOAD & CLEAN DATA ---
df = pd.read_csv(csv_file)

# --- CLEAN THE PRICE COLUMN ---
# Prices are stored as text such as "$1,250.00". Strip the currency symbol and
# the thousands separator, then convert to float.
# * regex=False is explicit because '$' is an end-of-string anchor when the
#   pattern is interpreted as a regular expression, and the default for
#   `regex` differs between pandas versions.
# * The check is "not numeric" rather than "dtype == object" so that the
#   conversion also runs when pandas loads text into a dedicated string dtype.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = (
        df['price']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

print("✅ Prices converted to numbers successfully.")

# --- NORMALISE THE NEIGHBOURHOOD COLUMN NAME ---
# The model below expects a 'neighbourhood_group' column. If the dataset does
# not provide one, copy it from the first available alternative, in order of
# preference.
if 'neighbourhood_group' not in df.columns:
    neighbourhood_candidates = (
        'neighbourhood_group_cleansed',
        'neighbourhood_cleansed',
        'neighbourhood',  # last-resort fallback
    )
    source_column = next(
        (name for name in neighbourhood_candidates if name in df.columns),
        None,
    )
    if source_column is None:
        raise KeyError(
            "No neighbourhood column found; expected one of: "
            + ", ".join(neighbourhood_candidates)
        )
    df['neighbourhood_group'] = df[source_column]

# --- STANDARD CLEANING ---
# Listings with a missing reviews_per_month value are treated as 0 reviews/month.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# Keep only listings with a price strictly between 0 and 500
# (rows whose price is NaN are dropped by this comparison as well).
df = df[(df['price'] > 0) & (df['price'] < 500)]

print(f"✅ Data Loaded & Cleaned: {df.shape[0]} listings ready.")

# --- STEP 4: RUN THE PRICING MODEL ---
print("\n--- 🤖 Training Pricing AI ---")

# 1. Select features: location, room type, minimum nights and review activity.
features_to_use = ['neighbourhood_group', 'room_type', 'minimum_nights',
                   'number_of_reviews', 'reviews_per_month']

# 2. Keep only the model columns + price, dropping rows with any missing value.
model_df = df[features_to_use + ['price']].dropna()

# 3. Convert text columns to numbers (one-hot encoding). drop_first=True omits
#    one level per categorical column, so each coefficient is relative to that
#    omitted baseline level.
model_df = pd.get_dummies(model_df, columns=['neighbourhood_group', 'room_type'], drop_first=True)

# 4. Split train/test (80/20, fixed seed for reproducibility).
X = model_df.drop('price', axis=1)
y = model_df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train linear regression and score it on the held-out test set.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f"Model Accuracy (R2 Score): {r2_score(y_test, y_pred):.2f}")
print("Top Influencers on Price:")
coeffs = pd.DataFrame(model.coef_, X.columns, columns=['Impact ($)'])
top_coeffs = coeffs.sort_values(by='Impact ($)', ascending=False).head(5)
# `display` is only defined when running under IPython/Jupyter; fall back to
# a plain print so the script also works as a regular Python file.
try:
    display(top_coeffs)
except NameError:
    print(top_coeffs)

# --- STEP 5: SAVE FILE FOR DASHBOARD ---
# Write the cleaned listings (not the one-hot encoded model frame) to the
# current working directory, without the DataFrame index.
output_file = 'Seattle_Airbnb_Cleaned.csv'
df.to_csv(output_file, index=False)
print(f"\n🎉 SUCCESS: '{output_file}' is ready to download!")

🌲 Downloading Seattle Airbnb Data...
Using Colab cache for faster access to the 'seattle' dataset.
Dataset downloaded to: /kaggle/input/seattle
✅ Prices converted to numbers successfully.
✅ Data Loaded & Cleaned: 3786 listings ready.

--- 🤖 Training Pricing AI ---
Model Accuracy (R2 Score): 0.35
Top Influencers on Price:


Unnamed: 0,Impact ($)
neighbourhood_group_Magnolia,30.89296
neighbourhood_group_West Seattle,17.513268
neighbourhood_group_Queen Anne,16.621757
neighbourhood_group_Downtown,11.018805
neighbourhood_group_Central Area,4.142575



🎉 SUCCESS: 'Seattle_Airbnb_Cleaned.csv' is ready to download!
