In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from textblob import TextBlob
import warnings
import sys
import os

# Suppress warnings
warnings.filterwarnings('ignore')

# --- PART 1: DEFINE THE LOADER (So it always works) ---
class AirbnbLoader:
    """Read the ``listings`` and ``reviews`` CSV exports from one folder.

    Each file may be stored as plain ``.csv`` or as ``.csv.gz``; when both
    exist the plain ``.csv`` is used.
    """

    def __init__(self, data_folder="../data"):
        """Store *data_folder* as a ``Path``; no file is touched here."""
        self.data_path = Path(data_folder)

    def _get_file_path(self, filename):
        """Locate ``<filename>.csv`` or ``<filename>.csv.gz`` in the data folder.

        Args:
            filename: Base name without extension (e.g. ``"listings"``).

        Returns:
            Path of the first variant that exists, trying ``.csv`` first.

        Raises:
            FileNotFoundError: Neither variant exists in ``self.data_path``.
        """
        # Order matters: the uncompressed file takes priority.
        for suffix in (".csv", ".csv.gz"):
            candidate = self.data_path / f"{filename}{suffix}"
            if candidate.exists():
                return candidate
        raise FileNotFoundError(f"‚ùå Could not find {filename}.csv or {filename}.csv.gz in {self.data_path}")

    def load_data(self, sample_frac=None):
        """Load listings and reviews, optionally keeping a sample of listings.

        Args:
            sample_frac: Fraction of listing ids to keep. Any falsy value
                (``None``, ``0``) skips sampling entirely.

        Returns:
            A 3-tuple ``(listings, None, reviews)``; the middle slot is
            always ``None``.

        Raises:
            FileNotFoundError: A required file is missing from the folder.
        """
        print("üöÄ Starting Data Load...")

        # Listings come first: their ids drive the optional sampling below.
        self.listings = pd.read_csv(self._get_file_path("listings"))

        # The 'date' column of the reviews file is parsed into datetimes.
        self.reviews = pd.read_csv(
            self._get_file_path("reviews"),
            parse_dates=['date'],
        )

        if sample_frac:
            print(f"‚úÇÔ∏è Sampling {sample_frac*100}% for speed...")
            # Fixed seed keeps the chosen ids reproducible between runs.
            kept_ids = self.listings['id'].sample(frac=sample_frac, random_state=42)
            # Restrict both frames to the same set of sampled listing ids.
            self.listings = self.listings.loc[self.listings['id'].isin(kept_ids)]
            self.reviews = self.reviews.loc[self.reviews['listing_id'].isin(kept_ids)]

        print(f"‚úÖ Reviews Loaded: {self.reviews.shape}")
        return self.listings, None, self.reviews

# --- PART 2: LOAD THE DATA ---
print("‚è≥ Loading Data...")
# Point the loader at the relative data folder, then pull a sample built
# from 10% of the listing ids (only the reviews frame is used from here on).
loader = AirbnbLoader(data_folder="../data")
_, _, reviews = loader.load_data(sample_frac=0.1)

# --- PART 3: RUN TEXTBLOB ANALYSIS ---
print("‚è≥ Running Sentiment Analysis...")

# 1. Clean Data: discard reviews whose 'comments' value is missing
has_comment = reviews['comments'].notna()
reviews = reviews[has_comment]

# 2. English Filter (Quick Hack)
def is_english(text):
    """Return True when *text* contains 'the', 'and' or 'is' as a word.

    The value is stringified, lower-cased and split on whitespace; a token
    must equal one of the marker words exactly, so a word with attached
    punctuation (e.g. ``"the."``) does not count.
    """
    tokens = str(text).lower().split()
    return not {'the', 'and', 'is'}.isdisjoint(tokens)

# Keep only the rows whose comment passes the is_english heuristic; the
# copy makes the column assignment below act on an independent frame.
english_rows = reviews['comments'].apply(is_english)
reviews = reviews[english_rows].copy()

# 3. Calculate Polarity (TextBlob sentiment polarity of each comment)
reviews['polarity'] = reviews['comments'].apply(
    lambda comment: TextBlob(str(comment)).sentiment.polarity
)

# 4. Aggregate (one row per listing_id), naming the output columns directly
#    instead of flattening a two-level column index afterwards.
sentiment_features = (
    reviews.groupby('listing_id')
    .agg(
        sentiment_avg=('polarity', 'mean'),
        sentiment_min=('polarity', 'min'),
        review_count=('polarity', 'count'),
    )
    .reset_index()
)

print("üèÜ SUCCESS! Here are your features:")
display(sentiment_features.head())

‚è≥ Loading Data...
üöÄ Starting Data Load...
‚úÇÔ∏è Sampling 10.0% for speed...
‚úÖ Reviews Loaded: (15520, 6)
‚è≥ Running Sentiment Analysis...
üèÜ SUCCESS! Here are your features:


Unnamed: 0,listing_id,sentiment_avg,sentiment_min,review_count
0,575725,0.380588,0.034375,123
1,659564,0.37198,0.131944,37
2,938572,0.348536,0.0,5
3,1354765,0.383225,-0.102857,88
4,1431086,0.372315,0.190707,14
