In [17]:
# Airbnb Smart Pricing Engine - Complete Training Pipeline
# Combines data processing, model training, and export for Streamlit

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer, QuantileTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import skew
import warnings
warnings.filterwarnings('ignore')
import pickle
import json
import os
from datetime import datetime
import re
import joblib

# Text processing imports
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import shap

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

print("üöÄ Starting Airbnb Smart Pricing Engine Training")
print("=" * 60)

# ==============================================================================
# 1. SETUP PATHS AND DIRECTORIES
# ==============================================================================

# Set up paths for organized project structure
project_root = os.path.dirname(os.getcwd())  # Go up one level from notebooks/
data_dir = os.path.join(project_root, "data")
models_dir = os.path.join(project_root, "models")

print(f"üìÅ Project root: {project_root}")
print(f"üìä Data directory: {data_dir}")
print(f"ü§ñ Models directory: {models_dir}")

# Ensure models directory exists
os.makedirs(models_dir, exist_ok=True)
os.makedirs('model_artifacts', exist_ok=True)

# ==============================================================================
# 2. UTILITY CLASSES AND FUNCTIONS
# ==============================================================================

def clean_text(text):
    """Clean and preprocess text data"""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def get_sentiment_features(texts, max_features=100):
    """Extract sentiment and text features using TF-IDF"""
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2
    )
    
    cleaned_texts = [clean_text(text) for text in texts]
    tfidf_matrix = vectorizer.fit_transform(cleaned_texts)
    svd = TruncatedSVD(n_components=min(20, max_features))
    reduced_features = svd.fit_transform(tfidf_matrix)
    
    return reduced_features, vectorizer, svd

class DistilBertTextEncoder:
    """Text encoder using DistilBERT for review processing"""
    def __init__(self, max_length=256, batch_size=8):
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.model.eval()
    
    def encode_texts(self, texts):
        """Encode texts to embeddings"""
        all_embeddings = []
        
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i + self.batch_size]
            
            # Tokenize
            inputs = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            )
            
            # Get embeddings
            with torch.no_grad():
                outputs = self.model(**inputs)
                embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # CLS token
            
            all_embeddings.append(embeddings)
        
        return np.vstack(all_embeddings)

class ExplainableMultimodalRegressor:
    """Multimodal regressor combining tabular and text data with explanations"""
    def __init__(self, tabular_model, text_encoder, meta_model):
        self.tabular_model = tabular_model
        self.text_encoder = text_encoder
        self.meta_model = meta_model
        self.explainer = None
        self.feature_names = None
        
    def fit(self, X_tabular, X_text, y):
        """Fit the multimodal model"""
        # Fit tabular model
        self.tabular_model.fit(X_tabular, y)
        self.feature_names = X_tabular.columns.tolist()
        
        # Get tabular predictions
        tabular_preds = self.tabular_model.predict(X_tabular)
        
        # Encode text
        text_embeddings = self.text_encoder.encode_texts(X_text)
        
        # Combine features for meta-learner
        combined_features = np.column_stack([
            tabular_preds.reshape(-1, 1),
            text_embeddings
        ])
        
        # Fit meta-model
        self.meta_model.fit(combined_features, y)
        
        return self
    
    def predict(self, X_tabular, X_text):
        """Make predictions using both tabular and text data"""
        # Get tabular predictions
        tabular_preds = self.tabular_model.predict(X_tabular)
        
        # Encode text
        text_embeddings = self.text_encoder.encode_texts(X_text)
        
        # Combine features
        combined_features = np.column_stack([
            tabular_preds.reshape(-1, 1),
            text_embeddings
        ])
        
        # Meta-model prediction
        return self.meta_model.predict(combined_features)
    
    def score(self, X_tabular, X_text, y):
        """Calculate R¬≤ score"""
        predictions = self.predict(X_tabular, X_text)
        return r2_score(y, predictions)
    
    def get_feature_importance(self):
        """Get feature importance from tabular model"""
        if hasattr(self.tabular_model, 'feature_importances_'):
            return self.tabular_model.feature_importances_
        return None
    
    def explain_prediction(self, X_single, text_single):
        """Explain a single prediction"""
        explanations = {}
        
        try:
            # Convert Series to DataFrame if needed
            if isinstance(X_single, pd.Series):
                X_df = X_single.to_frame().T
            else:
                X_df = X_single if isinstance(X_single, pd.DataFrame) else pd.DataFrame([X_single], columns=self.feature_names)
            
            # Get predictions
            tabular_pred = self.tabular_model.predict(X_df)[0]
            final_pred = self.predict(X_df, [text_single])[0]
            
            explanations['predictions'] = {
                'tabular_prediction': float(tabular_pred),
                'final_prediction': float(final_pred),
                'text_contribution': float(final_pred - tabular_pred)
            }
            
            # Feature importance fallback if no SHAP
            if self.explainer is None:
                feature_importance = self.get_feature_importance()
                if feature_importance is not None and self.feature_names:
                    explanations['tabular'] = dict(zip(self.feature_names, feature_importance))
                else:
                    explanations['tabular'] = {}
            else:
                # Use SHAP if available
                shap_values = self.explainer(X_df)
                if hasattr(shap_values, 'values'):
                    feature_importance = shap_values.values[0]
                else:
                    feature_importance = shap_values[0]
                explanations['tabular'] = dict(zip(self.feature_names, feature_importance))
            
        except Exception as e:
            explanations['error'] = str(e)
            explanations['tabular'] = {}
            explanations['predictions'] = {}
        
        return explanations

def convert_to_json_serializable(obj):
    """Convert numpy types to Python native types for JSON serialization"""
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: convert_to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_json_serializable(v) for v in obj]
    else:
        return obj

# ==============================================================================
# 3. DATA LOADING AND PREPROCESSING
# ==============================================================================

print("üì• Loading data...")

# Load data from organized structure
listings_path = os.path.join(data_dir, 'listings.csv')
reviews_path = os.path.join(data_dir, 'reviews.csv')

df = pd.read_csv(listings_path)
reviews_df = pd.read_csv(reviews_path)

print(f"‚úÖ Loaded {len(df)} listings and {len(reviews_df)} reviews")

# Aggregate review data by listing_id
review_aggregated = reviews_df.groupby('listing_id').agg({
    'comments': lambda x: ' '.join(x.dropna().astype(str)) if len(x.dropna()) > 0 else '',
    'id': 'count'
}).reset_index()
review_aggregated.columns = ['id', 'combined_reviews', 'review_count']

# Merge with listings data
df = df.merge(review_aggregated, on='id', how='left')
df['combined_reviews'] = df['combined_reviews'].fillna('')
df['review_count'] = df['review_count'].fillna(0)

print(f"‚úÖ After merging: {len(df)} listings with review data")

# ==============================================================================
# 4. FEATURE ENGINEERING
# ==============================================================================

print("üîß Engineering features...")

# Clean price data
df['price_clean'] = df['price'].replace(r'[\$,]', '', regex=True)
df['price_clean'] = pd.to_numeric(df['price_clean'], errors='coerce')
df = df.dropna(subset=['price_clean'])

# Remove outliers
Q1 = df['price_clean'].quantile(0.25)
Q3 = df['price_clean'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 3 * IQR
df = df[(df['price_clean'] >= lower_bound) & (df['price_clean'] <= upper_bound)]

# Apply log transformation if skewed
y_skewness = skew(df['price_clean'].dropna())
if abs(y_skewness) > 1:
    df['price_clean'] = np.log1p(df['price_clean'])

# Create derived features
if 'accommodates' in df.columns:
    df['price_per_person'] = df['price_clean'] / df['accommodates'].replace(0, 1)

if 'bedrooms' in df.columns and 'beds' in df.columns:
    df['beds_per_bedroom'] = df['beds'] / df['bedrooms'].replace(0, 1)

if 'bathrooms_text' in df.columns:
    df['bathrooms_numeric'] = df['bathrooms_text'].str.extract(r'(\d+\.?\d*)').astype(float)

if 'neighbourhood_cleansed' in df.columns:
    neighbourhood_counts = df['neighbourhood_cleansed'].value_counts()
    df['neighbourhood_popularity'] = df['neighbourhood_cleansed'].map(neighbourhood_counts)

if 'host_is_superhost' in df.columns:
    df['is_superhost_numeric'] = (df['host_is_superhost'] == 't').astype(int)

if 'amenities' in df.columns:
    df['amenities_count'] = df['amenities'].str.count(',') + 1
    df['amenities_count'] = df['amenities_count'].fillna(0)
    
    # Key amenities
    key_amenities = ['wifi', 'kitchen', 'parking', 'pool']
    for amenity in key_amenities:
        df[f'has_{amenity}'] = df['amenities'].str.lower().str.contains(amenity, na=False).astype(int)

if 'availability_365' in df.columns:
    df['availability_rate'] = df['availability_365'] / 365

# Define feature sets
numerical_features = [
    'accommodates', 'bedrooms', 'beds', 'bathrooms_numeric',
    'price_per_person', 'beds_per_bedroom', 'neighbourhood_popularity',
    'is_superhost_numeric', 'amenities_count', 'minimum_nights', 'maximum_nights',
    'availability_365', 'availability_rate', 'number_of_reviews', 'review_scores_rating',
    'calculated_host_listings_count'
] + [f'has_{amenity}' for amenity in key_amenities]

categorical_features = ['neighbourhood_cleansed', 'room_type', 'property_type']

# Filter existing columns
numerical_features = [col for col in numerical_features if col in df.columns]
categorical_features = [col for col in categorical_features if col in df.columns]

# Create feature matrix
X = df[numerical_features + categorical_features].copy()
y = df['price_clean'].copy()

# Additional engineered features
if 'accommodates' in X.columns and 'bedrooms' in X.columns:
    X['space_ratio'] = X['accommodates'] / (X['bedrooms'].replace(0, 1))
    X['space_efficiency'] = X['accommodates'] / (X['bedrooms'].replace(0, 1) + 1)

if 'number_of_reviews' in X.columns and 'review_scores_rating' in X.columns:
    X['review_velocity'] = X['number_of_reviews'] / 100
    X['review_quality_weighted'] = X['number_of_reviews'] * X['review_scores_rating'] / 100

if 'latitude' in df.columns and 'longitude' in df.columns:
    city_lat, city_lon = df['latitude'].median(), df['longitude'].median()
    X['distance_from_center'] = np.sqrt((df['latitude'] - city_lat)**2 + (df['longitude'] - city_lon)**2)

if 'name' in df.columns:
    X['name_length'] = df['name'].str.len().fillna(0)
    luxury_words = ['luxury', 'deluxe', 'premium', 'exclusive', 'elegant']
    X['has_luxury_words'] = df['name'].str.lower().str.contains('|'.join(luxury_words), na=False).astype(int)

if 'amenities' in df.columns:
    premium_amenities = ['pool', 'hot tub', 'gym', 'elevator', 'doorman', 'concierge']
    X['premium_amenities_count'] = sum(df['amenities'].str.lower().str.contains(amenity, na=False).astype(int) for amenity in premium_amenities)

if 'host_since' in df.columns:
    df['host_since'] = pd.to_datetime(df['host_since'], errors='coerce')
    X['host_experience_years'] = (pd.Timestamp.now() - df['host_since']).dt.days / 365
    X['host_experience_years'] = X['host_experience_years'].fillna(0)

# Handle missing values
numerical_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

for col in numerical_cols:
    if X[col].isnull().any():
        X[col] = X[col].fillna(X[col].median())

for col in categorical_cols:
    if X[col].isnull().any():
        X[col] = X[col].fillna('Unknown')

# Reset indices
df = df.reset_index(drop=True)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

print(f"‚úÖ Feature engineering complete. Shape: {X.shape}")

# ==============================================================================
# 5. TRAIN-TEST SPLIT AND NEIGHBORHOOD ENCODING
# ==============================================================================

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add neighborhood-based features
if 'neighbourhood_cleansed' in X_train.columns:
    train_with_target = X_train.copy()
    train_with_target['target'] = y_train
    neighborhood_stats = train_with_target.groupby('neighbourhood_cleansed')['target'].agg(['mean', 'std']).reset_index()
    neighborhood_stats.columns = ['neighbourhood_cleansed', 'neighborhood_avg_price', 'neighborhood_price_std']
    
    X_train = X_train.merge(neighborhood_stats, on='neighbourhood_cleansed', how='left')
    X_test = X_test.merge(neighborhood_stats, on='neighbourhood_cleansed', how='left')
    
    overall_median = neighborhood_stats['neighborhood_avg_price'].median()
    overall_std = neighborhood_stats['neighborhood_price_std'].median()
    
    X_train['neighborhood_avg_price'] = X_train['neighborhood_avg_price'].fillna(overall_median)
    X_train['neighborhood_price_std'] = X_train['neighborhood_price_std'].fillna(overall_std)
    X_test['neighborhood_avg_price'] = X_test['neighborhood_avg_price'].fillna(overall_median)
    X_test['neighborhood_price_std'] = X_test['neighborhood_price_std'].fillna(overall_std)

# ==============================================================================
# 6. PREPROCESSING PIPELINE  
# ==============================================================================

numerical_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('power', PowerTransformer(method='yeo-johnson')),
            ('quantile', QuantileTransformer(n_quantiles=min(len(X_train), 500), random_state=42))
        ]), numerical_cols.tolist()),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'), categorical_cols.tolist())
    ]
)

# ==============================================================================
# 7. MODEL TRAINING
# ==============================================================================

print("ü§ñ Training ensemble models...")

# Define models
models = {
    'ExtraTreesUltra': ExtraTreesRegressor(
        n_estimators=500, max_depth=25, min_samples_split=2, 
        min_samples_leaf=1, max_features='sqrt', bootstrap=True, 
        oob_score=True, random_state=42, n_jobs=-1
    ),
    'GradientBoostingUltra': GradientBoostingRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=7, 
        min_samples_split=10, min_samples_leaf=4, subsample=0.8, 
        max_features='sqrt', random_state=42
    ),
    'RandomForestUltra': RandomForestRegressor(
        n_estimators=500, max_depth=30, min_samples_split=5, 
        min_samples_leaf=2, max_features='sqrt', bootstrap=True, 
        oob_score=True, random_state=42, n_jobs=-1
    )
}

# Train individual models
trained_models = {}
for name, model in models.items():
    pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', model)])
    pipeline.fit(X_train, y_train)
    trained_models[name] = pipeline
    print(f"‚úÖ Trained {name}")

# Create ensemble
ensemble = VotingRegressor(estimators=[(name, model) for name, model in trained_models.items()], n_jobs=-1)
ensemble.fit(X_train, y_train)

# Evaluate tabular model
test_score = ensemble.score(X_test, y_test)
y_pred = ensemble.predict(X_test)
cv_scores = cross_val_score(ensemble, X_train, y_train, cv=8, scoring='r2', n_jobs=-1)

if abs(y_skewness) > 1:
    actual_prices = np.expm1(y_test)
    predicted_prices = np.expm1(y_pred)
else:
    actual_prices = y_test
    predicted_prices = y_pred

mae = mean_absolute_error(actual_prices, predicted_prices)

print(f"‚úÖ Tabular ensemble trained. R¬≤ = {test_score:.3f}, MAE = ${mae:.2f}")

# ==============================================================================
# 8. MULTIMODAL MODEL TRAINING
# ==============================================================================

print("üîó Training multimodal model...")

# Get text data
text_data_train = df.loc[X_train.index, 'combined_reviews'].tolist()
text_data_test = df.loc[X_test.index, 'combined_reviews'].tolist()

# Create components
text_encoder = DistilBertTextEncoder(max_length=256, batch_size=8)
meta_learner = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Create multimodal model
multimodal_model = ExplainableMultimodalRegressor(
    tabular_model=ensemble,
    text_encoder=text_encoder,
    meta_model=meta_learner
)

# Fit multimodal model
multimodal_model.fit(X_train, text_data_train, y_train)

# Evaluate multimodal model
multimodal_score = multimodal_model.score(X_test, text_data_test, y_test)
multimodal_pred = multimodal_model.predict(X_test, text_data_test)

if abs(y_skewness) > 1:
    multimodal_actual_prices = np.expm1(y_test)
    multimodal_predicted_prices = np.expm1(multimodal_pred)
else:
    multimodal_actual_prices = y_test
    multimodal_predicted_prices = multimodal_pred

multimodal_mae = mean_absolute_error(multimodal_actual_prices, multimodal_predicted_prices)

improvement = ((multimodal_score - test_score) / test_score) * 100
mae_improvement = ((mae - multimodal_mae) / mae) * 100

print(f"‚úÖ Multimodal model trained. R¬≤ = {multimodal_score:.3f}, MAE = ${multimodal_mae:.2f}")
print(f"üöÄ Improvement: R¬≤ +{improvement:.1f}%, MAE +{mae_improvement:.1f}%")

# ==============================================================================
# 9. MODEL EXPORT AND SAVING
# ==============================================================================

print("üíæ Saving models...")

# Save complex models for backup
joblib.dump(ensemble, 'model_artifacts/tabular_model.joblib')
joblib.dump(multimodal_model, 'model_artifacts/multimodal_model.joblib')
joblib.dump(preprocessor, 'model_artifacts/preprocessor.joblib')

# Save clean models for Streamlit
def clean_model_for_export(model):
    """Remove problematic numpy random states"""
    from copy import deepcopy
    model_copy = deepcopy(model)
    
    if hasattr(model_copy, 'random_state'):
        model_copy.random_state = 42
    
    if hasattr(model_copy, 'estimators_'):
        for estimator in model_copy.estimators_:
            if hasattr(estimator, 'random_state'):
                estimator.random_state = 42
                
    if hasattr(model_copy, 'named_steps'):
        for step_name, step in model_copy.named_steps.items():
            if hasattr(step, 'random_state'):
                step.random_state = 42
                
    return model_copy

# Clean and save models
clean_tabular = clean_model_for_export(ensemble)
clean_multimodal = clean_model_for_export(multimodal_model)
clean_multimodal.explainer = None  # Remove explainer for compatibility

with open('tabular_model_clean.pkl', 'wb') as f:
    pickle.dump(clean_tabular, f, protocol=4)

with open('multimodal_model_clean.pkl', 'wb') as f:
    pickle.dump(clean_multimodal, f, protocol=4)

with open('preprocessor_clean.pkl', 'wb') as f:
    pickle.dump(preprocessor, f, protocol=4)

# Create metadata
metadata = {
    'feature_names': X_train.columns.tolist(),
    'categorical_features': categorical_features,
    'numerical_features': numerical_cols.tolist(),
    'y_skewness': y_skewness,
    'price_stats': {
        'mean': df['price_clean'].mean(),
        'std': df['price_clean'].std(),
        'min': df['price_clean'].min(),
        'max': df['price_clean'].max()
    }
}

with open('metadata_clean.pkl', 'wb') as f:
    pickle.dump(metadata, f, protocol=4)

# ==============================================================================
# 10. CREATE JSON MODELS FOR STREAMLIT
# ==============================================================================

print("üìÑ Creating JSON models for Streamlit...")

# Prepare clean numerical data
X_clean = X_train.copy()
y_clean = y_train.copy()

# Get only numerical features to avoid categorical encoding issues
numerical_features_only = X_clean.select_dtypes(include=[np.number]).columns.tolist()
X_numerical = X_clean[numerical_features_only].copy()
X_numerical = X_numerical.fillna(X_numerical.median())

# Train simple models for JSON export
simple_rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
simple_lr = LinearRegression()
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X_numerical)
simple_rf.fit(X_numerical, y_clean)
simple_lr.fit(X_scaled, y_clean)

# Test performance
X_test_numerical = X_test[numerical_features_only].fillna(X_test[numerical_features_only].median())
X_test_scaled = scaler.transform(X_test_numerical)

rf_score = simple_rf.score(X_test_numerical, y_test)
lr_score = simple_lr.score(X_test_scaled, y_test)

# Export RandomForest model
rf_export = {
    'model_type': 'RandomForestRegressor',
    'n_estimators': int(simple_rf.n_estimators),
    'feature_names': numerical_features_only,
    'feature_count': len(numerical_features_only),
    'n_features_in_': int(simple_rf.n_features_in_),
    'n_outputs_': int(simple_rf.n_outputs_),
    'performance': {
        'r2_score': float(rf_score),
        'training_samples': len(X_numerical)
    },
    'feature_statistics': convert_to_json_serializable({
        'mean': X_numerical.mean().to_dict(),
        'std': X_numerical.std().to_dict(),
        'min': X_numerical.min().to_dict(),
        'max': X_numerical.max().to_dict(),
        'median': X_numerical.median().to_dict()
    }),
    'target_statistics': {
        'mean': float(y_clean.mean()),
        'std': float(y_clean.std()),
        'min': float(y_clean.min()),
        'max': float(y_clean.max()),
        'median': float(y_clean.median())
    },
    'preprocessing': {
        'y_skewness': float(y_skewness),
        'log_transformed': bool(abs(y_skewness) > 1)
    }
}

# Add feature importance
if hasattr(simple_rf, 'feature_importances_'):
    rf_export['feature_importances'] = {k: float(v) for k, v in zip(numerical_features_only, simple_rf.feature_importances_)}

# Export LinearRegression model
lr_export = {
    'model_type': 'LinearRegression',
    'feature_names': numerical_features_only,
    'feature_count': len(numerical_features_only),
    'coefficients': [float(x) for x in simple_lr.coef_],
    'intercept': float(simple_lr.intercept_),
    'performance': {
        'r2_score': float(lr_score),
        'training_samples': len(X_numerical)
    },
    'scaler_params': {
        'mean': [float(x) for x in scaler.mean_],
        'scale': [float(x) for x in scaler.scale_],
        'var': [float(x) for x in scaler.var_]
    },
    'feature_statistics': rf_export['feature_statistics'],
    'target_statistics': rf_export['target_statistics'],
    'preprocessing': rf_export['preprocessing']
}

# Create sample predictions
sample_data = []
for _, row in X_test_numerical.head(5).iterrows():
    sample_data.append({k: float(v) for k, v in row.to_dict().items()})

sample_rf_preds = [float(x) for x in simple_rf.predict(X_test_numerical.head(5))]
sample_lr_preds = [float(x) for x in simple_lr.predict(X_test_scaled[:5])]

# Complete export with both models
complete_export = {
    'models': {
        'random_forest': rf_export,
        'linear_regression': lr_export
    },
    'sample_predictions': {
        'input_data': sample_data,
        'rf_predictions': sample_rf_preds,
        'lr_predictions': sample_lr_preds
    },
    'metadata': {
        'created_at': pd.Timestamp.now().isoformat(),
        'feature_engineering_applied': True,
        'text_data_available': True,
        'original_feature_count': len(X_train.columns),
        'simplified_feature_count': len(numerical_features_only)
    }
}

# Save JSON models
with open('streamlit_simple_model.json', 'w') as f:
    json.dump(rf_export, f, indent=2)

with open('streamlit_linear_model.json', 'w') as f:
    json.dump(lr_export, f, indent=2)

with open('streamlit_complete_model.json', 'w') as f:
    json.dump(complete_export, f, indent=2)

# Save lightweight data for Streamlit
sample_data_for_streamlit = {
    'X_train_sample': X_train.head(100).to_dict('records'),
    'feature_names': X_train.columns.tolist(),
    'categorical_features': categorical_features,
    'numerical_features': numerical_cols.tolist(),
    'preprocessor_fitted': True,
    'y_skewness': y_skewness,
    'price_stats': {
        'mean': df['price_clean'].mean(),
        'std': df['price_clean'].std(),
        'min': df['price_clean'].min(),
        'max': df['price_clean'].max()
    }
}

with open('model_data_for_streamlit.json', 'w') as f:
    json.dump(sample_data_for_streamlit, f, indent=2)

# Save preprocessor separately
with open('preprocessor_simple.pkl', 'wb') as f:
    pickle.dump(preprocessor, f, protocol=4)

# Save model state
model_state = {
    'model_type': 'voting_regressor_with_multimodal',
    'tabular_models': ['RandomForest', 'GradientBoosting', 'ExtraTrees'],
    'meta_learner': 'RandomForest',
    'text_encoder': 'DistilBERT',
    'feature_count': len(X_train.columns),
    'training_samples': len(X_train),
    'performance': {
        'tabular_r2': float(test_score),
        'multimodal_r2': float(multimodal_score)
    }
}

with open('model_state.json', 'w') as f:
    json.dump(model_state, f, indent=2)

print("‚úÖ JSON models created successfully!")

# ==============================================================================
# 11. FINAL SUMMARY
# ==============================================================================

print("\n" + "=" * 60)
print("üéâ TRAINING COMPLETE!")
print("=" * 60)
print(f"üìä TABULAR MODEL PERFORMANCE")
print(f"   R¬≤ Score: {test_score:.3f} ({test_score*100:.1f}% accuracy)")
print(f"   Cross-Validation: {cv_scores.mean():.3f} (¬±{cv_scores.std():.3f})")
print(f"   MAE: ${mae:.2f}")
print()
print(f"üîó MULTIMODAL MODEL PERFORMANCE")
print(f"   R¬≤ Score: {multimodal_score:.3f} ({multimodal_score*100:.1f}% accuracy)")
print(f"   MAE: ${multimodal_mae:.2f}")
print()
print(f"üöÄ IMPROVEMENT")
print(f"   R¬≤ Improvement: +{improvement:.1f}%")
print(f"   MAE Improvement: +{mae_improvement:.1f}%")
print()
print(f"üìÅ FILES CREATED:")
print(f"   ‚úÖ tabular_model_clean.pkl")
print(f"   ‚úÖ multimodal_model_clean.pkl") 
print(f"   ‚úÖ preprocessor_clean.pkl")
print(f"   ‚úÖ metadata_clean.pkl")
print(f"   ‚úÖ streamlit_simple_model.json")
print(f"   ‚úÖ streamlit_linear_model.json")
print(f"   ‚úÖ streamlit_complete_model.json")
print(f"   ‚úÖ model_data_for_streamlit.json")
print(f"   ‚úÖ model_state.json")
print(f"   ‚úÖ preprocessor_simple.pkl")
print("=" * 60)
print("üéØ Ready for Streamlit deployment!")

üöÄ Starting Airbnb Smart Pricing Engine Training
üìÅ Project root: /Users/adityapandey/My Files/Thesis Sri Ganesh/Data Set/7
üìä Data directory: /Users/adityapandey/My Files/Thesis Sri Ganesh/Data Set/7/data
ü§ñ Models directory: /Users/adityapandey/My Files/Thesis Sri Ganesh/Data Set/7/models
üì• Loading data...
‚úÖ Loaded 6481 listings and 293744 reviews
‚úÖ After merging: 6481 listings with review data
üîß Engineering features...
‚úÖ Feature engineering complete. Shape: (5045, 32)
ü§ñ Training ensemble models...
‚úÖ Trained ExtraTreesUltra
‚úÖ Trained GradientBoostingUltra
‚úÖ Trained RandomForestUltra
‚úÖ Tabular ensemble trained. R¬≤ = 0.857, MAE = $28.45
üîó Training multimodal model...
‚úÖ Multimodal model trained. R¬≤ = 0.864, MAE = $26.91
üöÄ Improvement: R¬≤ +0.7%, MAE +5.4%
üíæ Saving models...
üìÑ Creating JSON models for Streamlit...
‚úÖ JSON models created successfully!

üéâ TRAINING COMPLETE!
üìä TABULAR MODEL PERFORMANCE
   R¬≤ Score: 0.857 (85.7% accuracy