Let's break this down into clear steps:

1. Data Cleaning
2. Feature Engineering
3. Exploratory Data Analysis (EDA) with Visualizations
4. Outlier Detection and Processing
5. Correlation Analysis
6. Price Prediction using PyCaret

In [2]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable




In [3]:
!pip install pandas numpy matplotlib seaborn scikit-learn folium pycaret

Defaulting to user installation because normal site-packages is not writeable






In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import re
import warnings
warnings.filterwarnings('ignore')
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split
import folium
from folium.plugins import HeatMap
from pycaret.regression import *

# Global plot styling, applied once when this cell runs so that every figure
# created by the functions below shares the same look.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 (older releases spell it 'seaborn-whitegrid') - confirm
# against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
# Default colour cycle used by seaborn for categorical plots.
sns.set_palette("Set2")

# Load a CSV file and print a quick structural overview of it
def load_and_info(file_path):
    """Read ``file_path`` with ``pd.read_csv`` and print a short summary.

    The summary consists of the shape, the first five rows, the column
    dtypes and the number of missing values per column.

    Args:
        file_path: Path (or any object accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The loaded DataFrame, unmodified.
    """
    print("Loading dataset...")
    frame = pd.read_csv(file_path)
    print(f"Dataset shape: {frame.shape}")

    # Each section is a heading plus a callable producing the object to print.
    report_sections = (
        ("\nFirst 5 rows:", frame.head),
        ("\nColumn data types:", lambda: frame.dtypes),
        ("\nMissing values per column:", lambda: frame.isnull().sum()),
    )
    for heading, produce in report_sections:
        print(heading)
        print(produce())

    return frame

# Function to extract numeric values from price
def extract_numeric_price(price):
    """Parse a raw price value into a float.

    Everything except digits and dots is stripped (currency symbols,
    thousands separators, whitespace) before conversion.

    Args:
        price: Raw price value, e.g. ``'$1,234.50'``, ``75`` or ``None``.

    Returns:
        The price as a float, or ``np.nan`` when the value is missing, is the
        literal string ``'null'`` or ``'0'``, or contains nothing that can be
        parsed as a number (for example ``'N/A'`` or ``'1.2.3'``).
    """
    if pd.isna(price) or price == 'null' or price == '0':
        return np.nan

    numeric_text = re.sub(r'[^\d.]', '', str(price))
    try:
        return float(numeric_text)
    except ValueError:
        # Nothing numeric was left (or the remainder is malformed): treat the
        # value as a missing price instead of aborting the whole column parse.
        return np.nan

# Function to clean the dataset
def clean_dataset(df):
    """Return a cleaned copy of the raw listings DataFrame.

    Steps, in order:
      1. Drop the ``URL``, ``Host.ID`` and ``Images`` columns when present.
      2. Parse ``Price.Amount`` with ``extract_numeric_price`` and turn zero
         prices into NaN.
      3. Move ``Price.CurrencySymbol`` into a categorical ``CurrencyType``.
      4. Coerce every column whose name contains ``'Rating'`` to numeric.
      5. Fill missing values of float64/int64 columns with the column median
         (this includes ``Price.Amount``); columns that are entirely missing
         stay missing.
      6. In object columns, replace the literal string ``'null'`` and real
         gaps with ``'Unknown'``.
      7. Coerce coordinates to numeric, ``IsSuperHost`` to 0/1 and
         ``PersonCapacity`` to int (median-filled).

    Args:
        df: Raw listings DataFrame. It is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    print("\n--- Data Cleaning ---")

    # Work on a copy so the caller's frame is left untouched
    df_clean = df.copy()

    # Identifier-like columns that carry no analytical value
    columns_to_drop = ['URL', 'Host.ID', 'Images']
    df_clean = df_clean.drop(columns=columns_to_drop, errors='ignore')

    # Clean price columns
    if 'Price.Amount' in df_clean.columns:
        print("Cleaning price values...")
        df_clean['Price.Amount'] = df_clean['Price.Amount'].apply(extract_numeric_price)

        # Report how many prices are unusable (rows are kept, not filtered)
        null_prices = df_clean['Price.Amount'].isna().sum()
        zero_prices = (df_clean['Price.Amount'] == 0).sum()
        print(f"Found {null_prices} null prices and {zero_prices} zero prices")

        # A price of 0 is treated as missing so it is imputed below
        df_clean.loc[df_clean['Price.Amount'] == 0, 'Price.Amount'] = np.nan

    # Keep the currency as a categorical column under a clearer name
    if 'Price.CurrencySymbol' in df_clean.columns:
        df_clean['CurrencyType'] = df_clean['Price.CurrencySymbol'].astype('category')
        df_clean = df_clean.drop(columns=['Price.CurrencySymbol'])

    # Convert rating columns to numeric; unparseable entries become NaN
    rating_columns = [col for col in df_clean.columns if 'Rating' in col]
    for col in rating_columns:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Handle missing values
    print("Handling missing values...")

    # Numeric columns: fill gaps with the per-column median. Done with pandas
    # so that an empty column selection or an all-NaN column cannot break the
    # assignment; integer columns keep their dtype.
    numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_cols) > 0:
        medians = df_clean[numeric_cols].median()
        df_clean[numeric_cols] = df_clean[numeric_cols].fillna(medians)

    # Object columns: the literal string 'null' counts as missing, and every
    # missing entry is labelled 'Unknown'
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        is_null_text = df_clean[col] == 'null'
        df_clean[col] = df_clean[col].mask(is_null_text).fillna('Unknown')

    # Convert coordinates to float ('Coordinates.Longitud' is the dataset's spelling)
    if 'Coordinates.Latitude' in df_clean.columns and 'Coordinates.Longitud' in df_clean.columns:
        df_clean['Coordinates.Latitude'] = pd.to_numeric(df_clean['Coordinates.Latitude'], errors='coerce')
        df_clean['Coordinates.Longitud'] = pd.to_numeric(df_clean['Coordinates.Longitud'], errors='coerce')

    # Convert 'IsSuperHost' to binary; anything unrecognised counts as 0
    if 'IsSuperHost' in df_clean.columns:
        df_clean['IsSuperHost'] = df_clean['IsSuperHost'].map({'True': 1, 'False': 0, True: 1, False: 0})
        df_clean['IsSuperHost'] = df_clean['IsSuperHost'].fillna(0).astype(int)

    # Convert PersonCapacity to int, filling unparseable entries with the median
    if 'PersonCapacity' in df_clean.columns:
        df_clean['PersonCapacity'] = pd.to_numeric(df_clean['PersonCapacity'], errors='coerce')
        df_clean['PersonCapacity'] = df_clean['PersonCapacity'].fillna(df_clean['PersonCapacity'].median()).astype(int)

    print(f"Cleaned dataset shape: {df_clean.shape}")
    return df_clean

# Function for feature engineering (private text-mining helpers first)
_NUMBER_WORDS = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
                 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}
_NUMBER_WORD_GROUP = '(' + '|'.join(_NUMBER_WORDS) + ')'

# Patterns are tried in order; the trailing \b stops abbreviations such as
# 'br' or 'ba' from matching the start of unrelated words ('breakfast',
# 'balcony').
_BEDROOM_PATTERNS = [
    re.compile(r'(\d+)\s*(?:bedrooms?|bdrms?|br)\b'),
    re.compile(r'(\d+)\s*bed\b'),
    re.compile(r'\b' + _NUMBER_WORD_GROUP + r'\s*bedrooms?\b'),
]
_BATHROOM_PATTERNS = [
    re.compile(r'(\d+(?:\.\d+)?)\s*(?:bathrooms?|baths?|ba)\b'),
    re.compile(r'\b' + _NUMBER_WORD_GROUP + r'\s*(?:bathrooms?|baths?)\b'),
]

_ALL_TEXT_FIELDS = ('Title', 'Description', 'Amenities', 'SubDescription.Items')
_DESCRIPTIVE_FIELDS = ('Title', 'Description', 'SubDescription.Items')

# Checked in insertion order: the first type with a matching keyword wins.
_PROPERTY_TYPE_KEYWORDS = {
    'apartment': ['apartment', 'flat', 'condo'],
    'house': ['house', 'home', 'villa'],
    'riad': ['riad', 'ryad'],
    'studio': ['studio'],
    'room': ['room', 'bedroom', 'private room'],
}


def _listing_text(row, fields):
    """Join the given fields of a row into one lowercase search string."""
    return ' '.join(str(row.get(field, '')) for field in fields).lower()


def _first_count(patterns, text):
    """Return the number captured by the first matching pattern, else None."""
    for pattern in patterns:
        match = pattern.search(text)
        if match:
            token = match.group(1)
            if token in _NUMBER_WORDS:
                return _NUMBER_WORDS[token]
            return float(token) if '.' in token else int(token)
    return None


def _mentions_any(row, fields, keywords):
    """Return 1 if any keyword occurs in the row's text fields, else 0."""
    text = _listing_text(row, fields)
    return 1 if any(keyword in text for keyword in keywords) else 0


def _extract_bedrooms(row):
    """Bedroom count from the listing text, else estimated from capacity."""
    count = _first_count(_BEDROOM_PATTERNS, _listing_text(row, _ALL_TEXT_FIELDS))
    if count is not None:
        return int(count)

    capacity = row.get('PersonCapacity')
    if capacity is not None and capacity > 2:
        # Rough assumption: two guests per bedroom, capped at five bedrooms
        return max(1, min(round(capacity / 2), 5))
    return 1  # studio / one bedroom when nothing better is known


def _extract_bathrooms(row):
    """Bathroom count from the listing text, else a heuristic default."""
    text = _listing_text(row, _ALL_TEXT_FIELDS)
    count = _first_count(_BATHROOM_PATTERNS, text)
    if count is not None:
        return float(count)

    if 'private bath' in text:
        return 1
    if 'shared bath' in text:
        return 0.5
    if 'entire' in str(row.get('RoomType', '')).lower():
        return 1  # an entire home is assumed to have one bathroom

    capacity = row.get('PersonCapacity')
    if capacity is not None and capacity >= 4:
        return min(capacity / 4, 3)  # rough estimate, capped at three
    return 1


def _extract_property_type(row):
    """Classify the listing by keyword, falling back to its RoomType."""
    text = _listing_text(row, _DESCRIPTIVE_FIELDS)
    for prop_type, keywords in _PROPERTY_TYPE_KEYWORDS.items():
        if any(keyword in text for keyword in keywords):
            return prop_type

    if 'RoomType' in row.index:
        return 'apartment' if 'entire' in str(row['RoomType']).lower() else 'room'
    return 'unknown'


def feature_engineering(df):
    """Return a copy of ``df`` enriched with derived features.

    Every group of features is only created when its source columns exist:

    * ``Price.Amount`` -> ``PriceCategory`` and, together with the extracted
      bedroom count, ``PricePerBedroom``.
    * ``Amenities`` -> ``AmenityCount`` and ``Has_<Amenity>`` flags.
    * ``RoomType`` -> one-hot ``Room_*`` columns.
    * ``Rating.*`` with ``Rating.ReviewCount`` -> ``AvgRating``,
      ``RatingReliability``, ``AdjustedRating``.
    * Coordinates -> ``DistanceToCenter`` and ``LocationCategory``.
    * ``Title`` -> ``TitleLength`` and ``LuxuryTitle``.
    * Any of ``Description`` / ``Title`` / ``Amenities`` -> ``Bedrooms``,
      ``Bathrooms``, ``HasPool``, ``HasParking``, ``HasView``,
      ``PropertySizeCategory``, ``BathToBedRatio``, ``PropertyType`` and
      one-hot ``PropType_*`` columns.

    Args:
        df: Cleaned listings DataFrame. It is not modified.

    Returns:
        A new DataFrame containing the original columns plus the features.
    """
    print("\n--- Feature Engineering ---")

    df_featured = df.copy()
    has_price = 'Price.Amount' in df_featured.columns

    # Create price bins/categories
    if has_price:
        print("Creating price categories...")
        price_bins = [0, 50, 100, 150, 200, 300, 500, 1000, float('inf')]
        price_labels = ['Budget', 'Economy', 'Standard', 'Comfort', 'Premium', 'Luxury', 'Elite', 'Ultra-Luxury']
        df_featured['PriceCategory'] = pd.cut(df_featured['Price.Amount'], bins=price_bins, labels=price_labels)

    # Amenity count and flags for popular amenities
    if 'Amenities' in df_featured.columns:
        print("Processing amenities...")
        df_featured['AmenityCount'] = df_featured['Amenities'].apply(
            lambda x: len(str(x).split(',')) if pd.notna(x) and x != 'Unknown' else 0
        )

        amenities_to_check = ['wifi', 'pool', 'kitchen', 'air', 'conditioning', 'washer', 'dryer']
        for amenity in amenities_to_check:
            # Plain substring test; the keyword is bound as a default argument
            # so each lambda keeps its own value.
            df_featured[f'Has_{amenity.capitalize()}'] = df_featured['Amenities'].apply(
                lambda x, keyword=amenity: 1 if pd.notna(x) and keyword in str(x).lower() else 0
            )

    # One-hot encode RoomType (the original column is kept)
    if 'RoomType' in df_featured.columns:
        print("Processing room types...")
        room_type_dummies = pd.get_dummies(df_featured['RoomType'], prefix='Room')
        df_featured = pd.concat([df_featured, room_type_dummies], axis=1)

    # Review metrics
    rating_cols = [col for col in df_featured.columns if col.startswith('Rating.') and col != 'Rating.ReviewCount']
    if rating_cols and 'Rating.ReviewCount' in df_featured.columns:
        print("Creating review metrics...")
        df_featured['AvgRating'] = df_featured[rating_cols].mean(axis=1)

        # 0 with no reviews, approaching 1 as the review count grows
        df_featured['RatingReliability'] = 1 - (1 / (df_featured['Rating.ReviewCount'] + 1))

        # Average rating discounted by how few reviews back it
        df_featured['AdjustedRating'] = df_featured['AvgRating'] * df_featured['RatingReliability']

    # Location features
    if 'Coordinates.Latitude' in df_featured.columns and 'Coordinates.Longitud' in df_featured.columns:
        print("Creating location-based features...")
        # Assumed approximate coordinates of the Marrakesh city centre
        marrakesh_center_lat = 31.6295
        marrakesh_center_lon = -7.9811

        # Simplified distance: Euclidean distance in degrees times 111 km.
        # Longitude differences are not scaled by cos(latitude), so east-west
        # distances are overestimated.
        df_featured['DistanceToCenter'] = np.sqrt(
            (df_featured['Coordinates.Latitude'] - marrakesh_center_lat)**2 +
            (df_featured['Coordinates.Longitud'] - marrakesh_center_lon)**2
        ) * 111

        distance_bins = [0, 1, 3, 5, 10, float('inf')]
        distance_labels = ['City Center', 'Near Center', 'Mid Range', 'Outer Area', 'Remote']
        df_featured['LocationCategory'] = pd.cut(df_featured['DistanceToCenter'], bins=distance_bins, labels=distance_labels)

    # Title features
    if 'Title' in df_featured.columns:
        print("Processing listing titles...")
        df_featured['TitleLength'] = df_featured['Title'].apply(lambda x: len(str(x)) if pd.notna(x) and x != 'Unknown' else 0)

        luxury_keywords = ['luxury', 'premium', 'exclusive', 'villa', 'penthouse', 'elegant']
        df_featured['LuxuryTitle'] = df_featured['Title'].apply(
            lambda x: 1 if pd.notna(x) and any(keyword in str(x).lower() for keyword in luxury_keywords) else 0
        )

    # IsSuperHost needs no further processing here; only the progress message is kept
    if 'IsSuperHost' in df_featured.columns:
        print("Processing host information...")

    # Bedrooms, bathrooms and other features mined from the free-text columns
    if 'Description' in df_featured.columns or 'Title' in df_featured.columns or 'Amenities' in df_featured.columns:
        print("Extracting property features (bedrooms, bathrooms, etc.)...")

        df_featured['Bedrooms'] = df_featured.apply(_extract_bedrooms, axis=1)
        df_featured['Bathrooms'] = df_featured.apply(_extract_bathrooms, axis=1)
        df_featured['HasPool'] = df_featured.apply(
            lambda row: _mentions_any(row, _ALL_TEXT_FIELDS, ['pool', 'swimming', 'swim']), axis=1
        )
        df_featured['HasParking'] = df_featured.apply(
            lambda row: _mentions_any(row, _ALL_TEXT_FIELDS, ['parking', 'garage', 'car park', 'driveway']), axis=1
        )
        df_featured['HasView'] = df_featured.apply(
            lambda row: _mentions_any(
                row, _DESCRIPTIVE_FIELDS,
                ['view', 'panoramic', 'overlook', 'scenery', 'scenic', 'vista']
            ),
            axis=1
        )

        # Property size category derived from the bedroom count
        df_featured['PropertySizeCategory'] = pd.cut(
            df_featured['Bedrooms'],
            bins=[0, 1, 2, 3, 5, float('inf')],
            labels=['Studio', 'Small', 'Medium', 'Large', 'Mansion']
        )

        # Bathroom to bedroom ratio; undefined or infinite ratios become 1
        df_featured['BathToBedRatio'] = df_featured['Bathrooms'] / df_featured['Bedrooms']
        df_featured['BathToBedRatio'] = df_featured['BathToBedRatio'].fillna(1).replace([np.inf, -np.inf], 1)

        # Price per bedroom needs a price column; NaN and infinite results
        # fall back to the full price.
        if has_price:
            df_featured['PricePerBedroom'] = df_featured['Price.Amount'] / df_featured['Bedrooms']
            df_featured['PricePerBedroom'] = df_featured['PricePerBedroom'].fillna(df_featured['Price.Amount'])
            mask_inf = df_featured['PricePerBedroom'].isin([np.inf, -np.inf])
            df_featured.loc[mask_inf, 'PricePerBedroom'] = df_featured.loc[mask_inf, 'Price.Amount']

        # Property type and its one-hot encoding
        df_featured['PropertyType'] = df_featured.apply(_extract_property_type, axis=1)
        property_type_dummies = pd.get_dummies(df_featured['PropertyType'], prefix='PropType')
        df_featured = pd.concat([df_featured, property_type_dummies], axis=1)

        # Print summary of extracted features
        print(f"Number of properties by bedroom count:")
        print(df_featured['Bedrooms'].value_counts().sort_index())
        print(f"\nNumber of properties by bathroom count:")
        print(df_featured['Bathrooms'].value_counts().sort_index())
        print(f"\nProperties with pool: {df_featured['HasPool'].sum()}")
        print(f"Properties with parking: {df_featured['HasParking'].sum()}")
        print(f"Properties with view: {df_featured['HasView'].sum()}")
        print(f"\nProperty types distribution:")
        print(df_featured['PropertyType'].value_counts())

    # Reported last so the shape includes every feature added above
    print(f"Enhanced dataset shape: {df_featured.shape}")
    return df_featured

# Function for exploratory data analysis (private plotting helpers first)
def _format_price_axis(axis):
    """Show the tick labels of a matplotlib axis as whole-dollar amounts."""
    axis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))


def _save_and_show(filename):
    """Tighten the current figure's layout, save it to ``filename`` and show it."""
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()


def _plot_price_by(df, column, title, xlabel, filename, figsize, rotate_labels=False):
    """Box plot of ``Price.Amount`` grouped by ``column``, saved to ``filename``."""
    plt.figure(figsize=figsize)
    ax = sns.boxplot(x=column, y='Price.Amount', data=df)
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Price', fontsize=14)
    if rotate_labels:
        plt.xticks(rotation=45)
    _format_price_axis(ax.yaxis)
    _save_and_show(filename)


def _plot_counts(counts, title, xlabel, filename, rotate_labels=False):
    """Bar chart of a value-count Series with each count written above its bar."""
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=counts.index, y=counts.values)
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Count', fontsize=14)
    if rotate_labels:
        plt.xticks(rotation=45)
    for position, count in enumerate(counts.values):
        ax.text(position, count + 5, str(count), ha='center')
    _save_and_show(filename)


def perform_eda(df):
    """Plot and print an exploratory overview of the listings.

    Figures are saved as PNG files in the working directory and shown. When
    coordinates are available a folium heat map is written to
    ``marrakesh_price_heatmap.html``. Plots for optional columns
    (``RoomType``, ``PersonCapacity``, ``IsSuperHost``, ``Bedrooms`` /
    ``Bathrooms``, ``PropertyType``, ``PricePerBedroom``) are skipped when
    the column is absent.

    Args:
        df: Listings DataFrame; must contain ``Price.Amount``.

    Returns:
        A tuple ``(outliers, upper_bound)``: the rows whose price lies outside
        the 1.5 * IQR fences, and the upper fence itself.

    Raises:
        KeyError: If ``Price.Amount`` is missing from ``df``.
    """
    print("\n--- Exploratory Data Analysis ---")

    # Price distribution
    plt.figure(figsize=(12, 8))
    ax = sns.histplot(data=df, x='Price.Amount', kde=True, bins=30)
    plt.title('Distribution of Listing Prices in Marrakesh', fontsize=16)
    plt.xlabel('Price', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    _format_price_axis(ax.xaxis)
    _save_and_show('price_distribution.png')

    # Price statistics
    print("\nPrice Statistics:")
    price_stats = df['Price.Amount'].describe()
    print(price_stats)

    # Box plot to identify price outliers
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(x=df['Price.Amount'])
    plt.title('Box Plot of Listing Prices - Outlier Detection', fontsize=16)
    plt.xlabel('Price', fontsize=14)
    _format_price_axis(ax.xaxis)
    _save_and_show('price_boxplot_outliers.png')

    # Numerical outlier thresholds (IQR method)
    Q1 = df['Price.Amount'].quantile(0.25)
    Q3 = df['Price.Amount'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df['Price.Amount'] < lower_bound) | (df['Price.Amount'] > upper_bound)]
    print(f"\nNumber of outliers identified: {len(outliers)}")
    print(f"Lower bound: ${lower_bound:.2f}, Upper bound: ${upper_bound:.2f}")

    # Room types: distribution and price
    if 'RoomType' in df.columns:
        _plot_counts(df['RoomType'].value_counts(), 'Distribution of Room Types',
                     'Room Type', 'room_type_distribution.png', rotate_labels=True)
        _plot_price_by(df, 'RoomType', 'Price by Room Type', 'Room Type',
                       'price_by_roomtype.png', (12, 8), rotate_labels=True)

    # Person capacity: price and distribution
    if 'PersonCapacity' in df.columns:
        _plot_price_by(df, 'PersonCapacity', 'Price by Person Capacity', 'Person Capacity',
                       'price_by_capacity.png', (14, 8))
        _plot_counts(df['PersonCapacity'].value_counts().sort_index(),
                     'Distribution of Person Capacity', 'Person Capacity',
                     'capacity_distribution.png')

    # SuperHost impact on price
    if 'IsSuperHost' in df.columns:
        _plot_price_by(df, 'IsSuperHost', 'Price by SuperHost Status',
                       'Is SuperHost (1=Yes, 0=No)', 'price_by_superhost.png', (10, 6))

    # Correlation heatmap for numeric features
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    plt.figure(figsize=(14, 10))
    correlation = numeric_df.corr()
    # Boolean mask hiding the diagonal and upper triangle. Built from ones so
    # that a coefficient of exactly 0 is hidden like any other duplicate cell.
    mask = np.triu(np.ones_like(correlation, dtype=bool))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', mask=mask)
    plt.title('Correlation Heatmap', fontsize=16)
    _save_and_show('correlation_heatmap.png')

    # Map visualization of listings if coordinates are available
    geo_cols = ['Coordinates.Latitude', 'Coordinates.Longitud', 'Price.Amount']
    if all(col in df.columns for col in geo_cols):
        print("Creating map visualization of listings...")
        # Plot at most 1000 listings; the seed keeps the sample reproducible
        sample_size = min(1000, len(df))
        map_df = df.sample(sample_size, random_state=42) if len(df) > sample_size else df

        # Base map centered on Marrakesh
        m = folium.Map(location=[31.6295, -7.9811], zoom_start=13)

        # [lat, lon, weight] triples; rows missing a coordinate or the price
        # are dropped so no NaN reaches the heat map layer.
        heat_data = map_df[geo_cols].dropna().values.tolist()

        HeatMap(heat_data, radius=15).add_to(m)
        m.save('marrakesh_price_heatmap.html')
        print("Map saved as 'marrakesh_price_heatmap.html'")

    # Bedroom and bathroom plots
    if 'Bedrooms' in df.columns and 'Bathrooms' in df.columns:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        bedroom_counts = df['Bedrooms'].value_counts().sort_index()
        sns.barplot(x=bedroom_counts.index, y=bedroom_counts.values, ax=ax1)
        ax1.set_title('Distribution of Bedrooms', fontsize=14)
        ax1.set_xlabel('Number of Bedrooms', fontsize=12)
        ax1.set_ylabel('Count', fontsize=12)

        bathroom_counts = df['Bathrooms'].value_counts().sort_index()
        sns.barplot(x=bathroom_counts.index, y=bathroom_counts.values, ax=ax2)
        ax2.set_title('Distribution of Bathrooms', fontsize=14)
        ax2.set_xlabel('Number of Bathrooms', fontsize=12)
        ax2.set_ylabel('Count', fontsize=12)

        _save_and_show('bedroom_bathroom_distribution.png')

        _plot_price_by(df, 'Bedrooms', 'Price by Number of Bedrooms', 'Number of Bedrooms',
                       'price_by_bedrooms.png', (12, 6))
        _plot_price_by(df, 'Bathrooms', 'Price by Number of Bathrooms', 'Number of Bathrooms',
                       'price_by_bathrooms.png', (12, 6))

        if 'PropertyType' in df.columns:
            _plot_price_by(df, 'PropertyType', 'Price by Property Type', 'Property Type',
                           'price_by_property_type.png', (12, 6))

        # Price per bedroom distribution
        if 'PricePerBedroom' in df.columns:
            plt.figure(figsize=(10, 6))
            sns.histplot(data=df, x='PricePerBedroom', kde=True, bins=30)
            plt.title('Distribution of Price per Bedroom', fontsize=16)
            plt.xlabel('Price per Bedroom', fontsize=14)
            plt.ylabel('Frequency', fontsize=14)
            _save_and_show('price_per_bedroom_distribution.png')

    return outliers, upper_bound

# Cap extreme prices and derive an outlier-free subset
def process_outliers(df, outliers, upper_bound):
    """Produce a price-capped copy of ``df`` and a copy without high outliers.

    Two histograms are saved and shown: the capped price distribution, and a
    side-by-side comparison of original and capped prices.

    Args:
        df: Listings DataFrame containing ``Price.Amount``. Not modified.
        outliers: Accepted for interface compatibility; not used here.
        upper_bound: Prices above this value are capped (first result) or
            have their rows removed (second result).

    Returns:
        A tuple ``(df_processed, df_no_outliers)``: the full copy with an
        added ``Price.Amount_capped`` column, and the rows of that copy whose
        original price is at most ``upper_bound``.
    """
    print("\n--- Processing Outliers ---")

    capped_frame = df.copy()
    raw_prices = capped_frame['Price.Amount']

    # Price range before any treatment
    print(f"Original price range: ${raw_prices.min():.2f} - ${raw_prices.max():.2f}")

    # Variant 1: keep every row, clip the price at the upper bound
    capped_frame['Price.Amount_capped'] = raw_prices.clip(upper=upper_bound)
    capped_prices = capped_frame['Price.Amount_capped']
    print(f"After capping: ${capped_prices.min():.2f} - ${capped_prices.max():.2f}")

    # Variant 2: keep only rows whose original price is within the bound
    within_bound = capped_frame['Price.Amount'] <= upper_bound
    trimmed_frame = capped_frame[within_bound].copy()
    print(f"Dataset without outliers shape: {trimmed_frame.shape}")

    # Histogram of the capped prices
    plt.figure(figsize=(12, 8))
    sns.histplot(data=capped_frame, x='Price.Amount_capped', kde=True, bins=30)
    plt.title('Distribution of Listing Prices After Capping Outliers', fontsize=16)
    plt.xlabel('Price (Capped)', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    plt.tight_layout()
    plt.savefig('price_distribution_after_capping.png')
    plt.show()

    # Original and capped distributions side by side
    plt.figure(figsize=(14, 8))
    panels = (
        ('Price.Amount', 'blue', 'Original Price Distribution', 'Price'),
        ('Price.Amount_capped', 'green', 'Capped Price Distribution', 'Price (Capped)'),
    )
    for position, (column, colour, heading, axis_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sns.histplot(data=capped_frame, x=column, kde=True, bins=30, color=colour)
        plt.title(heading, fontsize=14)
        plt.xlabel(axis_label, fontsize=12)

    plt.tight_layout()
    plt.savefig('price_distribution_comparison.png')
    plt.show()

    return capped_frame, trimmed_frame

# Function to prepare data for modeling
def prepare_for_modeling(df):
    """Build train/test splits for price modelling.

    ``RoomType`` and ``HomeTier`` (when present) are one-hot encoded with the
    first level dropped. Free-text and identifier columns, as well as every
    column derived from the price (``PriceCategory``, ``PricePerBedroom``,
    both price columns), are excluded from the features so the target cannot
    leak into the inputs.

    Args:
        df: Feature-engineered listings DataFrame. Not modified.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, all_features, target)``
        where ``all_features`` is the list of feature column names and
        ``target`` is ``'Price.Amount_capped'`` when that column exists,
        otherwise ``'Price.Amount'``. The split is 80/20 with
        ``random_state=42``.
    """
    print("\n--- Preparing Data for Modeling ---")

    # Work on a copy so the caller's frame is left untouched
    model_df = df.copy()

    # One-hot encode the categorical columns that are actually present
    categorical_cols = ['RoomType', 'HomeTier']
    categorical_features = [col for col in categorical_cols if col in model_df.columns]
    model_df_encoded = pd.get_dummies(model_df, columns=categorical_features, drop_first=True)

    # Columns that must not be used as predictors: targets, identifiers,
    # free text, and features computed from the price itself.
    non_predictive = {
        'Price.Amount', 'Price.Amount_capped', 'PriceCategory', 'PricePerBedroom',
        'RoomID', 'Title', 'URL', 'Host.ID', 'Host.Name',
        'Description', 'Amenities', 'HouseRules.Aditional', 'HouseRules.General',
        'LocationDescriptions', 'SubDescription.Title', 'SubDescription.Items', 'Highlights',
        'CurrencyType', 'CoHosts', 'PropertyType',
    }
    all_features = [col for col in model_df_encoded.columns if col not in non_predictive]

    # Prefer the capped price as target to reduce the impact of outliers
    if 'Price.Amount_capped' in model_df_encoded.columns:
        target = 'Price.Amount_capped'
    else:
        target = 'Price.Amount'

    X = model_df_encoded[all_features]
    y = model_df_encoded[target]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"Training set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")

    return X_train, X_test, y_train, y_test, all_features, target

# Function to build and evaluate models using PyCaret
def build_models_pycaret(df, target_col):
    """Train, compare, tune and persist a regression model with PyCaret.

    The caller's dataframe is never modified. Identifier / free-text columns
    and price-derived columns other than the target are removed, remaining
    ``object`` columns are cast to ``category`` and the result is handed to
    PyCaret's ``setup``. If ``setup`` fails on the full frame it is retried
    on the float64/int64 columns only; if that fails too, or if the model
    comparison yields no model, ``build_fallback_model`` is used instead.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the column to predict.

    Returns:
        The finalized PyCaret model, the result of ``build_fallback_model``
        when PyCaret cannot be used, or ``None`` when ``target_col`` is not
        present after the column clean-up.
    """
    print("\n--- Building Models with PyCaret ---")

    # Identifier and free-text columns that are not used as model inputs
    cols_to_exclude = [
        'RoomID', 'Title', 'URL', 'Host.ID', 'Host.Name', 'Description',
        'Amenities', 'HouseRules.Aditional', 'HouseRules.General',
        'LocationDescriptions', 'SubDescription.Title', 'SubDescription.Items',
        'Highlights', 'CoHosts'
    ]

    # Price columns other than the target would leak the answer into the
    # features (e.g. raw 'Price.Amount' when predicting the capped price).
    # prepare_for_modeling lists the same columns as non-predictive;
    # 'PriceCategory' is presumably binned from the price as well.
    leakage_cols = [
        col for col in ('Price.Amount', 'Price.Amount_capped', 'PriceCategory')
        if col != target_col
    ]

    # drop() returns a new frame, so the caller's dataframe stays untouched
    model_df = df.drop(columns=cols_to_exclude + leakage_cols, errors='ignore')

    # Remaining object columns might cause errors during setup, so they are
    # converted to the category dtype
    for col in model_df.select_dtypes(include=['object']).columns:
        model_df[col] = model_df[col].astype('category')

    # Without the target there is nothing to train on
    if target_col not in model_df.columns:
        print(f"Error: Target column '{target_col}' not found in the dataframe")
        return None

    # Print the datatypes to confirm no object types remain
    print("Column data types after preparation:")
    print(model_df.dtypes)

    # Initialize PyCaret setup
    print("Setting up PyCaret environment...")
    setup_options = dict(
        target=target_col,
        normalize=True,
        transformation=True,
        session_id=42,
        verbose=False,
    )
    setup_success = False
    try:
        setup(data=model_df, **setup_options)
        setup_success = True
    except Exception as first_error:
        # Broad catch is deliberate: any setup failure triggers the retry
        print(f"Error in PyCaret setup: {first_error}")
        print("Trying alternative approach...")

        # Retry with the target plus the plain numeric columns only
        numeric_cols = [
            col
            for col in model_df.select_dtypes(include=['float64', 'int64']).columns
            if col != target_col
        ]
        simple_df = model_df[[target_col] + numeric_cols].copy()
        try:
            setup(data=simple_df, **setup_options)
            setup_success = True
        except Exception as second_error:
            print(f"Second attempt also failed: {second_error}")

    if not setup_success:
        print("PyCaret setup failed. Trying fallback approach with sklearn...")
        # Fallback to a simple sklearn model
        return build_fallback_model(model_df, target_col)

    # Compare models
    print("Comparing different regression models...")
    best_models = compare_models(sort='RMSE', n_select=3)

    # Defensive normalisation: work with a list even if a single estimator
    # comes back, and avoid an IndexError when no model could be trained
    if not isinstance(best_models, list):
        best_models = [best_models]
    if not best_models:
        print("PyCaret returned no models. Trying fallback approach with sklearn...")
        return build_fallback_model(model_df, target_col)

    # Print best model results
    print(f"\nTop {len(best_models)} Models:")
    for i, model in enumerate(best_models, 1):
        print(f"Model {i}: {model}")

    # Tune the best-ranked model
    best_model = best_models[0]
    print(f"\nTuning the best model: {best_model}")
    tuned_model = tune_model(best_model)

    # Plot feature importance (not every estimator supports it, so a failure
    # here is reported and training continues)
    try:
        plot_model(tuned_model, plot='feature', save=True)
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Could not plot feature importance: {e}")

    # Finalize the model
    final_model = finalize_model(tuned_model)

    # Save the model
    save_model(final_model, 'airbnb_price_predictor')

    print("Model saved as 'airbnb_price_predictor'")

    # Return the final model
    return final_model

# Fallback function using sklearn if PyCaret fails
def build_fallback_model(df, target_col):
    """Train plain scikit-learn regressors as a fallback for PyCaret.

    Fits a linear regression, a random forest and a gradient boosting model
    on the float64/int64 columns of ``df``, reports RMSE and R² on a 20%
    hold-out split, plots feature importances when the winning model exposes
    them, and pickles the model with the lowest RMSE to
    'airbnb_price_predictor.pkl'.

    Rows without a target are dropped, and missing or infinite feature values
    are replaced by the training-set median, because LinearRegression (among
    others) refuses NaN/inf input. The saved model therefore expects inputs
    that were imputed the same way.

    Args:
        df: DataFrame containing the features and the target column.
        target_col: Name of the column to predict.

    Returns:
        The fitted estimator with the lowest hold-out RMSE.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
        ValueError: If no usable rows or numeric feature columns remain.
    """
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score

    print("\n--- Building Models with Sklearn (Fallback) ---")

    # Rows without a target cannot be used for supervised training
    labelled = df.dropna(subset=[target_col])
    dropped_rows = len(df) - len(labelled)
    if dropped_rows:
        print(f"Dropped {dropped_rows} rows with a missing target")

    # The other price column would leak the target into the features
    leakage_cols = [
        col for col in ('Price.Amount', 'Price.Amount_capped')
        if col != target_col
    ]

    # Prepare data: numeric features only, inf treated as missing, and
    # columns without a single observed value removed
    X = (
        labelled.drop(columns=[target_col] + leakage_cols, errors='ignore')
        .select_dtypes(include=['float64', 'int64'])
        .replace([np.inf, -np.inf], np.nan)
        .dropna(axis=1, how='all')
    )
    y = labelled[target_col]

    if X.empty:
        raise ValueError("No usable numeric feature data available for the fallback model")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Impute with statistics of the training split only, so nothing from the
    # hold-out rows influences training. A column with no observed training
    # value has no median; it is filled with 0 instead.
    train_medians = X_train.median().fillna(0)
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Try different models
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42)
    }

    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        results[name] = {'model': model, 'rmse': rmse, 'r2': r2}
        print(f"{name} - RMSE: {rmse:.2f}, R²: {r2:.4f}")

    # Find the best model (lowest hold-out RMSE)
    best_model_name = min(results, key=lambda x: results[x]['rmse'])
    best_model = results[best_model_name]['model']
    print(f"\nBest model: {best_model_name}")

    # Feature importance for tree-based models
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        indices = np.argsort(importances)[::-1]  # most important first
        features = X_train.columns

        plt.figure(figsize=(12, 8))
        plt.title('Feature Importances')
        plt.barh(range(len(indices)), importances[indices], align='center')
        plt.yticks(range(len(indices)), [features[i] for i in indices])
        plt.xlabel('Relative Importance')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.show()

    # Save the model using pickle
    import pickle
    with open('airbnb_price_predictor.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    print("Model saved as 'airbnb_price_predictor.pkl'")
    return best_model

# Main function
def main(file_path):
    """Run the whole pipeline: load, clean, engineer, explore and model.

    Writes 'marrakesh_airbnb_cleaned.csv' and
    'marrakesh_airbnb_no_outliers.csv' to the working directory and then
    trains a price model on the outlier-free data.

    Args:
        file_path: Path of the raw CSV handed to ``load_and_info``.
    """
    # Load the data
    df = load_and_info(file_path)

    # Clean the data
    df_clean = clean_dataset(df)

    # Feature engineering
    df_featured = feature_engineering(df_clean)

    # Save the cleaned and featured dataset
    df_featured.to_csv('marrakesh_airbnb_cleaned.csv', index=False)
    print("Saved cleaned dataset to 'marrakesh_airbnb_cleaned.csv'")

    # Perform EDA and identify outliers
    outliers, upper_bound = perform_eda(df_featured)

    # Process outliers; only the outlier-free frame is used further down
    _, df_no_outliers = process_outliers(df_featured, outliers, upper_bound)

    # Save dataset without outliers
    df_no_outliers.to_csv('marrakesh_airbnb_no_outliers.csv', index=False)
    print("Saved dataset without outliers to 'marrakesh_airbnb_no_outliers.csv'")

    # prepare_for_modeling prints the split shapes and decides which price
    # column is the target; its train/test frames are not needed here
    # because the model builder receives the full frame
    *_, target = prepare_for_modeling(df_no_outliers)

    # Build models with PyCaret on the outlier-free dataset
    final_model = build_models_pycaret(df_no_outliers, target_col=target)

    # Do not claim success when no model came back
    if final_model is None:
        print("\nModel training failed - no model was saved.")
    else:
        print("\nProject completed successfully!")
    print("Files generated:")
    print("1. marrakesh_airbnb_cleaned.csv - Cleaned dataset")
    print("2. marrakesh_airbnb_no_outliers.csv - Dataset with outliers removed")
    print("3. Various visualization PNG files")
    print("4. marrakesh_price_heatmap.html - Geographic visualization")
    if final_model is not None:
        print("5. airbnb_price_predictor - Saved model file")

# Execute the main function
# NOTE: the input path is hard-coded to '/content/rooms_details.csv'
# (looks like a Colab working directory - adjust for other environments)
if __name__ == "__main__":
    file_path = '/content/rooms_details.csv'
    main(file_path)

ModuleNotFoundError: No module named 'pandas'