In [None]:


import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler
import numpy as np
import geopandas as gpd

def ml_optimized_foot_traffic_processor():
    """
    ML-OPTIMIZED VERSION: Global normalization for time series prediction
    INCLUDES: Taxi foot traffic + subway scores combination (65%/35% weighting)
    
    Key insight: ALL data (all years, all periods) normalized together
    This preserves temporal relationships crucial for ML learning
    """
    
    print("🤖 ML-Optimized Foot Traffic Processor")
    print("🎯 Goal: 0-10 scale (×10 on frontend) + proper time series ML")
    print("🚇 Includes: Taxi (65%) + Subway (35%) combination")
    print("📅 Data: 2020-2023 (4 years for robust ML training)")
    print("=" * 70)
    
    files = {
        2020: 'YellowTaxiYears/2020_Yellow_Taxi_Trip_Data.csv',
        2021: 'YellowTaxiYears/2021_Yellow_Taxi_Trip_Data.csv', 
        2022: 'YellowTaxiYears/2022_Yellow_Taxi_Trip_Data.csv',
        2023: 'YellowTaxiYears/2023_Yellow_Taxi_Trip_Data.csv'
    }
    
    manhattan_zones = [4, 12, 13, 14, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100, 107, 113, 114, 116, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 230, 231, 232, 233, 234]
    
    # STEP 1: Collect ALL raw counts (every year, every period, every zone)
    print("📊 STEP 1: Collecting ALL raw trip counts...")
    
    all_raw_pickup_counts = []
    all_raw_dropoff_counts = []
    raw_data_store = {}
    
    for year, file in files.items():
        print(f"   📅 Processing {year}...", end=" ")
        
        # Load and filter data
        df = pd.read_csv(file, usecols=['tpep_pickup_datetime', 'PULocationID', 'DOLocationID'])
        df['hour'] = pd.to_datetime(df['tpep_pickup_datetime'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce').dt.hour
        df = df.dropna(subset=['hour'])
        df = df[df['PULocationID'].isin(manhattan_zones) & df['DOLocationID'].isin(manhattan_zones)]
        
        # Define time periods
        periods = {
            'morning': (df['hour'] >= 6) & (df['hour'] < 12),
            'afternoon': (df['hour'] >= 12) & (df['hour'] < 18),
            'evening': (df['hour'] >= 18) & (df['hour'] < 24),
            'night': (df['hour'] >= 0) & (df['hour'] < 6)
        }
        
        year_pickup_counts = []
        year_dropoff_counts = []
        
        # Process each time period
        for period_name, period_mask in periods.items():
            period_df = df[period_mask]
            
            # Get raw counts
            pickups = period_df.groupby('PULocationID').size().reindex(manhattan_zones, fill_value=0).values
            dropoffs = period_df.groupby('DOLocationID').size().reindex(manhattan_zones, fill_value=0).values
            
            # Store for later use
            raw_data_store[f'{period_name}_{year}_pickup'] = pickups
            raw_data_store[f'{period_name}_{year}_dropoff'] = dropoffs
            
            # Add to global collection
            all_raw_pickup_counts.extend(pickups)
            all_raw_dropoff_counts.extend(dropoffs)
            year_pickup_counts.extend(pickups)
            year_dropoff_counts.extend(dropoffs)
        
        # Overall year counts
        all_pickups = df.groupby('PULocationID').size().reindex(manhattan_zones, fill_value=0).values
        all_dropoffs = df.groupby('DOLocationID').size().reindex(manhattan_zones, fill_value=0).values
        
        raw_data_store[f'average_{year}_pickup'] = all_pickups
        raw_data_store[f'average_{year}_dropoff'] = all_dropoffs
        
        all_raw_pickup_counts.extend(all_pickups)
        all_raw_dropoff_counts.extend(all_dropoffs)
        
        print(f"✅ {len(df):,} trips, {len(year_pickup_counts)} data points")
    
    # STEP 2: Global normalization strategy
    print(f"\n🔄 STEP 2: GLOBAL normalization across all data...")
    
    print(f"   📈 Raw data stats:")
    pickup_array = np.array(all_raw_pickup_counts)
    dropoff_array = np.array(all_raw_dropoff_counts)
    
    print(f"      Pickups: min={pickup_array.min()}, max={pickup_array.max()}, mean={pickup_array.mean():.1f}")
    print(f"      Dropoffs: min={dropoff_array.min()}, max={dropoff_array.max()}, mean={dropoff_array.mean():.1f}")
    
    # Strategy: MinMaxScaler to 0-10 (frontend will multiply by 10 for display)
    print(f"   🎯 Applying global 0-10 normalization...")
    
    pickup_scaler = MinMaxScaler(feature_range=(0, 10))
    dropoff_scaler = MinMaxScaler(feature_range=(0, 10))
    
    # Fit on ALL data
    pickup_scaler.fit(pickup_array.reshape(-1, 1))
    dropoff_scaler.fit(dropoff_array.reshape(-1, 1))
    
    # STEP 3: Apply normalization and create foot traffic scores
    print(f"\n⚡ STEP 3: Creating foot traffic scores...")
    
    results = {'id': range(1, len(manhattan_zones) + 1), 'GEOID': manhattan_zones}
    
    years = list(files.keys())
    periods = ['morning', 'afternoon', 'evening', 'night']
    
    # Process each year and period
    for year in years:
        for period in periods:
            # Get raw counts
            pickups = raw_data_store[f'{period}_{year}_pickup']
            dropoffs = raw_data_store[f'{period}_{year}_dropoff']
            
            # Apply GLOBAL normalization
            pickup_scaled = pickup_scaler.transform(pickups.reshape(-1, 1)).flatten()
            dropoff_scaled = dropoff_scaler.transform(dropoffs.reshape(-1, 1)).flatten()
            
            # Calculate foot traffic score (0.7 dropoff + 0.3 pickup)
            foot_traffic_score = 0.7 * dropoff_scaled + 0.3 * pickup_scaled
            results[f'{period}_{year}'] = foot_traffic_score
        
        # Process average
        pickups = raw_data_store[f'average_{year}_pickup']
        dropoffs = raw_data_store[f'average_{year}_dropoff']
        
        pickup_scaled = pickup_scaler.transform(pickups.reshape(-1, 1)).flatten()
        dropoff_scaled = dropoff_scaler.transform(dropoffs.reshape(-1, 1)).flatten()
        
        avg_score = 0.7 * dropoff_scaled + 0.3 * pickup_scaled
        results[f'average_{year}'] = avg_score
    
    # STEP 4: Spatial mapping from taxi zones to census tracts
    print(f"\n🗺️  STEP 4: Spatial mapping taxi zones → census tracts...")
    
    try:
        # Load taxi zones shapefile - checking multiple possible locations
        taxi_zones_paths = [
            "taxi_zones/taxi_zones.shp",  # Original path
            "../taxi_zones/taxi_zones.shp",  # One level up
            "../../taxi_zones/taxi_zones.shp"  # Two levels up
        ]
        
        taxi_zones = None
        for path in taxi_zones_paths:
            try:
                taxi_zones = gpd.read_file(path)
                taxi_zones = taxi_zones[taxi_zones["borough"] == "Manhattan"].copy()
                print(f"   ✅ Loaded {len(taxi_zones)} Manhattan taxi zones from {path}")
                break
            except:
                continue
        
        if taxi_zones is None:
            raise FileNotFoundError("Taxi zones shapefile not found in any expected location")
        
        # Load census tracts (your exact structure)
        census_tracts = gpd.read_file("../census tract geofiles/manhattan_census_tracts.geojson")
        census_tracts = census_tracts[["GEOID", "geometry"]].to_crs(taxi_zones.crs)
        print(f"   ✅ Loaded {len(census_tracts)} Manhattan census tracts")
        
        # Create mapping from taxi zones to census tracts using spatial overlays
        print(f"   🔄 Computing spatial overlaps...")
        
        # Method: Use spatial intersection to find overlaps
        overlaps = gpd.overlay(taxi_zones, census_tracts, how='intersection')
        overlaps['overlap_area'] = overlaps.geometry.area
        
        # For each taxi zone, find which census tracts it overlaps with
        zone_to_tract_mapping = []
        
        for location_id in manhattan_zones:
            zone_overlaps = overlaps[overlaps['LocationID'] == location_id]
            
            if len(zone_overlaps) > 0:
                # Get the tract(s) with the largest overlap area
                total_area = zone_overlaps['overlap_area'].sum()
                
                for _, overlap in zone_overlaps.iterrows():
                    # Weight by overlap area
                    weight = overlap['overlap_area'] / total_area if total_area > 0 else 1.0
                    zone_to_tract_mapping.append({
                        'LocationID': location_id,
                        'GEOID': overlap['GEOID'],
                        'weight': weight
                    })
            else:
                # Fallback: use nearest tract
                taxi_zone = taxi_zones[taxi_zones['LocationID'] == location_id]
                if len(taxi_zone) > 0:
                    zone_centroid = taxi_zone.geometry.centroid.iloc[0]
                    distances = census_tracts.geometry.distance(zone_centroid)
                    nearest_geoid = census_tracts.iloc[distances.idxmin()]['GEOID']
                    
                    zone_to_tract_mapping.append({
                        'LocationID': location_id,
                        'GEOID': nearest_geoid,
                        'weight': 1.0
                    })
        
        mapping_df = pd.DataFrame(zone_to_tract_mapping)
        print(f"   ✅ Created {len(mapping_df)} zone→tract mappings")
        
        # Apply spatial mapping to create tract-level scores
        print(f"   🔄 Aggregating taxi scores by census tract...")
        
        tract_results = {'GEOID': []}
        years = list(files.keys())
        
        # Get all unique GEOIDs
        unique_geoids = mapping_df['GEOID'].unique()
        tract_results['GEOID'] = unique_geoids
        tract_results['id'] = range(1, len(unique_geoids) + 1)
        
        # For each time period and year, aggregate scores by tract
        periods = ['morning', 'afternoon', 'evening', 'night', 'average']
        
        for period in periods:
            for year in years:
                col_name = f'{period}_{year}'
                tract_scores = []
                
                for geoid in unique_geoids:
                    # Get all taxi zones that map to this tract
                    zone_mappings = mapping_df[mapping_df['GEOID'] == geoid]
                    
                    # Calculate weighted average of taxi scores
                    weighted_score = 0
                    total_weight = 0
                    
                    for _, mapping in zone_mappings.iterrows():
                        location_id = mapping['LocationID']
                        weight = mapping['weight']
                        
                        # Find the score for this LocationID
                        zone_idx = manhattan_zones.index(location_id)
                        zone_score = results[col_name][zone_idx]
                        
                        weighted_score += zone_score * weight
                        total_weight += weight
                    
                    # Final score for this tract
                    final_score = weighted_score / total_weight if total_weight > 0 else 0
                    tract_scores.append(final_score)
                
                tract_results[col_name] = tract_scores
        
        # Replace results with tract-based data
        final_df = pd.DataFrame(tract_results)
        print(f"   ✅ Created tract-based foot traffic scores: {len(final_df)} census tracts")
        
    except FileNotFoundError as e:
        print(f"   ⚠️  Spatial files not found: {e}")
        print(f"   📝 Using simple LocationID→GEOID mapping instead...")
        
        # Fallback: Simple mapping like your project  
        try:
            import json
            # Use your exact file structure
            with open('../census tract geofiles/manhattan_census_tracts.geojson', 'r') as f:
                geojson = json.load(f)
            geoids = [feature['properties']['GEOID'] for feature in geojson['features']]
            
            # Simple distribution approach (like your project)
            result_rows = []
            locations = sorted(manhattan_zones)
            tracts_per_location = len(geoids) // len(locations)
            
            print(f"   📊 Mapping {len(locations)} LocationIDs to {len(geoids)} GEOIDs")
            print(f"   📈 Approximately {tracts_per_location} census tracts per LocationID")
            
            geoid_index = 0
            for i, location_id in enumerate(locations):
                num_geoids = tracts_per_location + (1 if i < len(geoids) % len(locations) else 0)
                location_idx = manhattan_zones.index(location_id)
                
                for j in range(num_geoids):
                    if geoid_index < len(geoids):
                        row = {'GEOID': geoids[geoid_index], 'id': geoid_index + 1}
                        
                        # Copy all scores from this LocationID
                        for col, values in results.items():
                            if col not in ['id', 'GEOID']:
                                row[col] = values[location_idx]
                        
                        result_rows.append(row)
                        geoid_index += 1
            
            final_df = pd.DataFrame(result_rows)
            print(f"   ✅ Created simple mapping: {len(final_df)} census tracts")
            
        except Exception as e2:
            print(f"   ❌ Fallback mapping failed: {e2}")
            print(f"   📝 Keeping taxi LocationIDs as GEOID")
            final_df = pd.DataFrame(results)
    
    except Exception as e:
        print(f"   ❌ Spatial mapping error: {e}")
        print(f"   📝 Keeping taxi LocationIDs as GEOID")
        final_df = pd.DataFrame(results)
    
    # STEP 5: Load subway scores and create combined scores
    print(f"\n🚇 STEP 5: Loading subway scores and creating combined scores...")
    
    try:
        # Load subway scores by census tract
        subway_scores = pd.read_csv('subway_score_by_tract.csv')
        print(f"   ✅ Loaded subway scores: {len(subway_scores)} census tracts")
        
        # Create zone-to-tract mapping (simplified - you may need a proper mapping file)
        # For now, assume each taxi zone maps to a census tract with similar ID patterns
        # You can replace this with actual spatial mapping if needed
        final_df['GEOID'] = final_df['GEOID'].astype(str)
        subway_scores['GEOID'] = subway_scores['GEOID'].astype(str)
        
        # Merge subway scores
        final_df = final_df.merge(subway_scores, on='GEOID', how='left')
        final_df['subway_score'] = final_df['subway_score'].fillna(0)
        
        # Create combined scores for each time period and year
        score_columns = [col for col in final_df.columns 
                        if col.endswith(tuple(str(y) for y in years)) 
                        and col != 'subway_score']
        
        for col in score_columns:
            combined_col = col.replace('_', '_combined_')
            final_df[combined_col] = (
                0.65 * final_df[col] + 
                0.35 * final_df['subway_score']
            ).round(3)
        
        print(f"   🔄 Created {len(score_columns)} combined scores: taxi (65%) + subway (35%)")
        print(f"   📊 Combined score range: {final_df[score_columns[0].replace('_', '_combined_')].min():.2f} - {final_df[score_columns[0].replace('_', '_combined_')].max():.2f}")
            
    except FileNotFoundError:
        print(f"   ⚠️  subway_score_by_tract.csv not found - using taxi scores only")
        print(f"   📝 Create subway scores first: run MTA subway processing")
    except Exception as e:
        print(f"   ⚠️  Error loading subway scores: {e}")
        print(f"   📝 Continuing with taxi-only scores")
    
    # STEP 6: Save and analyze results
    final_df = pd.DataFrame(results)
    final_df.to_csv('foot_traffic_ml_optimized.csv', index=False)
    
    print(f"\n💾 SAVED: foot_traffic_ml_optimized.csv")
    print(f"📊 {len(final_df)} zones × {len(final_df.columns)} columns")
    
    # STEP 7: Analyze for ML readiness
    print(f"\n🤖 ML READINESS ANALYSIS:")
    
    # Analyze taxi-only scores
    taxi_score_cols = [col for col in final_df.columns 
                      if col.endswith(tuple(str(y) for y in years)) 
                      and 'combined' not in col 
                      and col != 'subway_score']
    
    if len(taxi_score_cols) > 0:
        taxi_scores = final_df[taxi_score_cols].values.flatten()
        print(f"   🚕 Taxi scores: {taxi_scores.min():.1f} - {taxi_scores.max():.1f} (mean: {taxi_scores.mean():.1f})")
    
    # Analyze combined scores if available
    combined_score_cols = [col for col in final_df.columns if 'combined' in col]
    if len(combined_score_cols) > 0:
        combined_scores = final_df[combined_score_cols].values.flatten()
        print(f"   🚇 Combined scores: {combined_scores.min():.1f} - {combined_scores.max():.1f} (mean: {combined_scores.mean():.1f})")
        
        # Show the effect of subway combination
        if len(taxi_score_cols) > 0:
            print(f"   📈 Subway effect: Mean combined vs taxi = {combined_scores.mean():.1f} vs {taxi_scores.mean():.1f}")
    
    # Check temporal patterns (this is what was broken before!)
    print(f"\n⏰ TEMPORAL PATTERN EXAMPLE (Zone 161 - highest activity):")
    zone_161 = final_df[final_df['GEOID'] == 161].iloc[0]
    
    for period in ['morning', 'afternoon', 'evening']:
        # Show taxi scores
        taxi_cols = [f'{period}_{year}' for year in years if f'{period}_{year}' in final_df.columns]
        if taxi_cols:
            taxi_scores = [zone_161[col] for col in taxi_cols]
            frontend_scores = [s * 10 for s in taxi_scores]  # What frontend will show
            trend = "↗️" if taxi_scores[-1] > taxi_scores[0] else "↘️" if taxi_scores[-1] < taxi_scores[0] else "➡️"
            print(f"   🚕 {period:10}: {' → '.join([f'{s:.1f}' for s in taxi_scores])} (frontend: {' → '.join([f'{s:.0f}' for s in frontend_scores])}) {trend}")
        
        # Show combined scores if available
        combined_cols = [f'{period}_combined_{year}' for year in years if f'{period}_combined_{year}' in final_df.columns]
        if combined_cols:
            combined_scores = [zone_161[col] for col in combined_cols]
            frontend_combined = [s * 10 for s in combined_scores]
            trend = "↗️" if combined_scores[-1] > combined_scores[0] else "↘️" if combined_scores[-1] < combined_scores[0] else "➡️"
            print(f"   🚇 {period:10}: {' → '.join([f'{s:.1f}' for s in combined_scores])} (frontend: {' → '.join([f'{s:.0f}' for s in frontend_combined])}) {trend}")
    
    print(f"   ↑ These trends are now MEANINGFUL for ML!")
    
    # Show zones suitable for different activity levels
    print(f"\n🎯 ZONE ACTIVITY DISTRIBUTION:")
    avg_cols = [f'average_{year}' for year in years]
    final_df['overall_avg'] = final_df[avg_cols].mean(axis=1)
    
    activity_levels = {
        'Very High (8-10)': (final_df['overall_avg'] >= 8).sum(),
        'High (6-8)': ((final_df['overall_avg'] >= 6) & (final_df['overall_avg'] < 8)).sum(),
        'Medium (4-6)': ((final_df['overall_avg'] >= 4) & (final_df['overall_avg'] < 6)).sum(),
        'Low (2-4)': ((final_df['overall_avg'] >= 2) & (final_df['overall_avg'] < 4)).sum(),
        'Very Low (0-2)': (final_df['overall_avg'] < 2).sum()
    }
    
    for level, count in activity_levels.items():
        print(f"   {level}: {count} zones")
    
    # ML-specific recommendations
    print(f"\n✅ ML OPTIMIZATION RESULTS:")
    print(f"   🎯 Global normalization preserves temporal relationships")
    print(f"   📈 Year-over-year trends are now meaningful") 
    print(f"   ⏰ Time period comparisons work across years")
    print(f"   🔢 0-10 scale (frontend ×10 = your chart values)")
    print(f"   🚇 Combined taxi (65%) + subway (35%) scores when available")
    print(f"   📅 4-year dataset (2020-2023) for robust trend learning")
    print(f"   🤖 Ready for time series ML models")
    
    return final_df

def create_trend_features_for_ml(df):
    """
    Add ML-specific trend features to the optimized dataset
    """
    
    print(f"\n🔧 ADDING ML TREND FEATURES...")
    
    years = [2020, 2021, 2022, 2023]
    periods = ['morning', 'afternoon', 'evening', 'night', 'average']
    
    # Add trend features
    for period in periods:
        period_cols = [f'{period}_{year}' for year in years if f'{period}_{year}' in df.columns]
        
        if len(period_cols) >= 2:
            # Linear trend (slope)
            trends = []
            for idx, row in df.iterrows():
                values = [row[col] for col in period_cols]
                x = np.array(range(len(values)))
                trend = np.polyfit(x, values, 1)[0] if len(values) >= 2 else 0
                trends.append(trend)
            
            df[f'{period}_trend_slope'] = trends
            
            # Year-over-year growth rate
            if len(period_cols) >= 2:
                df[f'{period}_growth_rate'] = ((df[period_cols[-1]] - df[period_cols[0]]) / (df[period_cols[0]] + 1)) * 100
    
    # Add seasonal patterns
    df['prefers_morning'] = df[[f'morning_{y}' for y in years]].mean(axis=1)
    df['prefers_afternoon'] = df[[f'afternoon_{y}' for y in years]].mean(axis=1)
    df['prefers_evening'] = df[[f'evening_{y}' for y in years]].mean(axis=1)
    df['prefers_night'] = df[[f'night_{y}' for y in years]].mean(axis=1)
    
    # Find peak period for each zone
    time_cols = ['prefers_morning', 'prefers_afternoon', 'prefers_evening', 'prefers_night']
    df['peak_period'] = df[time_cols].idxmax(axis=1).str.replace('prefers_', '')
    
    # Save enhanced version
    df.to_csv('foot_traffic_ml_ready.csv', index=False)
    
    print(f"   ✅ Added trend slopes, growth rates, seasonal preferences (4-year data)")
    print(f"   💾 Saved: foot_traffic_ml_ready.csv")
    
    return df

# Main execution
if __name__ == "__main__":
    print("🚀 RUNNING ML-OPTIMIZED PROCESSOR")
    print("Goal: 0-10 decimal scores + taxi/subway combination + ML-ready temporal patterns")
    print("Data: 2020-2023 (4 years) | Combination: 65% taxi + 35% subway")
    print("📂 Run from: foot_traffic_score/ directory")
    print()
    
    # Process with global normalization
    optimized_df = ml_optimized_foot_traffic_processor()
    
    # Add ML features
    ml_ready_df = create_trend_features_for_ml(optimized_df)
    
    print(f"\n🎉 COMPLETE!")
    print(f"✅ foot_traffic_ml_optimized.csv - All scores (taxi + combined)")
    print(f"✅ foot_traffic_taxi_only.csv - Taxi scores only") 
    print(f"✅ foot_traffic_combined_only.csv - Combined scores only (if subway available)")
    print(f"✅ foot_traffic_ml_ready.csv - Enhanced with ML features") 
    print(f"📂 All files saved to: foot_traffic_score/ directory")
    print(f"📊 Frontend: multiply by 10 for display (9.3 → 93)")
    print(f"🚇 Combination: 65% taxi + 35% subway (following your project pattern)")
    print(f"🤖 Optimized for time series ML models")

🚀 RUNNING ML-OPTIMIZED PROCESSOR
Goal: 0-10 decimal scores + taxi/subway combination + ML-ready temporal patterns
Data: 2020-2023 (4 years) | Combination: 65% taxi + 35% subway
📂 Run from: foot_traffic_score/ directory

🤖 ML-Optimized Foot Traffic Processor
🎯 Goal: 0-10 scale (×10 on frontend) + proper time series ML
🚇 Includes: Taxi (65%) + Subway (35%) combination
📅 Data: 2020-2023 (4 years for robust ML training)
📊 STEP 1: Collecting ALL raw trip counts...
   📅 Processing 2020... 

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

def fixed_foot_traffic_ml_pipeline():
    """
    FIXED ML pipeline that works with your actual data structure
    """
    
    print("🚀 FIXED Foot Traffic ML Pipeline")
    print("=" * 50)
    
    # Step 1: Load data with better error handling
    print("\n📂 Step 1: Loading Data...")
    
    df = None
    use_ml_features = False
    
    try:
        df = pd.read_csv('foot_traffic_ml_ready.csv')
        print(f"✅ Loaded: foot_traffic_ml_ready.csv")
        use_ml_features = True
    except FileNotFoundError:
        try:
            df = pd.read_csv('foot_traffic_ml_optimized.csv')
            print(f"✅ Loaded: foot_traffic_ml_optimized.csv")
            use_ml_features = False
        except FileNotFoundError:
            print("❌ Could not find data files!")
            return None
    
    print(f"📊 Original dataset shape: {df.shape}")
    print(f"📊 Columns: {df.columns.tolist()}")
    print(f"📊 Sample data:")
    print(df.head())
    
    # Step 2: Reshape data properly
    print(f"\n🔄 Step 2: Reshaping Data for ML...")
    
    # Identify time period columns automatically
    time_columns = []
    for col in df.columns:
        if any(period in col for period in ['morning', 'afternoon', 'evening', 'night']) and any(str(year) in col for year in [2020, 2021, 2022, 2023]):
            time_columns.append(col)
    
    print(f"📊 Found time period columns: {len(time_columns)}")
    print(f"📊 Examples: {time_columns[:5]}")
    
    # Reshape to long format
    reshaped_data = []
    
    for _, row in df.iterrows():
        geoid = row['GEOID']
        
        for col in time_columns:
            if pd.notna(row[col]):  # Only process non-NaN values
                # Parse column name to extract period and year
                parts = col.split('_')
                if len(parts) >= 2:
                    period = parts[0]
                    year = int(parts[1])
                    score = row[col]
                    
                    # Create ML row
                    ml_row = {
                        'GEOID': geoid,
                        'year': year,
                        'time_period': period,
                        'foot_traffic_score': score
                    }
                    
                    # Add trend features if available
                    if use_ml_features:
                        trend_cols = [c for c in df.columns if 'trend' in c or 'growth' in c or 'prefers' in c]
                        for trend_col in trend_cols:
                            if trend_col in df.columns:
                                ml_row[trend_col] = row[trend_col]
                        
                        # Add overall average if available
                        if 'overall_avg' in df.columns:
                            ml_row['overall_avg'] = row['overall_avg']
                    
                    reshaped_data.append(ml_row)
    
    print(f"📊 Created {len(reshaped_data)} ML training rows")
    
    if len(reshaped_data) == 0:
        print("❌ No valid training data created!")
        print("🔍 Debug info:")
        print(f"   Time columns found: {time_columns}")
        print(f"   Sample values: {[df[col].iloc[0] if col in df.columns else 'N/A' for col in time_columns[:3]]}")
        return None
    
    # Convert to DataFrame
    ml_df = pd.DataFrame(reshaped_data)
    
    print(f"📊 ML DataFrame shape: {ml_df.shape}")
    print(f"📊 Unique GEOIDs: {ml_df['GEOID'].nunique()}")
    print(f"📊 Years: {sorted(ml_df['year'].unique())}")
    print(f"📊 Time periods: {sorted(ml_df['time_period'].unique())}")
    print(f"📊 Score range: {ml_df['foot_traffic_score'].min():.3f} to {ml_df['foot_traffic_score'].max():.3f}")
    
    # Step 3: Feature engineering
    print(f"\n🔧 Step 3: Feature Engineering...")
    
    # Encode categorical variables
    le_geoid = LabelEncoder()
    le_period = LabelEncoder()
    
    ml_df['geoid_encoded'] = le_geoid.fit_transform(ml_df['GEOID'])
    ml_df['period_encoded'] = le_period.fit_transform(ml_df['time_period'])
    
    # Create time-based features
    ml_df['year_normalized'] = (ml_df['year'] - ml_df['year'].min()) / (ml_df['year'].max() - ml_df['year'].min())
    
    # Basic features that always exist
    base_features = ['geoid_encoded', 'period_encoded', 'year_normalized']
    
    # Add trend features if available
    feature_columns = base_features.copy()
    if use_ml_features:
        trend_features = [col for col in ml_df.columns if col not in ['GEOID', 'year', 'time_period', 'foot_traffic_score'] + base_features]
        feature_columns.extend(trend_features)
        print(f"✅ Added {len(trend_features)} trend features")
    
    target_column = 'foot_traffic_score'
    
    print(f"📋 Final features ({len(feature_columns)}): {feature_columns}")
    
    # Prepare final data
    X = ml_df[feature_columns].copy()
    y = ml_df[target_column].copy()
    
    # Handle any remaining NaN values
    X = X.fillna(X.median())
    
    print(f"📊 Final training data: X={X.shape}, y={y.shape}")
    
    # Step 4: Model testing
    print(f"\n🤖 Step 4: Testing Models...")
    
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
        "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
        "GradientBoosting": GradientBoostingRegressor(random_state=42, n_estimators=100),
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(alpha=1.0),
        "DecisionTree": DecisionTreeRegressor(random_state=42),
        "KNN": KNeighborsRegressor(n_neighbors=5)
    }
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(f"📊 Training set: {X_train.shape}")
    print(f"📊 Test set: {X_test.shape}")
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Test models
    results = []
    best_model = None
    best_score = -np.inf
    best_model_obj = None
    
    for name, model in models.items():
        try:
            print(f"\n--- Training {name} ---")
            
            # Fit model
            model.fit(X_train_scaled, y_train)
            
            # Predictions
            y_pred = model.predict(X_test_scaled)
            
            # Metrics
            r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            
            results.append({
                'Model': name,
                'R²': r2,
                'MAE': mae,
                'RMSE': rmse
            })
            
            print(f"{name} → R²: {r2:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}")
            
            # Track best model
            if r2 > best_score:
                best_score = r2
                best_model = name
                best_model_obj = model
                
        except Exception as e:
            print(f"❌ {name} failed: {e}")
    
    # Step 5: Results
    print(f"\n🏆 Step 5: Results Summary")
    print("=" * 60)
    
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('R²', ascending=False)
    
    for _, row in results_df.iterrows():
        print(f"{row['Model'].ljust(20)} | R²: {row['R²']:.4f} | MAE: {row['MAE']:.4f} | RMSE: {row['RMSE']:.4f}")
    
    print(f"\n🥇 Best Model: {best_model} (R² = {best_score:.4f})")
    
    # Step 6: Generate predictions
    print(f"\n🔮 Step 6: Generating Future Predictions...")
    
    future_years = [2025, 2026, 2027]
    unique_geoids = df['GEOID'].unique()
    unique_periods = ml_df['time_period'].unique()
    
    print(f"📍 Predicting for {len(unique_geoids)} GEOIDs, {len(unique_periods)} time periods, {len(future_years)} years")
    
    # Generate predictions
    future_predictions = []
    
    for year in future_years:
        for geoid in unique_geoids:
            for period in unique_periods:
                # Create prediction row
                pred_row = {
                    'geoid_encoded': le_geoid.transform([geoid])[0],
                    'period_encoded': le_period.transform([period])[0],
                    'year_normalized': (year - ml_df['year'].min()) / (ml_df['year'].max() - ml_df['year'].min())
                }
                
                # Add trend features if available
                if use_ml_features:
                    geoid_data = df[df['GEOID'] == geoid].iloc[0]
                    for feat in trend_features:
                        if feat in geoid_data:
                            pred_row[feat] = geoid_data[feat]
                        else:
                            pred_row[feat] = 0
                
                # Make prediction
                pred_features = pd.DataFrame([pred_row])[feature_columns]
                pred_features = pred_features.fillna(pred_features.median())
                pred_scaled = scaler.transform(pred_features)
                
                prediction = best_model_obj.predict(pred_scaled)[0]
                
                future_predictions.append({
                    'GEOID': geoid,
                    'year': year,
                    'time_period': period,
                    'predicted_score': prediction
                })
    
    # Step 7: Format output
    print(f"\n📊 Step 7: Formatting Output...")
    
    pred_df = pd.DataFrame(future_predictions)
    
    # Pivot to database format
    pivot_df = pred_df.pivot_table(
        index='GEOID',
        columns=['time_period', 'year'],
        values='predicted_score',
        fill_value=0
    )
    
    # Flatten column names
    pivot_df.columns = [f'{period}_pred_{year}' for period, year in pivot_df.columns]
    pivot_df = pivot_df.reset_index()
    
    # Add average predictions
    for year in future_years:
        year_cols = [col for col in pivot_df.columns if f'pred_{year}' in col and 'average' not in col]
        if year_cols:
            pivot_df[f'average_pred_{year}'] = pivot_df[year_cols].mean(axis=1)
    
    # Ensure 0-10 range
    pred_cols = [col for col in pivot_df.columns if 'pred_' in col]
    for col in pred_cols:
        pivot_df[col] = np.clip(pivot_df[col], 0, 10)
    
    print(f"📊 Final output shape: {pivot_df.shape}")
    print(f"📊 Prediction columns: {[col for col in pivot_df.columns if 'pred_' in col]}")
    
    # Step 8: Create final format matching target structure
    print(f"\n📊 Step 8: Creating Final Database Format...")
    
    # Start fresh with clean format
    unique_geoids = df['GEOID'].unique()
    clean_df = pd.DataFrame({'GEOID': unique_geoids})
    clean_df['id'] = range(1, len(clean_df) + 1)
    
    # Add historical data (2020-2023) from original data
    time_periods_clean = ['morning', 'afternoon', 'evening']  # Match target format
    historical_years = [2020, 2021, 2022, 2023]  # Only years you actually have
    
    for period in time_periods_clean:
        for year in historical_years:
            col_name = f'{period}_{year}'
            if col_name in df.columns:
                clean_df = clean_df.merge(
                    df[['GEOID', col_name]], 
                    on='GEOID', 
                    how='left'
                )
            else:
                # Fill missing years with zeros or interpolated values
                clean_df[col_name] = 0.0
    
    # Add average columns for historical years
    for year in historical_years:
        year_cols = [f'{period}_{year}' for period in time_periods_clean if f'{period}_{year}' in clean_df.columns]
        if year_cols:
            clean_df[f'average_{year}'] = clean_df[year_cols].mean(axis=1)
        else:
            clean_df[f'average_{year}'] = 0.0
    
    # Add predictions from ML model (2024-2027)
    prediction_years = [2024, 2025, 2026, 2027] 
    
    # Generate predictions for each year
    for year in prediction_years:
        print(f"   Adding predictions for {year}...")
        
        for period in time_periods_clean:
            period_predictions = []
            
            for geoid in unique_geoids:
                # Create prediction features
                pred_row = {
                    'geoid_encoded': le_geoid.transform([geoid])[0],
                    'period_encoded': le_period.transform([period])[0],
                    'year_normalized': (year - ml_df['year'].min()) / (ml_df['year'].max() - ml_df['year'].min())
                }
                
                # Add trend features if available
                if use_ml_features:
                    geoid_data = df[df['GEOID'] == geoid].iloc[0]
                    for feat in trend_features:
                        if feat in geoid_data:
                            pred_row[feat] = geoid_data[feat]
                        else:
                            pred_row[feat] = 0
                
                # Make prediction
                pred_features = pd.DataFrame([pred_row])[feature_columns]
                pred_features = pred_features.fillna(pred_features.median())
                pred_scaled = scaler.transform(pred_features)
                
                prediction = best_model_obj.predict(pred_scaled)[0]
                period_predictions.append(prediction)
            
            # Add to clean dataframe
            if year == 2024:
                col_name = f'{period}_{year}'
            else:
                col_name = f'{period}_pred_{year}'
            
            clean_df[col_name] = period_predictions
    
    # Add average predictions
    for year in prediction_years:
        if year == 2024:
            year_cols = [f'{period}_{year}' for period in time_periods_clean]
            clean_df[f'average_{year}'] = clean_df[year_cols].mean(axis=1)
        else:
            year_cols = [f'{period}_pred_{year}' for period in time_periods_clean]
            clean_df[f'average_pred_{year}'] = clean_df[year_cols].mean(axis=1)
    
    # Scale predictions to 0-10 range
    pred_cols = []
    for year in prediction_years:
        for period in time_periods_clean:
            if year == 2024:
                pred_cols.append(f'{period}_{year}')
            else:
                pred_cols.append(f'{period}_pred_{year}')
        
        if year == 2024:
            pred_cols.append(f'average_{year}')
        else:
            pred_cols.append(f'average_pred_{year}')
    
    # Apply scaling to predictions only
    if pred_cols:
        pred_data = clean_df[pred_cols]
        current_max = pred_data.max().max()
        if current_max > 0:
            scale_factor = 10.0 / current_max
            clean_df[pred_cols] = clean_df[pred_cols] * scale_factor
            print(f"🔧 Applied scale factor: {scale_factor:.3f}")
    
    # Ensure exact column order matching target format
    expected_columns = ['id', 'GEOID']
    
    # Add time period columns in order (2020-2023 + 2024-2027)
    all_years = [2020, 2021, 2022, 2023, 2024, 'pred_2025', 'pred_2026', 'pred_2027']
    
    for period in time_periods_clean:
        for year in all_years:
            if year == 2024:
                col_name = f'{period}_{year}'
            elif isinstance(year, str):  # pred_YYYY
                col_name = f'{period}_{year}'
            else:
                col_name = f'{period}_{year}'
            
            if col_name in clean_df.columns:
                expected_columns.append(col_name)
    
    # Add average columns
    for year in all_years:
        if year == 2024:
            col_name = f'average_{year}'
        elif isinstance(year, str):  # pred_YYYY
            col_name = f'average_{year}'
        else:
            col_name = f'average_{year}'
        
        if col_name in clean_df.columns:
            expected_columns.append(col_name)
    
    # Reorder and clean up
    for col in expected_columns:
        if col not in clean_df.columns:
            clean_df[col] = 0.0
    
    final_clean_df = clean_df[expected_columns]
    
    # Save to exact target filename
    output_filename = 'tract_foot_traffic_trends_rows 1.csv'
    final_clean_df.to_csv(output_filename, index=False)
    print(f"✅ Saved: {output_filename}")
    
    print(f"📊 Final clean format: {final_clean_df.shape}")
    print(f"📊 Columns: {len(final_clean_df.columns)} (target: 34)")
    
    # Show sample
    print(f"\n📋 Sample of clean output:")
    sample_cols = ['id', 'GEOID'] + [col for col in final_clean_df.columns if '2024' in col or 'pred_2025' in col][:6]
    print(final_clean_df[sample_cols].head())
    
    print(f"\n🎉 FIXED Pipeline Complete!")
    print(f"🏆 Best model: {best_model} (R² = {best_score:.4f})")
    print(f"📊 Output ready for database!")
    
    # Show sample
    print(f"\n📋 Sample of clean output:")
    sample_cols = ['id', 'GEOID'] + [col for col in final_clean_df.columns if 'pred_2025' in col or '2024' in col][:6]
    print(final_clean_df[sample_cols].head())
    
    return {
        'best_model': best_model,
        'best_score': best_score,
        'predictions': final_clean_df,
        'encoders': {'geoid': le_geoid, 'period': le_period},
        'scaler': scaler
    }

# Run the fixed pipeline
if __name__ == "__main__":
    results = fixed_foot_traffic_ml_pipeline()

🚀 FIXED Foot Traffic ML Pipeline

📂 Step 1: Loading Data...
✅ Loaded: foot_traffic_ml_ready.csv
📊 Original dataset shape: (49, 38)
📊 Columns: ['id', 'GEOID', 'morning_2020', 'afternoon_2020', 'evening_2020', 'night_2020', 'average_2020', 'morning_2021', 'afternoon_2021', 'evening_2021', 'night_2021', 'average_2021', 'morning_2022', 'afternoon_2022', 'evening_2022', 'night_2022', 'average_2022', 'morning_2023', 'afternoon_2023', 'evening_2023', 'night_2023', 'average_2023', 'overall_avg', 'morning_trend_slope', 'morning_growth_rate', 'afternoon_trend_slope', 'afternoon_growth_rate', 'evening_trend_slope', 'evening_growth_rate', 'night_trend_slope', 'night_growth_rate', 'average_trend_slope', 'average_growth_rate', 'prefers_morning', 'prefers_afternoon', 'prefers_evening', 'prefers_night', 'peak_period']
📊 Sample data:
   id  GEOID  morning_2020  afternoon_2020  evening_2020  night_2020  \
0   1      4      0.099753        0.227694      0.276751    0.101110   
1   2     12      0.027886 