<a href="https://colab.research.google.com/github/abhy-kumar/Raydium/blob/main/Raydium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install xgboost



In [5]:
import json
import logging
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

class NASAPowerAPI:
    """Small client for the NASA POWER daily point endpoint.

    Parameters:
    - timeout: seconds to wait for the HTTP request before giving up.
      Without a timeout a stalled connection would block the caller forever.
    """

    def __init__(self, timeout: float = 60.0):
        self.base_url = "https://power.larc.nasa.gov/api/temporal/daily/point"
        self.timeout = timeout
        self.logger = self._setup_logger()

    def _setup_logger(self):
        """Return the module logger, attaching a console handler only once."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.DEBUG)  # Logger itself lets every level through
        # The console handler filters at INFO, so DEBUG records are not
        # printed by this handler.
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        # Guard against stacking a new handler on every instantiation
        # (the logger object is shared per module name).
        if not logger.handlers:
            logger.addHandler(ch)
        return logger

    def fetch_solar_data(self, lat: float, lon: float, start_date: str, end_date: str) -> Optional[Dict]:
        """
        Fetch solar and meteorological data from NASA POWER API
        Parameters:
        - lat, lon: location coordinates
        - start_date, end_date: in format 'YYYYMMDD'
        Returns:
        - the decoded JSON response as a dict, or None when the request
          fails, returns an HTTP error status, or the body is not valid JSON
          (the error is logged in that case).
        """
        parameters = [
            "ALLSKY_SFC_SW_DWN",  # Solar radiation
            "T2M",                 # Temperature at 2 meters
            "RH2M",               # Relative humidity at 2 meters
            "CLOUD_AMT",          # Cloud amount
            "ALLSKY_KT",          # Clearness index
            "WS2M"                # Wind speed at 2 meters
        ]

        params = {
            "start": start_date,
            "end": end_date,
            "latitude": lat,
            "longitude": lon,
            "community": "RE",
            "parameters": ",".join(parameters),
            "format": "JSON"
        }

        try:
            response = requests.get(self.base_url, params=params, timeout=self.timeout)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException covers connection errors, timeouts and HTTP
            # error statuses; ValueError covers an undecodable JSON body.
            self.logger.error("Error fetching data: %s", e)
            return None

        self.logger.debug(f"Fetched data for lat={lat}, lon={lon} from {start_date} to {end_date}")
        return payload

class SolarAnalyzer:
    def __init__(self):
        """Set up the API client, model placeholders, district table and validation limits."""
        self.nasa_api = NASAPowerAPI()
        self.model = None  # Populated by train_model()
        self.scaler = StandardScaler()
        self.logger = logging.getLogger(__name__)

        # Per-district metadata. 'area' is in km² (it is multiplied by
        # 1_000_000 to get m² when potential is computed) and
        # 'land_usage_factor' is the fraction of that area treated as usable.
        self.gujarat_districts = {
            'Ahmedabad': {
                'lat': 23.0225,
                'lon': 72.5714,
                'region': 'Central',
                'area': 8087,
                'land_usage_factor': 0.02
            },
            'Surat': {
                'lat': 21.1702,
                'lon': 72.8311,
                'region': 'South',
                'area': 7657,
                'land_usage_factor': 0.015
            },
            'Vadodara': {
                'lat': 22.3072,
                'lon': 73.1812,
                'region': 'Central',
                'area': 7794,
                'land_usage_factor': 0.018
            },
            'Rajkot': {
                'lat': 22.3039,
                'lon': 70.8022,
                'region': 'Saurashtra',
                'area': 11203,
                'land_usage_factor': 0.025
            },
            'Bhavnagar': {
                'lat': 21.7645,
                'lon': 72.1519,
                'region': 'Saurashtra',
                'area': 11155,
                'land_usage_factor': 0.023
            },
            'Kutch': {
                'lat': 23.2419,
                'lon': 69.6669,
                'region': 'North',
                'area': 45652,
                'land_usage_factor': 0.035
            },
            'Banaskantha': {
                'lat': 24.1747,
                'lon': 72.4367,
                'region': 'North',
                'area': 12703,
                'land_usage_factor': 0.028
            }
        }

        # Inclusive [min, max] limits per feature column; rows outside these
        # limits are dropped by validate_and_clean_data(). This is the single
        # definition of the limits (an earlier, overwritten copy was removed).
        self.constraints = {
            'temperature': {'min': -5, 'max': 55},     # Wider temperature range
            'humidity': {'min': 5, 'max': 100},        # Broader humidity range
            'solar_radiation': {'min': 0, 'max': 1500},  # Keep max solar radiation
            'cloud_cover': {'min': 0, 'max': 100},     # Full cloud cover range
            'clearness_index': {'min': 0.1, 'max': 1.0}, # Minimum clearness of 0.1
            'wind_speed': {'min': 0, 'max': 50}        # Keep wind speed range
        }

    def calculate_solar_power(self, df: pd.DataFrame, panel_efficiency: float = 0.20) -> pd.Series:
        """Calculate theoretical solar power output per square meter.

        Parameters:
        - df: frame with 'solar_radiation', 'temperature', 'cloud_cover'
          and 'clearness_index' columns. The frame is not modified.
        - panel_efficiency: fraction of incident energy converted by the
          panel, in (0, 1]. Defaults to 0.20 (20% efficient panels).

        Returns:
        - a Series aligned with df holding the estimated output in
          kWh/m²/day.

        Raises:
        - ValueError: if panel_efficiency is outside (0, 1].
        - KeyError: if a required column is missing.
        """
        if not 0 < panel_efficiency <= 1:
            raise ValueError(
                f"panel_efficiency must be in (0, 1], got {panel_efficiency!r}"
            )

        # Clamp inputs so out-of-range readings cannot produce negative or
        # runaway results.
        solar_radiation = df['solar_radiation'].clip(lower=0, upper=1500)
        temperature = df['temperature'].clip(lower=-5, upper=55)
        cloud_cover = df['cloud_cover'].clip(lower=0, upper=100) / 100  # Convert to 0-1 scale
        clearness_index = df['clearness_index'].clip(lower=0.1, upper=1.0)

        # Convert W/m² to kWh/m²/day
        # NOTE(review): this treats 'solar_radiation' as an average W/m².
        # If the column is already a daily energy total (e.g. kWh/m²/day),
        # this conversion should be dropped — confirm the unit at the source.
        energy = solar_radiation * 24 / 1000

        # Apply environmental factors with conservative impacts
        energy *= (1 - cloud_cover * 0.03)  # Full cloud cover costs 3%
        energy *= clearness_index

        # Temperature effect (efficiency drops by 0.4% per °C above 25°C)
        temp_factor = 1 - 0.004 * (temperature - 25).clip(lower=0)
        temp_factor = temp_factor.clip(lower=0.7)  # Maximum efficiency loss of 30%
        energy *= temp_factor

        # Scale incident energy by panel efficiency -> kWh/m²/day
        power = energy * panel_efficiency

        return power

    def calculate_district_potential(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate solar power potential with improved scaling"""
        # Create new columns for district data
        df['district_area'] = 0.0
        df['usable_area'] = 0.0
        df['total_potential'] = 0.0

        for district in self.gujarat_districts:
            # Calculate areas
            area_m2 = self.gujarat_districts[district]['area'] * 1_000_000  # km² to m²
            usable_area = area_m2 * self.gujarat_districts[district]['land_usage_factor']

            # Create mask for current district
            mask = df['district'] == district

            # Update values for the district
            df.loc[mask, 'district_area'] = self.gujarat_districts[district]['area']
            df.loc[mask, 'usable_area'] = usable_area / 1_000_000  # Convert back to km² for readability

            # Calculate total potential in MW (1000 kW = 1 MW)
            # Assuming 1000 W/m² standard test conditions
            df.loc[mask, 'total_potential'] = (
                df.loc[mask, 'solar_power'] *  # kWh/m²/day
                usable_area *                  # m²
                1000 /                         # Convert to Watts
                24                            # Convert to average MW
            )

        return df

    def validate_and_clean_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
        """Validate and clean data with improved handling of edge cases"""
        total_records = len(df)
        validation_stats = {'total_records': total_records, 'removed_records': {}}

        # Create a copy of the DataFrame to avoid modifying the original
        df_clean = df.copy()

        # Replace infinity and negative infinity with NaN
        df_clean = df_clean.replace([np.inf, -np.inf], np.nan)

        # Initialize a mask for valid records
        valid_mask = pd.Series(True, index=df.index)

        # Check each constraint
        for feature, limits in self.constraints.items():
            invalid_mask = (
                (df_clean[feature] < limits['min']) |
                (df_clean[feature] > limits['max']) |
                df_clean[feature].isna()
            )
            validation_stats['removed_records'][feature] = invalid_mask.sum()
            valid_mask &= ~invalid_mask

        # Apply the combined mask
        df_clean = df_clean[valid_mask]

        # Add month column for seasonal analysis
        df_clean['month'] = pd.to_datetime(df_clean['date']).dt.month

        validation_stats['records_after_cleaning'] = len(df_clean)

        # Calculate and store percentage of data retained
        validation_stats['data_retention_rate'] = (len(df_clean) / total_records) * 100

        return df_clean, validation_stats

    def train_model(self, df: pd.DataFrame) -> Dict[str, float]:
        """Fit a random-forest regressor for 'solar_power' and report its metrics.

        Rows containing any NaN are discarded first. The data is split
        80/20 with a fixed seed, features are standardised with
        self.scaler (fitted on the training part only), and the fitted
        model is stored on self.model.

        Returns a dict with 'mse', 'rmse', 'mae' and 'r2' measured on the
        held-out part, 'cv_scores' (5-fold scores on the scaled training
        part) and 'feature_importance' (feature name -> importance).
        """
        feature_names = ['temperature', 'humidity', 'cloud_cover', 'clearness_index', 'wind_speed']

        complete_rows = df.dropna()
        features = complete_rows[feature_names]
        target = complete_rows['solar_power']

        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Fit the scaler on the training part only, then reuse it for the test part
        train_scaled = self.scaler.fit(X_train).transform(X_train)
        test_scaled = self.scaler.transform(X_test)

        self.model = RandomForestRegressor(
            n_estimators=100,
            max_depth=15,
            min_samples_split=5,
            random_state=42,
        )
        self.model.fit(train_scaled, y_train)

        predictions = self.model.predict(test_scaled)

        # MSE is computed once and reused for the RMSE
        mse = mean_squared_error(y_test, predictions)
        metrics = {
            'mse': mse,
            'rmse': np.sqrt(mse),
            'mae': mean_absolute_error(y_test, predictions),
            'r2': r2_score(y_test, predictions),
            'cv_scores': cross_val_score(self.model, train_scaled, y_train, cv=5),
        }

        # Pair each input column with the importance the forest assigned to it
        metrics['feature_importance'] = {
            name: importance
            for name, importance in zip(features.columns, self.model.feature_importances_)
        }

        return metrics

    def create_visualizations(self, df: pd.DataFrame):
        """Build the four analysis figures and return them as a tuple.

        Expects df to carry 'region', 'district', 'date',
        'total_potential', 'district_area' and 'usable_area' columns.
        A 'month' column is (re)written on df as a side effect.

        Returns (regional box plot, district bar chart, map scatter,
        monthly line chart).
        """
        # Lookup table of district coordinates for the map figure
        coord_rows = [
            {'district': name, 'latitude': meta['lat'], 'longitude': meta['lon']}
            for name, meta in self.gujarat_districts.items()
        ]
        district_coords = pd.DataFrame(coord_rows)

        # 1. Regional Potential Distribution
        regional_labels = {
            'total_potential': 'Total Potential (MW)',
            'region': 'Region'
        }
        fig1 = px.box(
            df,
            x='region',
            y='total_potential',
            title='Solar Power Potential Distribution by Region',
            labels=regional_labels
        )

        # 2. District Comparison - one summary row per district
        grouped = df.groupby('district')
        district_summary = grouped.agg(
            total_potential=('total_potential', 'mean'),
            district_area=('district_area', 'first'),
            usable_area=('usable_area', 'first'),
        ).reset_index()

        # Attach coordinates; left join keeps every summarised district
        district_summary = district_summary.merge(district_coords, on='district', how='left')

        district_labels = {
            'total_potential': 'Average Potential (MW)',
            'district': 'District',
            'usable_area': 'Usable Area (km²)'
        }
        fig2 = px.bar(
            district_summary,
            x='district',
            y='total_potential',
            color='usable_area',
            title='Average Solar Potential by District',
            labels=district_labels
        )

        # 3. Geographic Distribution
        fig3 = px.scatter_mapbox(
            district_summary,
            lat='latitude',
            lon='longitude',
            size='total_potential',
            color='usable_area',
            hover_name='district',
            hover_data=['total_potential', 'district_area'],
            title='Solar Potential Distribution Across Gujarat',
            mapbox_style='carto-positron',
            size_max=25
        )

        # 4. Seasonal Variation
        df['month'] = pd.to_datetime(df['date']).dt.month
        monthly_mean = df.groupby(['district', 'month'])['total_potential'].mean()
        monthly_potential = monthly_mean.reset_index()
        monthly_labels = {
            'month': 'Month',
            'total_potential': 'Average Potential (MW)',
            'district': 'District'
        }
        fig4 = px.line(
            monthly_potential,
            x='month',
            y='total_potential',
            color='district',
            title='Monthly Solar Potential by District',
            labels=monthly_labels
        )

        return fig1, fig2, fig3, fig4

def _collect_district_records(district, info, daily_data, dates):
    """Return one record per date for which every required parameter has a value.

    daily_data maps an API parameter name to a {'YYYYMMDD': value} mapping.
    Dates lacking any required parameter are reported and skipped.
    """
    # Output column -> API parameter; insertion order fixes the column order.
    param_for_column = {
        'solar_radiation': 'ALLSKY_SFC_SW_DWN',
        'temperature': 'T2M',
        'humidity': 'RH2M',
        'cloud_cover': 'CLOUD_AMT',
        'clearness_index': 'ALLSKY_KT',
        'wind_speed': 'WS2M',
    }
    # A parameter absent from the response behaves like an empty series, so
    # every date is then reported as missing.
    series_for_column = {
        column: daily_data.get(param, {})
        for column, param in param_for_column.items()
    }

    records = []
    for date in dates:
        date_str = date.strftime('%Y%m%d')
        if not all(date_str in series for series in series_for_column.values()):
            print(f"Missing data for date {date_str} in district {district}. Skipping.")
            continue

        record = {
            'date': date,
            'district': district,
            'region': info['region'],
            'latitude': info['lat'],
            'longitude': info['lon'],
        }
        for column, series in series_for_column.items():
            record[column] = series[date_str]
        records.append(record)
    return records


def main():
    """Fetch ten years of daily data per district, analyse it, plot it and save a CSV."""
    analyzer = SolarAnalyzer()
    end_date = datetime.now()
    start_date = end_date - timedelta(days=3650)
    start_str = start_date.strftime('%Y%m%d')
    end_str = end_date.strftime('%Y%m%d')
    # Same calendar days for every district, so build the range once.
    # normalize=True pins each stamp to midnight instead of carrying the
    # wall-clock time at which the script happened to run.
    dates = pd.date_range(start=start_date, end=end_date, normalize=True)

    print("Fetching solar data from NASA POWER API...")
    data_list = []
    for district, info in analyzer.gujarat_districts.items():
        data = analyzer.nasa_api.fetch_solar_data(
            info['lat'],
            info['lon'],
            start_str,
            end_str
        )
        if data and 'properties' in data and 'parameter' in data['properties']:
            daily_data = data['properties']['parameter']
            data_list.extend(_collect_district_records(district, info, daily_data, dates))
        else:
            print(f"Invalid data structure received for district {district}.")

    if not data_list:
        print("No data fetched. Exiting program.")
        return

    df = pd.DataFrame(data_list)
    print(f"\nInitial dataframe shape: {df.shape}")
    print("\nColumns in dataframe:", df.columns.tolist())

    # Calculate solar power
    df['solar_power'] = analyzer.calculate_solar_power(df)

    # Clean and validate data
    df, validation_stats = analyzer.validate_and_clean_data(df)
    print("\nDataframe shape after cleaning:", df.shape)
    print(f"Data retention rate: {validation_stats['data_retention_rate']:.1f}%")

    if df.empty:
        # Nothing left to aggregate or plot.
        print("No valid records after cleaning. Exiting program.")
        return

    # Calculate district potential
    df = analyzer.calculate_district_potential(df)
    print("\nDataframe shape after potential calculation:", df.shape)
    print("\nFinal columns:", df.columns.tolist())

    # Print sample of data to verify calculations
    print("\nSample of final data:")
    print(df[['district', 'total_potential', 'district_area', 'usable_area']].head())

    print("\nCreating visualizations...")
    figures = analyzer.create_visualizations(df)

    # Display visualizations
    for figure in figures:
        figure.show()

    # Save data to CSV
    df.to_csv('gujarat_solar_data.csv', index=False)
    print("\nData saved to gujarat_solar_data.csv")


if __name__ == "__main__":
    main()

Fetching solar data from NASA POWER API...


DEBUG:__main__:Fetched data for lat=23.0225, lon=72.5714 from 20141101 to 20241029
DEBUG:__main__:Fetched data for lat=21.1702, lon=72.8311 from 20141101 to 20241029
DEBUG:__main__:Fetched data for lat=22.3072, lon=73.1812 from 20141101 to 20241029
DEBUG:__main__:Fetched data for lat=22.3039, lon=70.8022 from 20141101 to 20241029
DEBUG:__main__:Fetched data for lat=21.7645, lon=72.1519 from 20141101 to 20241029
DEBUG:__main__:Fetched data for lat=23.2419, lon=69.6669 from 20141101 to 20241029
DEBUG:__main__:Fetched data for lat=24.1747, lon=72.4367 from 20141101 to 20241029



Initial dataframe shape: (25557, 11)

Columns in dataframe: ['date', 'district', 'region', 'latitude', 'longitude', 'solar_radiation', 'temperature', 'humidity', 'cloud_cover', 'clearness_index', 'wind_speed']

Dataframe shape after cleaning: (24675, 13)

Dataframe shape after potential calculation: (24675, 16)

Final columns: ['date', 'district', 'region', 'latitude', 'longitude', 'solar_radiation', 'temperature', 'humidity', 'cloud_cover', 'clearness_index', 'wind_speed', 'solar_power', 'month', 'district_area', 'usable_area', 'total_potential']

Sample of final data:
    district  total_potential  district_area  usable_area
0  Ahmedabad     5.934580e+07         8087.0       161.74
1  Ahmedabad     8.793030e+07         8087.0       161.74
2  Ahmedabad     8.526361e+07         8087.0       161.74
3  Ahmedabad     1.009334e+08         8087.0       161.74
4  Ahmedabad     1.239958e+08         8087.0       161.74

Creating visualizations...



Data saved to gujarat_solar_data.csv
