<a href="https://colab.research.google.com/github/anugrahatY/RaspberryPunch/blob/main/data/raw/SolarData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================
# STEP 0: Install dependencies
# ==============================
!pip install geopandas xarray netCDF4 requests pandas

import geopandas as gpd
import xarray as xr
import pandas as pd
import requests


Collecting netCDF4
  Downloading netCDF4-1.7.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.4.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading netCDF4-1.7.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cftime, netCDF4
Successfully installed cftime-1.6.4.post1 netCDF4-1.7.2


In [4]:
# ==============================
# STEP 1: Load Solar Farm Mapping (GeoJSON from Microsoft)
# ==============================
# Make sure you upload or clone "solar-farms-mapping" repo in Colab first.
farms = gpd.read_file("/content/solar_farms_india_2021.geojson")

# Pick the first farm (you can change index)
farm = farms.iloc[0]
lat, lon = farm.geometry.centroid.y, farm.geometry.centroid.x
print(f"Selected Farm Coordinates: Lat={lat}, Lon={lon}")


Selected Farm Coordinates: Lat=19.148300269779078, Lon=79.42635625208653


In [9]:
#ds = xr.open_dataset("/content/solar_capacity_factor_2022.nc")
#print(ds)

<xarray.Dataset> Size: 580MB
Dimensions:                   (time: 7296, lat: 141, lon: 141)
Coordinates:
  * time                      (time) datetime64[ns] 58kB 2022-01-01 ... 2022-...
  * lat                       (lat) float32 564B 40.0 39.75 39.5 ... 5.25 5.0
  * lon                       (lon) float32 564B 65.0 65.25 65.5 ... 99.75 100.0
Data variables:
    capacity_factor_of_panel  (time, lat, lon) float32 580MB ...


In [11]:
# ==============================
# STEP 2: Load Zenodo Renewable Production Data (NetCDF)
# ==============================
# Upload the Zenodo NetCDF file (e.g., India_Solar_CF_Hourly_1979-2022.nc)
ds = xr.open_dataset("/content/solar_capacity_factor_2022.nc")

# Extract nearest capacity factor time-series
cf = ds.sel(lon=lon, lat=lat, method="nearest")["capacity_factor_of_panel"]

# Convert to pandas DataFrame
cf_df = cf.to_dataframe().reset_index()
cf_df = cf_df.rename(columns={"capacity_factor_of_panel": "CF"})
cf_df["Generation_MW"] = cf_df["CF"] * 50  # assume 50 MW installed capacity

print("Zenodo CF sample:")
print(cf_df.head())


Zenodo CF sample:
                 time    lat   lon            CF  Generation_MW
0 2022-01-01 00:00:00  19.25  79.5 -4.094508e-07      -0.000020
1 2022-01-01 01:00:00  19.25  79.5 -4.098415e-07      -0.000020
2 2022-01-01 02:00:00  19.25  79.5  2.524692e-02       1.262346
3 2022-01-01 03:00:00  19.25  79.5  1.746623e-01       8.733114
4 2022-01-01 04:00:00  19.25  79.5  3.620194e-01      18.100973


In [23]:
import requests
import pandas as pd

url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
params = {
    "parameters": "T2M,ALLSKY_SFC_SW_DWN",
    "start": "20200101",
    "end": "20200107",
    "latitude": lat,
    "longitude": lon,
    "community": "RE",
    "format": "CSV"
}

# Fetch data
response = requests.get(url, params=params)
with open("nasa_power.csv", "wb") as f:
    f.write(response.content)

# Load with fixed column names
col_names = ["YEAR", "MO", "DY", "HR", "Irradiance_Wm2", "Temp_C"]
weather_df = pd.read_csv("nasa_power.csv", skiprows=13, names=col_names)

# Build datetime column
weather_df["time"] = pd.to_datetime(weather_df[["YEAR", "MO", "DY", "HR"]].astype(str).agg("-".join, axis=1),
                                    format="%Y-%m-%d-%H")

# Keep only useful columns
weather_df = weather_df[["time", "Irradiance_Wm2", "Temp_C"]]

print(weather_df.head())



                 time  Irradiance_Wm2  Temp_C
0 2020-01-01 02:00:00            5.26     0.0
1 2020-01-01 03:00:00            5.09     0.0
2 2020-01-01 04:00:00            4.76     0.0
3 2020-01-01 05:00:00            4.30     0.0
4 2020-01-01 06:00:00            3.98     0.0


In [26]:
# ==============================
# STEP 3: Fetch NASA POWER Data (Irradiance + Temp)
# ==============================
# Define API URL for hourly data (example: Jan 2020)
url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
params = {
    "parameters": "T2M,ALLSKY_SFC_SW_DWN",
    "start": "20220101",
    "end": "20220131",
    "latitude": lat,
    "longitude": lon,
    "community": "RE",   # Renewable Energy community
    "format": "CSV"
}

# Fetch data
response = requests.get(url, params=params)
with open("nasa_power.csv", "wb") as f:
    f.write(response.content)

# Load with fixed column names
col_names = ["YEAR", "MO", "DY", "HR", "Irradiance_Wm2", "Temp_C"]
weather_df = pd.read_csv("nasa_power.csv", skiprows=13, names=col_names)

# Build datetime column
weather_df["time"] = pd.to_datetime(weather_df[["YEAR", "MO", "DY", "HR"]].astype(str).agg("-".join, axis=1),
                                    format="%Y-%m-%d-%H")

# Read and clean CSV
#weather_df = pd.read_csv("nasa_power.csv", skiprows=11)

# Rename columns
weather_df = weather_df.rename(columns={
    "ALLSKY_SFC_SW_DWN": "Irradiance_Wm2",
    "T2M": "Temp_C"
})
print(weather_df.columns)
print(weather_df.head())

# Parse datetime (HR goes 1–24, so shift to 0–23)
#weather_df["time"] = pd.to_datetime(
#    weather_df["YEAR"].astype(str) + "-" +
#   weather_df["MO"].astype(str) + "-" +
#    weather_df["DY"].astype(str) + " " +
#   (weather_df["HR"] - 1).astype(str) + ":00:00"
#)

# Keep only relevant columns
weather_df = weather_df[["time", "Irradiance_Wm2", "Temp_C"]]

print("NASA POWER sample:")
print(weather_df.head())


Index(['YEAR', 'MO', 'DY', 'HR', 'Irradiance_Wm2', 'Temp_C', 'time'], dtype='object')
   YEAR  MO  DY  HR  Irradiance_Wm2  Temp_C                time
0  2022   1   1   2            8.32     0.0 2022-01-01 02:00:00
1  2022   1   1   3            7.74     0.0 2022-01-01 03:00:00
2  2022   1   1   4            7.05     0.0 2022-01-01 04:00:00
3  2022   1   1   5            6.39     0.0 2022-01-01 05:00:00
4  2022   1   1   6            5.93     0.0 2022-01-01 06:00:00
NASA POWER sample:
                 time  Irradiance_Wm2  Temp_C
0 2022-01-01 02:00:00            8.32     0.0
1 2022-01-01 03:00:00            7.74     0.0
2 2022-01-01 04:00:00            7.05     0.0
3 2022-01-01 05:00:00            6.39     0.0
4 2022-01-01 06:00:00            5.93     0.0


In [27]:
# ==============================
# STEP 4: Merge Zenodo + NASA POWER
# ==============================
cf_df["time"] = pd.to_datetime(cf_df["time"])
merged = pd.merge(cf_df, weather_df, on="time", how="inner")

print("Final merged dataset sample:")
print(merged.head())

# ==============================
# STEP 5: Save to CSV
# ==============================
merged.to_csv("linked_farm_dataset.csv", index=False)
print("✅ Linked dataset saved as linked_farm_dataset.csv")


Final merged dataset sample:
                 time    lat   lon        CF  Generation_MW  Irradiance_Wm2  \
0 2022-01-01 02:00:00  19.25  79.5  0.025247       1.262346            8.32   
1 2022-01-01 03:00:00  19.25  79.5  0.174662       8.733114            7.74   
2 2022-01-01 04:00:00  19.25  79.5  0.362019      18.100973            7.05   
3 2022-01-01 05:00:00  19.25  79.5  0.516540      25.826994            6.39   
4 2022-01-01 06:00:00  19.25  79.5  0.617053      30.852661            5.93   

   Temp_C  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  
✅ Linked dataset saved as linked_farm_dataset.csv
