In [None]:
import pandas as pd
import re
import requests
import time
import os
from shapely.geometry import Point
from math import radians, cos, sin, sqrt, atan2
from fastkml import kml
import geopandas as gpd
from pykml.factory import KML_ElementMaker as KML
from lxml import etree
from shapely.geometry import Polygon


# Fetching Data

### The following cells are commented out because fetching the data took about 4 hours

In [None]:
"""
def fetch_nasa_power_data_for_point(lat, lon, start="2000-01-01", end="2024-12-31"):
    parameters = [
        "ALLSKY_SFC_SW_DWN",   # Surface Shortwave Downward Irradiance
        "ALLSKY_KT",           # Clearness Index
        "CLOUD_AMT",           # Cloud Amount
        "PRECTOTCORR"          # Precipitation
    ]

    url = (
        f"https://power.larc.nasa.gov/api/temporal/monthly/point"
        f"?start={start.replace('-', '')}&end={end.replace('-', '')}"
        f"&latitude={lat}&longitude={lon}"
        f"&community=AG"
        f"&parameters={','.join(parameters)}"
        f"&format=JSON"
    )

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        records = data["properties"]["parameter"]
        dates = list(records[parameters[0]].keys())

        results = []
        for date in dates:
            entry = {"lat": lat, "lon": lon, "date": date}
            for param in parameters:
                entry[param] = records[param][date]
            results.append(entry)

        return pd.DataFrame(results)

    except Exception as e:
        print(f"Error fetching data for ({lat}, {lon}): {e}")
        return pd.DataFrame()
   """

In [None]:
"""
df = fetch_nasa_power_data_for_point(42.002, 28.254, start="2000", end="2024")
print(df.head())
df.to_csv("nasa_test_point.csv", index=False)
"""

In [None]:
"""
df = pd.read_csv("assets/turkey_grid.csv")

batch_size = 200
total_points = len(df)

output_dir = "assets/nasa_datas"
os.makedirs(output_dir, exist_ok=True)

for batch_start in range(0, total_points, batch_size):
    batch_end = min(batch_start + batch_size, total_points)
    batch = df.iloc[batch_start:batch_end]

    all_results = []
    print(f"\nüîÑ Processing block {batch_start}‚Äì{batch_end - 1}")

    for i, row in batch.iterrows():
        lat, lon = row["lat"], row["lon"]
        print(f"‚Üí Fetching point {i+1}/{total_points}: ({lat}, {lon})")
        
        data = fetch_nasa_power_data_for_point(lat, lon, start="2020", end="2023")
        if not data.empty:
            all_results.append(data)
        time.sleep(0.5)

    if all_results:
        block_df = pd.concat(all_results, ignore_index=True)
        block_index = (batch_start // batch_size) + 1
        output_file = os.path.join(output_dir, f"nasa_block_{block_index}.csv")
        block_df.to_csv(output_file, index=False)
        print(f"‚úÖ Saved block to {output_file}")
    else:
        print(f"‚ö†Ô∏è No data collected for block {batch_start}‚Äì{batch_end - 1}")
"""


# Sorting and Merging Data

In [None]:
# Bounding box (in degrees) and spacing of the regular sampling grid.
lat_start, lat_end = 35, 43
lon_start, lon_end = 25, 46
lat_step, lon_step = 1, 1


def _axis_points(start, end, step):
    """Return the evenly spaced values from ``start`` to ``end`` (both included)."""
    node_count = int((end - start) / step) + 1
    return [round(start + node * step, 4) for node in range(node_count)]


lat_points = _axis_points(lat_start, lat_end, lat_step)
lon_points = _axis_points(lon_start, lon_end, lon_step)

# Cartesian product in row-major order: latitude varies slowest, longitude fastest.
grid_points = [(lat, lon) for lat in lat_points for lon in lon_points]

In [None]:
def create_kml(gdf, filename):
    """Write one KML placemark per point in ``gdf`` to ``filename``.

    Every entry of the ``geometry`` column becomes a placemark named
    "<lat>, <lon>" whose point coordinates are "<lon>,<lat>,0". The document is
    pretty-printed and written as UTF-8 text.
    """
    def _placemark(geom):
        # KML coordinates are longitude first, then latitude, then altitude.
        lon, lat = geom.x, geom.y
        return KML.Placemark(
            KML.name(f"{lat}, {lon}"),
            KML.Point(KML.coordinates(f"{lon},{lat},0")),
        )

    placemarks = [_placemark(geom) for geom in gdf["geometry"]]
    kml_doc = KML.kml(KML.Document(*placemarks))

    with open(filename, "w", encoding="utf-8") as handle:
        serialized = etree.tostring(kml_doc, pretty_print=True)
        handle.write(serialized.decode("utf-8"))

In [None]:
# Turn every (lat, lon) grid node into a WGS84 point; Point takes x=lon, y=lat.
geoms = [Point(x, y) for (y, x) in grid_points]
gdf = gpd.GeoDataFrame(geometry=geoms, crs="EPSG:4326")

# Export the unfiltered grid as KML.
create_kml(gdf, "assets/turkey_grid.kml")

# Expose the coordinates as plain columns and export them as CSV (lat first).
gdf["lon"] = gdf.geometry.x
gdf["lat"] = gdf.geometry.y
grid_table = gdf[["lat", "lon"]]
grid_table.to_csv("assets/turkey_grid.csv", index=False)

In [None]:

# Each entry in the first column of Turkey.csv is a whitespace-separated list of
# comma-separated vertices; the first two fields of every vertex are used as (x, y).
polygon_df = pd.read_csv("assets/Turkey.csv")

polygon_list = []
for ring_text in polygon_df.iloc[:, 0]:
    vertices = []
    for token in ring_text.strip().split():
        fields = token.split(",")
        vertices.append((float(fields[0]), float(fields[1])))
    polygon_list.append(Polygon(vertices))


def _inside_any_polygon(point):
    """Return True when ``point`` lies within at least one boundary polygon."""
    for poly in polygon_list:
        if point.within(poly):
            return True
    return False


gdf["is_inside"] = gdf.geometry.apply(_inside_any_polygon)

# Work on a copy so the column assignments below do not touch ``gdf``.
inside_points = gdf.loc[gdf["is_inside"]].copy()

# KML export of the points that fall inside the boundary.
create_kml(inside_points, "assets/Turkey_filtered_grid.kml")

# CSV export of the same points.
# NOTE(review): this file is named "istanbul_..." while the KML above and the
# later cells use "turkey_..." names — confirm the intended file name.
inside_points["lon"] = inside_points.geometry.x
inside_points["lat"] = inside_points.geometry.y
inside_points[["lat", "lon"]].to_csv("assets/istanbul_filtered_grid.csv", index=False)



In [None]:
# Rebuild the point layer from the coordinates stored in the merged NASA table.
# NOTE(review): no cell visible in this notebook writes merged_all.csv — confirm
# where that file comes from.
df_lat_lon = pd.read_csv('assets/nasa_datas_sorted/merged_all.csv')
geoms = [Point(x, y) for x, y in zip(df_lat_lon['lon'], df_lat_lon['lat'])]
gdf = gpd.GeoDataFrame(geometry=geoms, crs="EPSG:4326")

gdf['lon'] = gdf.geometry.x
gdf['lat'] = gdf.geometry.y

# Parse the boundary polygons: one whitespace-separated vertex list per entry of
# the first column, each vertex being comma-separated with x and y first.
polygon_df = pd.read_csv("assets/Turkey.csv")
polygon_list = [
    Polygon([
        (float(fields[0]), float(fields[1]))
        for fields in (token.split(",") for token in ring_text.strip().split())
    ])
    for ring_text in polygon_df.iloc[:, 0]
]

# Flag every point that lies within at least one polygon.
gdf["is_inside"] = gdf.geometry.apply(lambda pt: any(map(pt.within, polygon_list)))

inside_points = gdf.loc[gdf["is_inside"]].copy()

# KML export of the retained points.
create_kml(inside_points, "assets/Turkey_filtered_grid.kml")

# CSV export of the retained points (lat first).
inside_points["lon"] = inside_points.geometry.x
inside_points["lat"] = inside_points.geometry.y
inside_points[["lat", "lon"]].to_csv("assets/turkey_filtered_grid.csv", index=False)

In [None]:
# Reshape the long-format block files (one row per point and date) into a wide
# table with one row per (lat, lon) and one column per "<parameter>_<date>".
input_dir = 'assets/nasa_datas'
output_dir = 'assets/nasa_datas_sorted'
output_file = os.path.join(output_dir, 'nasa_original_combined.csv')
parameters = ['ALLSKY_SFC_SW_DWN', 'ALLSKY_KT', 'CLOUD_AMT', 'PRECTOTCORR']

# Discover every nasa_block_<n>.csv instead of hard-coding block 1, so grids
# that span more than one batch are not silently truncated. Files are processed
# in numeric block order.
block_name = re.compile(r'^nasa_block_(\d+)\.csv$')
block_files = []
if os.path.isdir(input_dir):
    for entry in os.listdir(input_dir):
        found = block_name.match(entry)
        if found:
            block_files.append((int(found.group(1)), entry))
block_files.sort()

transformed_dfs = []
for _, filename in block_files:
    filepath = os.path.join(input_dir, filename)
    df = pd.read_csv(filepath)

    pivot_dfs = []
    for param in parameters:
        # pivot_table averages duplicate (lat, lon, date) rows by default.
        pivot = df.pivot_table(index=['lat', 'lon'], columns='date', values=param)
        pivot.columns = [f"{param}_{col}" for col in pivot.columns]
        pivot_dfs.append(pivot)

    final_df = pd.concat(pivot_dfs, axis=1).reset_index()
    transformed_dfs.append(final_df)

# Fail with an actionable message rather than pd.concat's
# "No objects to concatenate" when the download step has not produced any file.
if not transformed_dfs:
    raise FileNotFoundError(
        f"No nasa_block_<n>.csv files found in {input_dir!r}; run the fetching cells first."
    )

combined_df = pd.concat(transformed_dfs, axis=0, ignore_index=True)

os.makedirs(output_dir, exist_ok=True)
combined_df.to_csv(output_file, index=False)

In [None]:
# Export every point of the combined NASA table as a styled KML placemark.
df_merged = pd.read_csv('assets/nasa_datas_sorted/nasa_original_combined.csv')
k = kml.KML()
doc = kml.Document(id='docid', name='Merged Points')

folder_original = kml.Folder(id='original_folder')

# Raw KML style fragment; it is spliced into the serialized document as text
# further down rather than being added through the fastkml API.
style_original = """
<Style id="original_style">
  <IconStyle>
    <color>ff0000ff</color>
    <scale>1.2</scale>
    <Icon>
      <href>http://maps.google.com/mapfiles/kml/paddle/blu-circle.png</href>
    </Icon>
  </IconStyle>
</Style>
"""

# One placemark per row, named "<lat>, <lon>" rounded to three decimals.
# NOTE(review): geometry is passed as a constructor keyword while styleUrl is
# set as a plain attribute afterwards — confirm that the installed fastkml
# version honours both.
for _, row in df_merged.iterrows():
    point = Point(float(row['lon']), float(row['lat']))
    name = f"{round(row['lat'], 3)}, {round(row['lon'], 3)}"
    placemark = kml.Placemark(id=None, name=name, description=None, geometry=point)
    placemark.styleUrl = '#original_style'
    folder_original.append(placemark)
        
doc.append(folder_original)
k.append(doc)

# Serialize, then insert the style fragment just before the closing Document tag.
# NOTE(review): the replace only takes effect if the output contains the literal
# text '</Document>' (i.e. no namespace prefix on the tag) — verify in the file.
kml_string = k.to_string(prettyprint=True)
kml_string = kml_string.replace('</Document>', f'{style_original}</Document>')

with open('assets/nasa_datas_sorted/merged_all.kml', 'w', encoding='utf-8') as f:
    f.write(kml_string)


In [None]:
# Keep only the merged NASA rows whose (lat, lon) also appears in the filtered
# grid; the join is an exact match on both coordinate columns.
df_merged = pd.read_csv('assets/nasa_datas_sorted/merged_all.csv')
df_grid = pd.read_csv('assets/turkey_filtered_grid.csv')
df_filtered = pd.merge(df_merged, df_grid, how='inner', on=['lat', 'lon'])
df_filtered.to_csv('merged_all_filtered.csv', index=False)

# Data Processing

In [6]:
# Load the grid-filtered, merged table.
df = pd.read_csv("merged_all_filtered.csv")

# Drop the "month 13" columns (<PARAM>_<YYYY>13), which hold the yearly average.
# The pattern is anchored to the end of the date suffix: a plain substring test
# for "13" would also discard real months of any year containing "13"
# (e.g. <PARAM>_201301).
annual_mask = df.columns.str.contains(r"_\d{4}13$", regex=True)
df_cleaned = df.loc[:, ~annual_mask]

# Save to a new file
df_cleaned.to_csv("merged_all_filtered_no13.csv", index=False)


## Normalization

In [7]:
# Read the table from which the yearly-average columns were removed.
df = pd.read_csv("merged_all_filtered_no13.csv")

# Parameter prefixes whose monthly columns are rescaled.
base_columns = ['ALLSKY_SFC_SW_DWN','ALLSKY_KT', 'CLOUD_AMT', 'PRECTOTCORR']


def _min_max_scale(values):
    """Min-max scale one column, rounded to 5 decimals.

    A column whose minimum equals its maximum is not rescaled; it is returned
    with its original values rounded to 5 decimals.
    """
    low = values.min()
    high = values.max()
    span = high - low
    if span != 0:
        return ((values - low) / span).round(5)
    return values.round(5)


# Coordinates are carried over unchanged; each monthly column is scaled
# independently of every other column.
normalized_columns = {'lat': df['lat'], 'lon': df['lon']}

for col_base in base_columns:
    # Matches e.g. ALLSKY_SFC_SW_DWN_202001
    monthly_name = re.compile(rf'^{col_base}_\d{{6}}$')
    normalized_columns.update(
        (col, _min_max_scale(df[col]))
        for col in df.columns
        if monthly_name.match(col)
    )

# Assemble the result and write it out.
normalized_df = pd.DataFrame(normalized_columns)
normalized_df.to_csv("all_data_normalized.csv", index=False)


In [8]:
# Data analysis: how many distinct normalized values each parameter takes
# across all of its monthly columns.
features = ['ALLSKY_SFC_SW_DWN', 'ALLSKY_KT', 'CLOUD_AMT', 'PRECTOTCORR']
unique_counts = {}
df = pd.read_csv("all_data_normalized.csv")
for feature in features:
    # Get all monthly columns for this feature
    cols = [col for col in df.columns if col.startswith(feature)]

    if not cols:
        # pd.concat raises on an empty list; a feature without columns has no values.
        unique_counts[feature] = 0
        continue

    # Stack every month into one Series and count its distinct values.
    combined = pd.concat([df[col] for col in cols])
    unique_counts[feature] = combined.nunique()

# Show result (the arrow replaces a mis-encoded character sequence)
for feat, count in unique_counts.items():
    print(f"{feat} → {count} unique values across months")


ALLSKY_SFC_SW_DWN → 27156 unique values across months
ALLSKY_KT → 3889 unique values across months
CLOUD_AMT → 28685 unique values across months
PRECTOTCORR → 22741 unique values across months


## Calculate Monthly scores for each point

In [9]:
# Load the table written by the normalization step (all_data_normalized.csv).
df = pd.read_csv("all_data_normalized.csv")

# Weighted suitability score of one observation.
def calculate_suitability(row, w0=0.8, w1=0.3, w2=-0.2, w3=-0.2):
    """Return the weighted sum of the four parameters found in ``row``.

    ``row`` is indexed by 'ALLSKY_SFC_SW_DWN', 'ALLSKY_KT', 'CLOUD_AMT' and
    'PRECTOTCORR'; ``w0``..``w3`` are the weights applied to them in that order.
    With the default weights the first two parameters raise the score and the
    last two lower it.
    """
    irradiance_term = w0 * row['ALLSKY_SFC_SW_DWN']
    clearness_term = w1 * row['ALLSKY_KT']
    cloud_term = w2 * row['CLOUD_AMT']
    precipitation_term = w3 * row['PRECTOTCORR']
    return irradiance_term + clearness_term + cloud_term + precipitation_term

# Prepare a DataFrame to store the monthly scores
monthly_scores = df[['lat', 'lon']].copy()

# Parameters that feed the suitability score; their monthly columns are named
# "<parameter>_<YYYYMM>".
score_inputs = ['ALLSKY_SFC_SW_DWN', 'ALLSKY_KT', 'CLOUD_AMT', 'PRECTOTCORR']

# Score every month of 2020-2023. calculate_suitability only does arithmetic on
# the values it is given, so whole columns are passed at once instead of calling
# it row by row through df.apply; the per-row results are the same.
for year in range(2020, 2024):
    for month in range(1, 13):
        ym = f"{year}{month:02d}"
        try:
            month_columns = {name: df[f'{name}_{ym}'] for name in score_inputs}
        except KeyError:
            # At least one parameter has no column for this month.
            print(f"Missing data for month {ym} — skipping.")
            continue
        monthly_scores[f'score_{ym}'] = calculate_suitability(month_columns).round(5)

# Identify the score columns (those that start with "score_")
score_columns = [col for col in monthly_scores.columns if col.startswith("score_")]

# Average over the months that were actually scored (up to 48; skipped months
# are not counted).
monthly_scores['avg_score'] = monthly_scores[score_columns].mean(axis=1).round(5)

# Save the result
output_path = "monthly_suitability_scores_with_avg.csv"
monthly_scores.to_csv(output_path, index=False)

output_path


'monthly_suitability_scores_with_avg.csv'

## Extract point - score pairs


In [11]:
# Reload the saved scores and take the columns from that frame, so this cell
# depends only on the CSV on disk rather than on a variable left in memory by
# an earlier cell (the loaded frame was previously unused).
df = pd.read_csv("monthly_suitability_scores_with_avg.csv")

avg_score_df = df[['lat', 'lon', 'avg_score']]

# Save to a new CSV file
output_path_avg_only = "lat_lon_score.csv" # This is the data that contains coordinates - score pairs and that will be passed to algos
avg_score_df.to_csv(output_path_avg_only, index=False)

output_path_avg_only


'lat_lon_score.csv'