<a href="https://colab.research.google.com/github/alfredamboka/FinalYearProject/blob/main/Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import ee
ee.Authenticate()
ee.Initialize()


To authorize access needed by Earth Engine, open the following URL in a web browser and follow the instructions. If the web browser does not start automatically, please manually browse the URL below.

    https://code.earthengine.google.com/client-auth?scopes=https%3A//www.googleapis.com/auth/earthengine%20https%3A//www.googleapis.com/auth/devstorage.full_control&request_id=ytXrv74wL5-CK4vYcYYQDnUN-tb95uG3gpvr3aWPPM0&tc=pXjlcLt30vNjj5tADDSiw6B3evlOn1RemzkwmUDP-wM&cc=roaLP18NI-HszlGqYQ8rOXEEfvAsR79WmOEwYe3Z5X4

The authorization workflow will generate a code, which you should paste in the box below.
Enter verification code: 4/1AfJohXko0f65WLQOwK6GANsYk2_ms0sZrcE9p-zcqtrDGSjxbvTbRFgK_yk

Successfully saved authorization token.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import ee
# ee.Authenticate()
ee.Initialize()

lulc100m = ee.Image('COPERNICUS/Landcover/100m/Proba-V-C3/Global/2019')# Land Cover between 2015 to 2019 @ 100m
lulc500m = ee.ImageCollection("MODIS/061/MCD12Q1") #Landcover between 2001 to 2023 @ 500 m
dem = ee.ImageCollection("COPERNICUS/DEM/GLO30")  # provide DEM data @30m
era5 = ee.ImageCollection("ECMWF/ERA5_LAND/DAILY_AGGR") # weather related datasets.
ndvi250m = ee.ImageCollection("MODIS/061/MOD13Q1") #from 2000 to date

poi = ee.Geometry.Point(20.51917583, -30.47471203) #For Namibia station.

point1 = ee.Geometry.Point([36.7773351, -2.392136016])
point2 = ee.Geometry.Point([36.91844621,-2.171580461])

# Create a FeatureCollection from the two point geometries
roi = ee.FeatureCollection([point1, point2])

In [4]:
#installation modules
!pip install rasterio
!pip install seaborn
try:
    import geemap
except ModuleNotFoundError:
    if 'google.colab' in str(get_ipython()):
        print('geemap not found, installing via pip in Google Colab...')
        !pip install geemap --quiet
        import geemap
    else:
        print('geemap not found, please install via conda in your environment')

Map = geemap.Map()


Collecting rasterio
  Downloading rasterio-1.3.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Collecting snuggs>=1.4.1 (from rasterio)
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Installing collected packages: snuggs, affine, rasterio
Successfully installed affine-2.4.0 rasterio-1.3.8 snuggs-1.4.7


In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
import os

In [6]:
# @title Indices Dataset

# Define a function to calculate NDVI
def calculate_ndvi(image):
    return image.normalizedDifference(['B4', 'B3'])

# Define a function to calculate NDWI
def calculate_ndwi(image):
    return image.normalizedDifference(['B2', 'B4'])

# Define a function to calculate Aridity Index
def calculate_aridity_index(image):
    ndvi = calculate_ndvi(image)
    ndwi = calculate_ndwi(image)
    aridity_index = ndvi.subtract(ndwi)
    return aridity_index

# Define the Landsat 7 collection
collection = ee.ImageCollection("LANDSAT/LE07/C01/T1") \
    .filterBounds(poi) \
    .filterDate("2000-01-01", "2023-07-20") \
    .sort("CLOUD_COVER") \
    .first()

# Apply atmospheric correction (TOA reflectance)
atmospherically_corrected = ee.Algorithms.Landsat.TOA(collection)

# Calculate the indices
ndvi_image = calculate_ndvi(atmospherically_corrected)
ndwi_image = calculate_ndwi(atmospherically_corrected)
aridity_index_image = calculate_aridity_index(atmospherically_corrected)


In [7]:
# Add the layers to the map
Map.addLayer(ndvi_image, {'min': -1, 'max': 1, 'palette': ['66FFFF', '33CCFF', '0099FF']}, 'NDVI')
Map.addLayer(ndwi_image, {'min': -1, 'max': 1, 'palette': ['0000FF', 'FFFFFF','green']}, 'NDWI')
Map.addLayer(aridity_index_image, {'min': -1, 'max': 1, 'palette': ['FF00FF', 'FFFF00','blue', 'red','green']}, 'Aridity Index')


# Add map controls
Map.addLayerControl()

# Center the map on a specific location
Map.centerObject((poi), 10)
# Display the map
Map


Map(center=[-30.474712029999992, 20.51917583], controls=(WidgetControl(options=['position', 'transparent_bg'],…

In [8]:
# @title Datasets

# #Remote- SA actual Data
# df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Oningo/Dataset/MainGeoSA.csv')
# df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')
# namibdf =df

#Kajiado
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Oningo/Dataset/mainKajiadoData.csv')
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')
kajiado = df
df.head()


Unnamed: 0,Longitude,Latitude,Discharge,Station_ID,Date
0,36.777335,-2.392136,1.26,ST1,2000-01-01
1,36.918446,-2.17158,0.74,ST2,2000-01-01
2,36.777335,-2.392136,0.93,ST1,2000-01-02
3,36.918446,-2.17158,1.12,ST2,2000-01-02
4,36.777335,-2.392136,1.29,ST1,2000-01-03


In [9]:

from multiprocessing import Pool

# Load your Earth Engine image
dem = ee.Image('CGIAR/SRTM90_V4')

# Define a dictionary to cache results for unique coordinates
results_cache = {}

# Define a function to extract elevation and calculate slope
def add_elevation_and_slope(chunk):
    result = []
    for _, row in chunk.iterrows():
        # Convert the Pandas Series to a hashable tuple
        coord_key = tuple(row[['Longitude', 'Latitude']])

        # Check if the result for this coordinate is already cached
        if coord_key in results_cache:
            result.append(results_cache[coord_key])
        else:
            point = ee.Geometry.Point([row['Longitude'], row['Latitude']])
            elevation = dem.reduceRegion(ee.Reducer.mean(), point, 30).get('elevation').getInfo()
            slope = ee.Terrain.slope(dem).reduceRegion(ee.Reducer.mean(), point, 30).get('slope').getInfo()

            # Cache the results for this coordinate
            results_cache[coord_key] = {'Elevation': elevation, 'Slope': slope}

            result.append(results_cache[coord_key])
    return result

# Split your DataFrame into chunks to be processed in parallel
def split_dataframe(df, chunk_size):
    chunks = []
    for i in range(0, len(df), chunk_size):
        chunks.append(df.iloc[i:i+chunk_size])
    return chunks

# Define the number of workers (adjust this based on your CPU cores)
num_workers = 4

# Split the DataFrame into chunks
chunks = split_dataframe(df, len(df) // num_workers)

# Initialize a pool of workers
with Pool(num_workers) as pool:
    # Use the pool to apply the function in parallel to each chunk
    result_chunks = pool.map(add_elevation_and_slope, chunks)

# Flatten the result chunks
result_list = [item for sublist in result_chunks for item in sublist]

# Create a DataFrame from the results
result_df = pd.DataFrame(result_list)

# Now, your DataFrame contains the 'Elevation' and 'Slope' columns
print(result_df.head())


   Elevation     Slope
0       1331  0.691398
1       1375  1.802466
2       1331  0.691398
3       1375  1.802466
4       1331  0.691398


In [10]:
# df1 = pd.concat([namibdf, result_df], axis=1)
# df1.head()

df1 = pd.concat([kajiado, result_df], axis=1)
df1.head()

Unnamed: 0,Longitude,Latitude,Discharge,Station_ID,Date,Elevation,Slope
0,36.777335,-2.392136,1.26,ST1,2000-01-01,1331,0.691398
1,36.918446,-2.17158,0.74,ST2,2000-01-01,1375,1.802466
2,36.777335,-2.392136,0.93,ST1,2000-01-02,1331,0.691398
3,36.918446,-2.17158,1.12,ST2,2000-01-02,1375,1.802466
4,36.777335,-2.392136,1.29,ST1,2000-01-03,1331,0.691398


In [11]:
import multiprocessing
from datetime import datetime
# Convert the 'Date' column to datetime format
df1['Date'] = pd.to_datetime(df1['Date'])

# Extract the 'Year' column from the 'Date' column
df1['Year'] = df1['Date'].dt.year

# Define a function to extract NDVI for a single point and year
def extract_ndvi(args):
    year, lat, lon = args

    year = int(year)  # Convert year to an integer
    # Create a geometry point from the coordinates
    point = ee.Geometry.Point(lon, lat)

    # Define the MODIS NDVI collection
    ndvi250m = ee.ImageCollection("MODIS/061/MOD13Q1")

    # Define the date range for the given year
    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)

    # Filter the collection for the given year and location
    ndvi_image = ndvi250m.filterDate(start_date, end_date).filterBounds(point).first()

    # Get the NDVI value at the point and convert it to a Python float
    ndvi_value = ee.Number(ndvi_image.reduceRegion(reducer=ee.Reducer.mean(), geometry=point, scale=250).get('NDVI')).getInfo()
    ndvi_value = ndvi_value * 0.0001

    return year, lat, lon, ndvi_value

# Get unique combinations of coordinates and years from the DataFrame
unique_coords_years = df1[['Year', 'Latitude', 'Longitude']].drop_duplicates()

# Define the number of processes to use (adjust as needed)
num_processes = multiprocessing.cpu_count()

# Use multiprocessing to apply the function to unique coordinates and years
with multiprocessing.Pool(processes=num_processes) as pool:
    ndvi_values = pool.map(extract_ndvi, unique_coords_years[['Year', 'Latitude', 'Longitude']].values)

# Create a DataFrame from the extracted NDVI values
ndvi_df = pd.DataFrame(ndvi_values, columns=['Year', 'Latitude', 'Longitude', 'NDVI'])

# Merge the NDVI DataFrame with the original DataFrame based on Year, Latitude, and Longitude
df1 = pd.merge(df1, ndvi_df, how='left', on=['Year', 'Latitude', 'Longitude'])
# Convert the NDVI values to the desired scale
# df1 = df1.drop(['NDVI_x', 'NDVI_y'], axis=1)


In [12]:
df1 = df1.drop(['Year'], axis=1)
print(df1.shape)
print(df1.head())

(16496, 8)
   Longitude  Latitude  Discharge  Station_ID       Date  Elevation     Slope  \
0  36.777335 -2.392136       1.26         ST1 2000-01-01       1331  0.691398   
1  36.918446 -2.171580       0.74         ST2 2000-01-01       1375  1.802466   
2  36.777335 -2.392136       0.93         ST1 2000-01-02       1331  0.691398   
3  36.918446 -2.171580       1.12         ST2 2000-01-02       1375  1.802466   
4  36.777335 -2.392136       1.29         ST1 2000-01-03       1331  0.691398   

     NDVI  
0  0.4052  
1  0.3921  
2  0.4052  
3  0.3921  
4  0.4052  


In [13]:
import multiprocessing
from datetime import datetime
import pandas as pd
import ee

# Initialize the Earth Engine
ee.Initialize()

# Convert the 'Date' column to datetime format
df1['Date'] = pd.to_datetime(df1['Date'])

# Extract the 'Year' column from the 'Date' column
df1['Year'] = df1['Date'].dt.year

def extract_indices(args):
    year, lat, lon = args

    year = int(year)  # Convert year to an integer
    # Create a geometry point from the coordinates
    point = ee.Geometry.Point(lon, lat)

    if year >= 2013:
        # Landsat 8 for 2013 and later years
        landsat_collection = ee.ImageCollection('LANDSAT/LC08/C01/T1_TOA').filterDate(str(year)+'-01-01', str(year)+'-12-31').filterBounds(point)
    elif year >= 2000:
        # Landsat 7 for 2000 to 2012
        landsat_collection = ee.ImageCollection('LANDSAT/LE07/C01/T1_TOA').filterDate(str(year)+'-01-01', str(year)+'-12-31').filterBounds(point)
    else:
        # Landsat 5 for years before 2000
        landsat_collection = ee.ImageCollection('LANDSAT/LT05/C01/T1_TOA').filterDate(str(year)+'-01-01', str(year)+'-12-31').filterBounds(point)

    # Check if there are images in the collection for the given year
    image_count = landsat_collection.size().getInfo()

    if image_count > 0:
        # Calculate NDVI30m
        ndvi_image = landsat_collection.select(['B4', 'B3']).reduce(ee.Reducer.mean())
        ndvi_value = ndvi_image.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point,
            scale=30  # Resolution in meters
        ).get('B4_mean').getInfo()

        # Calculate NDWI
        ndwi_image = landsat_collection.select(['B3', 'B5']).reduce(ee.Reducer.mean())
        ndwi_value = ndwi_image.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point,
            scale=30  # Resolution in meters
        ).get('B3_mean').getInfo()

        # Calculate Aridity index
        ndii_image = landsat_collection.select(['B5', 'B4']).reduce(ee.Reducer.mean())
        aridity_value = ndii_image.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point,
            scale=30  # Resolution in meters
        ).get('B5_mean').getInfo()

    else:
        # Set default values if no Landsat image is available for the year
        ndvi_value = None
        ndwi_value = None
        aridity_value = None

    return year, lat, lon, ndvi_value, ndwi_value, aridity_value

# Get unique combinations of coordinates and years from the DataFrame
unique_coords_years = df1[['Year', 'Latitude', 'Longitude']].drop_duplicates()

# Define the number of processes to use (adjust as needed)
num_processes = multiprocessing.cpu_count()

# Use multiprocessing to apply the function to unique coordinates and years
with multiprocessing.Pool(processes=num_processes) as pool:
    indices_values = pool.map(extract_indices, unique_coords_years[['Year', 'Latitude', 'Longitude']].values)

# Create a DataFrame from the extracted indices values
indices_df = pd.DataFrame(indices_values, columns=['Year', 'Latitude', 'Longitude', 'NDVI30m', 'NDWI', 'Aridity'])

# Merge the indices DataFrame with the original DataFrame based on Year, Latitude, and Longitude
df1 = pd.merge(df1, indices_df, how='left', on=['Year', 'Latitude', 'Longitude'])
df1 = df1.drop(['Year'], axis=1)

In [14]:
print(df1.head(n=5))
df1.shape

   Longitude  Latitude  Discharge  Station_ID       Date  Elevation     Slope  \
0  36.777335 -2.392136       1.26         ST1 2000-01-01       1331  0.691398   
1  36.918446 -2.171580       0.74         ST2 2000-01-01       1375  1.802466   
2  36.777335 -2.392136       0.93         ST1 2000-01-02       1331  0.691398   
3  36.918446 -2.171580       1.12         ST2 2000-01-02       1375  1.802466   
4  36.777335 -2.392136       1.29         ST1 2000-01-03       1331  0.691398   

     NDVI   NDVI30m     NDWI   Aridity  
0  0.4052  0.405517  0.24050  0.345799  
1  0.3921  0.347359  0.19883  0.303815  
2  0.4052  0.405517  0.24050  0.345799  
3  0.3921  0.347359  0.19883  0.303815  
4  0.4052  0.405517  0.24050  0.345799  


(16496, 11)

In [15]:
print(df1.head())
print(df1.tail())
print(df1.shape)

   Longitude  Latitude  Discharge  Station_ID       Date  Elevation     Slope  \
0  36.777335 -2.392136       1.26         ST1 2000-01-01       1331  0.691398   
1  36.918446 -2.171580       0.74         ST2 2000-01-01       1375  1.802466   
2  36.777335 -2.392136       0.93         ST1 2000-01-02       1331  0.691398   
3  36.918446 -2.171580       1.12         ST2 2000-01-02       1375  1.802466   
4  36.777335 -2.392136       1.29         ST1 2000-01-03       1331  0.691398   

     NDVI   NDVI30m     NDWI   Aridity  
0  0.4052  0.405517  0.24050  0.345799  
1  0.3921  0.347359  0.19883  0.303815  
2  0.4052  0.405517  0.24050  0.345799  
3  0.3921  0.347359  0.19883  0.303815  
4  0.4052  0.405517  0.24050  0.345799  
       Longitude  Latitude  Discharge  Station_ID       Date  Elevation  \
16491  36.918446 -2.171580       0.32         ST2 2023-07-29       1375   
16492  36.777335 -2.392136       0.41         ST1 2023-07-30       1331   
16493  36.918446 -2.171580       0.12     

In [None]:
import multiprocessing
import pandas as pd
import ee

# Initialize Earth Engine
ee.Initialize()

# Function to extract ERA5 data for a specific date
def extract_era5(args):
    date, lat, lon = args

    # Create a geometry point from the coordinates
    point = ee.Geometry.Point(lon, lat)

    # Define the ERA5 collection
    era5 = ee.ImageCollection("ECMWF/ERA5_LAND/DAILY_AGGR")

    # Convert the date to an Earth Engine date object
    ee_date = ee.Date(date)

    # Filter the collection for the given date and location
    era5_image = era5.filterDate(ee_date, ee_date.advance(1, 'day')).filterBounds(point).first()

    if era5_image is not None:
        # Select the desired bands
        selected_bands = ["surface_net_thermal_radiation_sum", "potential_evaporation_sum", "surface_pressure",
                          "total_precipitation_sum", "temperature_2m_min", "temperature_2m_max",
                          "leaf_area_index_high_vegetation", "leaf_area_index_low_vegetation"]

        # Get the values for the selected bands at the point and convert them to Python floats
        era5_values = ee.Dictionary(era5_image.select(selected_bands).reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point,
            scale=1000  # Adjust the scale as needed
        )).getInfo()

        era5_values['Date'] = date  # Add the date to the extracted data

        return era5_values
    else:
        return None

# Get unique combinations of coordinates and years from the DataFrame
unique_coords_years = df1[['Date', 'Latitude', 'Longitude']].drop_duplicates()

# Define the number of processes to use (adjust as needed)
num_processes = multiprocessing.cpu_count()

# Use multiprocessing to apply the function to unique coordinates, years, and dates
with multiprocessing.Pool(processes=num_processes) as pool:
    era5_values = pool.map(extract_era5, [(date, lat, lon) for date, lat, lon in
                                         zip(unique_coords_years['Date'], unique_coords_years['Latitude'], unique_coords_years['Longitude'])])

# Filter out None values (dates with no data)
era5_values = [x for x in era5_values if x is not None]

# Create a DataFrame from the extracted ERA5 values
era5_df = pd.DataFrame(era5_values)

# Merge the ERA5 DataFrame with the original DataFrame on the 'Date' column
df1 = pd.merge(df1, era5_df, on='Date', how='left')

# Display the resulting DataFrame
print(df1)


       Longitude  Latitude  Discharge  Station_ID       Date  Elevation  \
0      36.777335 -2.392136       1.26         ST1 2000-01-01       1331   
1      36.777335 -2.392136       1.26         ST1 2000-01-01       1331   
2      36.918446 -2.171580       0.74         ST2 2000-01-01       1375   
3      36.918446 -2.171580       0.74         ST2 2000-01-01       1375   
4      36.777335 -2.392136       0.93         ST1 2000-01-02       1331   
...          ...       ...        ...         ...        ...        ...   
32987  36.918446 -2.171580       0.12         ST2 2023-07-30       1375   
32988  36.777335 -2.392136       0.30         ST1 2023-07-31       1331   
32989  36.777335 -2.392136       0.30         ST1 2023-07-31       1331   
32990  36.918446 -2.171580       0.20         ST2 2023-07-31       1375   
32991  36.918446 -2.171580       0.20         ST2 2023-07-31       1375   

          Slope    NDVI   NDVI30m     NDWI   Aridity  \
0      0.691398  0.4052  0.405517  0.24050 

In [22]:
df1.head(n=400)

Unnamed: 0,Longitude,Latitude,Discharge,Station_ID,Date,Elevation,Slope,NDVI,NDVI30m,NDWI,Aridity,leaf_area_index_high_vegetation,leaf_area_index_low_vegetation,potential_evaporation_sum,surface_net_thermal_radiation_sum,surface_pressure,temperature_2m_max,temperature_2m_min,total_precipitation_sum
0,36.777335,-2.392136,1.26,ST1,2000-01-01,1331,0.691398,0.4052,0.405517,0.24050,0.345799,3.934321,1.272563,-0.016339,-5985708,85803.772786,299.343536,289.442780,0.000360
1,36.777335,-2.392136,1.26,ST1,2000-01-01,1331,0.691398,0.4052,0.405517,0.24050,0.345799,4.926865,1.557821,-0.009311,-6610906,85278.481120,299.825958,289.735413,0.000615
2,36.918446,-2.171580,0.74,ST2,2000-01-01,1375,1.802466,0.3921,0.347359,0.19883,0.303815,3.934321,1.272563,-0.016339,-5985708,85803.772786,299.343536,289.442780,0.000360
3,36.918446,-2.171580,0.74,ST2,2000-01-01,1375,1.802466,0.3921,0.347359,0.19883,0.303815,4.926865,1.557821,-0.009311,-6610906,85278.481120,299.825958,289.735413,0.000615
4,36.777335,-2.392136,0.93,ST1,2000-01-02,1331,0.691398,0.4052,0.405517,0.24050,0.345799,3.937373,1.275747,-0.015242,-5989708,85774.884928,298.812515,288.775406,0.000324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,36.918446,-2.171580,0.63,ST2,2000-04-08,1375,1.802466,0.3921,0.347359,0.19883,0.303815,4.931259,1.618169,-0.009035,-4522228,85435.016276,300.190765,289.756348,0.002353
396,36.777335,-2.392136,1.31,ST1,2000-04-09,1331,0.691398,0.4052,0.405517,0.24050,0.345799,3.996348,1.323990,-0.011986,-3811134,85965.745280,298.860123,290.065231,0.003429
397,36.777335,-2.392136,1.31,ST1,2000-04-09,1331,0.691398,0.4052,0.405517,0.24050,0.345799,4.931508,1.621094,-0.007896,-4187028,85439.161947,299.250748,289.973434,0.002076
398,36.918446,-2.171580,1.15,ST2,2000-04-09,1375,1.802466,0.3921,0.347359,0.19883,0.303815,3.996348,1.323990,-0.011986,-3811134,85965.745280,298.860123,290.065231,0.003429


In [23]:


output_directory = '/content/drive/MyDrive/Colab Notebooks/Oningo/Dataset/'
#output_file = 'GeoCompleteSAData.csv'
output_file = 'KajiadoSuper.csv'

# Construct the full path to the output file
output_path = os.path.join(output_directory, output_file)

# Save the DataFrame to a CSV file
df1.to_csv(output_path, index=False)  # Set index=False to avoid saving the index as a column



In [120]:
# @title Add LULC
# #for Namibia station
# data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Oningo/Dataset/GeoCompleteSAData.csv')


#For Kajiado -main study area
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Oningo/Dataset/KajiadoSuper.csv')


In [121]:
# Create a dictionary to map old column names to new column names
column_mapping = {'leaf_area_index_high_vegetation': 'LAI_High', 'leaf_area_index_low_vegetation': 'LAI_Low',
                  'potential_evaporation_sum': 'PET', 'surface_net_thermal_radiation_sum':'radiation',
                  'surface_pressure':'Vpressure','temperature_2m_max':'Tmax','temperature_2m_min':'Tmin',
                  'total_precipitation_sum':'Precipitation'}

# Use the rename() method to rename columns
data.rename(columns=column_mapping, inplace=True)

# Convert temperature from Kelvin to degrees Celsius
data['Tmax(°C)'] = data['Tmax'] - 273.15
data['Tmin(°C)'] = data['Tmin'] - 273.15

# Convert precipitation from meters to millimeters
data['Precipitation(mm)'] = data['Precipitation'] * 1000

# Drop the original columns if you don't need them anymore
data.drop(['Tmax', 'Tmin', 'Precipitation'], axis=1, inplace=True)


data.head()

Unnamed: 0,Longitude,Latitude,Discharge,Station_ID,Date,Elevation,Slope,NDVI,NDVI30m,NDWI,Aridity,LAI_High,LAI_Low,PET,radiation,Vpressure,Tmax(°C),Tmin(°C),Precipitation(mm)
0,36.777335,-2.392136,1.26,ST1,2000-01-01,1331,0.691398,0.4052,0.405517,0.2405,0.345799,3.934321,1.272563,-0.016339,-5985708,85803.772786,26.193536,16.29278,0.359535
1,36.777335,-2.392136,1.26,ST1,2000-01-01,1331,0.691398,0.4052,0.405517,0.2405,0.345799,4.926865,1.557821,-0.009311,-6610906,85278.48112,26.675958,16.585413,0.614583
2,36.918446,-2.17158,0.74,ST2,2000-01-01,1375,1.802466,0.3921,0.347359,0.19883,0.303815,3.934321,1.272563,-0.016339,-5985708,85803.772786,26.193536,16.29278,0.359535
3,36.918446,-2.17158,0.74,ST2,2000-01-01,1375,1.802466,0.3921,0.347359,0.19883,0.303815,4.926865,1.557821,-0.009311,-6610906,85278.48112,26.675958,16.585413,0.614583
4,36.777335,-2.392136,0.93,ST1,2000-01-02,1331,0.691398,0.4052,0.405517,0.2405,0.345799,3.937373,1.275747,-0.015242,-5989708,85774.884928,25.662515,15.625406,0.323553


In [122]:
# @title Final Dataset
output_directory = '/content/drive/MyDrive/Colab Notebooks/Oningo/Dataset/'
# output_file = 'FinalGeoCompleteSAData.csv'
output_file = 'FinaKajiadoSuper.csv'

# Construct the full path to the output file
output_path = os.path.join(output_directory, output_file)

# Save the DataFrame to a CSV file
data.to_csv(output_path, index=False)


In [123]:
# import pandas as pd
# import ee
# import multiprocessing

# # Initialize Earth Engine
# ee.Initialize()

# # Define your MODIS land cover collection
# lulc500m = ee.ImageCollection("MODIS/061/MCD12Q1")

# # Create a cache dictionary to store fetched values
# cache = {}

# # Define a function to get the land cover value for a given date and location
# def get_lulc_value(row):
#     # Extract the year from the Date column in your DataFrame
#     year = row['Date'].year

#     # Create a unique cache key for the year and coordinates
#     cache_key = (year, row['Latitude'], row['Longitude'])

#     # Check if the value is already in the cache
#     if cache_key in cache:
#         return cache[cache_key]

#     # Create an Earth Engine date object for the specified year
#     date = ee.Date.fromYMD(year, 1, 1)

#     # Filter the MODIS land cover collection for the specified year
#     lulc_year = lulc500m.filterDate(date, date.advance(1, 'year')).first()

#     # Check if a valid image exists for the specified year
#     if lulc_year.getInfo():
#         # Get the land cover value for the given Latitude and Longitude
#         point = ee.Geometry.Point(row['Longitude'], row['Latitude'])
#         lulc_value = lulc_year.reduceRegion(
#             reducer=ee.Reducer.first(),
#             geometry=point,
#             scale=500  # Set the scale to 500 meters
#         ).get('LC_Type1').getInfo()

#         # Cache the result
#         cache[cache_key] = (year, row['Latitude'], row['Longitude'], lulc_value)

#         return (year, row['Latitude'], row['Longitude'], lulc_value)  # Return year, latitude, longitude, and lulc_value as a tuple
#     else:
#         return None  # Return None if no image is found for the year

# # Assuming 'data' is your DataFrame with Date, Latitude, and Longitude columns
# data['Date'] = pd.to_datetime(data['Date'])  # Ensure Date column is in datetime format

# # Convert the DataFrame to a list of dictionaries
# data_dict_list = data.to_dict(orient='records')

# # Create a pool of worker processes
# num_processes = multiprocessing.cpu_count()
# pool = multiprocessing.Pool(num_processes)

# # Map the get_lulc_value function to each row in parallel
# results = pool.map(get_lulc_value, data_dict_list)

# # Close the pool of worker processes
# pool.close()
# pool.join()

# # Filter out None values
# results = [result for result in results if result is not None]

# # Create a new DataFrame with the results
# result_df = pd.DataFrame(results, columns=['Year', 'Latitude', 'Longitude', 'lulc'])

# # Print the updated DataFrame with the 'lulc' column
# print(result_df)


In [124]:
# @title END!
