In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import xarray as xr
np.warnings.filterwarnings('ignore')
def round_to_nearest_025(value):
    """Snap ``value`` (scalar or array) to the closest multiple of 0.25.

    Rounding is delegated to ``np.round``, so exact halfway cases go to the
    even multiple (e.g. 0.125 -> 0.0, 0.375 -> 0.5).
    """
    step = 0.25
    whole_steps = np.round(value / step)
    return whole_steps * step

def area_preprocessing(file_path):
    """Load one land-cover NetCDF file and flatten the study area into a table.

    Steps, in order:
      1. snap the ``lat`` / ``lon`` coordinates to multiples of 0.25;
      2. squeeze out the ``time`` dimension (it must have length 1);
      3. crop to the label range lat 52..33, lon -10..50;
      4. keep a fixed subset of variables;
      5. keep only the first cell for every repeated snapped ``lat`` / ``lon``;
      6. convert to a flat DataFrame and drop the ``bounds`` column.

    Parameters
    ----------
    file_path : str or path-like
        Path of a file readable by xarray's ``netcdf4`` engine.

    Returns
    -------
    pandas.DataFrame
        One row per remaining grid entry, with the former index levels turned
        into ordinary columns and the ``bounds`` column removed.
    """
    # Open through a context manager so the underlying file handle is released
    # even if one of the steps below raises; the original never closed it.
    with xr.open_dataset(file_path, engine='netcdf4') as area_ds:
        area_ds['lat'] = xr.apply_ufunc(round_to_nearest_025, area_ds['lat'])
        area_ds['lon'] = xr.apply_ufunc(round_to_nearest_025, area_ds['lon'])

        filtered_data = area_ds.squeeze('time')
        # NOTE(review): the lat slice runs high -> low, which presumably matches
        # a descending lat coordinate in these files; confirm against the data.
        filtered_data = filtered_data.sel(lat=slice(52, 33), lon=slice(-10, 50))
        filtered_data = filtered_data[['lat_bounds',
                                       'lon_bounds',
                                       'processed_flag',
                                       'lccs_class',
                                       'current_pixel_state',
                                       'observation_count',
                                       'change_count'
                                      ]]
        # Snapping makes many coordinate labels identical; keep the first of each
        filtered_data = filtered_data.drop_duplicates(dim=['lat', 'lon'])

        # Build the DataFrame while the file is still open: the variables may
        # still be lazily backed by the file until this point.
        area_df = filtered_data.to_dataframe().reset_index()

    area_df = area_df.drop(['bounds'], axis=1)

    return area_df


folder_path = '..../fireproject/dataset_landcover'  # Replace with your folder path

# Every file found anywhere below folder_path is treated as an input dataset
file_paths = [
    os.path.join(root, file_name)
    for root, _dirs, files in os.walk(folder_path)
    for file_name in files
]

# os.walk silently yields nothing for a missing or empty folder; fail here with
# a clear message instead of a later KeyError on the 'time' column.
if not file_paths:
    raise FileNotFoundError(f"No input files found under {folder_path!r}")

# Preprocess each file, then concatenate once. DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop copies all previous rows on
# every iteration. ignore_index=True gives a fresh 0..n-1 index.
combined_df = pd.concat(
    [area_preprocessing(file_path) for file_path in file_paths],
    ignore_index=True,
)

# Reduce the 'time' column to the calendar year
combined_df['time'] = pd.to_datetime(combined_df['time']).dt.year

# Sort the rows by the 'time' column in ascending order and renumber the index
combined_df_sorted = (
    combined_df
    .sort_values('time', ascending=True)
    .reset_index(drop=True)
)

# Last expression of the cell: rendered as the cell output
combined_df_sorted

Unnamed: 0,lat,lon,lat_bounds,lon_bounds,processed_flag,lccs_class,current_pixel_state,observation_count,change_count,time
0,52.00,-10.00,52.125000,-10.125000,1.0,130,1.0,119,0,2001
1,39.25,29.50,39.375000,29.375000,1.0,100,1.0,333,0,2001
2,39.25,29.75,39.375000,29.625000,1.0,100,1.0,302,1,2001
3,39.25,30.00,39.375000,29.875000,1.0,70,1.0,276,0,2001
4,39.25,30.25,39.375000,30.125000,1.0,10,1.0,332,0,2001
...,...,...,...,...,...,...,...,...,...,...
742275,39.25,29.50,39.375000,29.375000,1.0,100,1.0,333,0,2020
742276,39.25,29.25,39.375000,29.125000,1.0,70,1.0,250,0,2020
742277,39.25,29.00,39.375000,28.875000,1.0,70,1.0,271,0,2020
742278,39.25,42.00,39.375000,41.875000,1.0,10,1.0,325,0,2020


In [2]:
# Write the year-sorted table to a CSV file in the working directory,
# leaving the DataFrame index out of the file.
combined_df_sorted.to_csv(
    path_or_buf='satellite_landcover_dataset.csv',
    index=False,
)

In [3]:
# Show the last ten rows of the year-sorted table
combined_df_sorted.tail(n=10)

Unnamed: 0,lat,lon,lat_bounds,lon_bounds,processed_flag,lccs_class,current_pixel_state,observation_count,change_count,time
742270,39.25,30.75,39.375,30.625,1.0,100,1.0,310,0,2020
742271,39.25,30.5,39.375,30.375,1.0,70,1.0,277,1,2020
742272,39.25,30.25,39.375,30.125,1.0,10,1.0,332,0,2020
742273,39.25,30.0,39.375,29.875,1.0,70,1.0,276,0,2020
742274,39.25,29.75,39.375,29.625,1.0,100,1.0,302,1,2020
742275,39.25,29.5,39.375,29.375,1.0,100,1.0,333,0,2020
742276,39.25,29.25,39.375,29.125,1.0,70,1.0,250,0,2020
742277,39.25,29.0,39.375,28.875,1.0,70,1.0,271,0,2020
742278,39.25,42.0,39.375,41.875,1.0,10,1.0,325,0,2020
742279,37.75,19.75,37.872222,19.627778,1.0,210,0.0,0,0,2020
