# Climate Data Preprocessing

## Download data

In [2]:
import os
import glob

def unpack_data():

    data_dir = '../data/climate'

# Find all .tar.gz files in the data directory
    tar_files = glob.glob(os.path.join(data_dir, '*.zip'))

# Unpack and remove each .tar.gz file
    for tar_file in tar_files:
        os.system(f'tar -xf {tar_file} -C {data_dir}')
        os.remove(tar_file)

In [3]:
import cdsapi

def data_download(year):
    
    c = cdsapi.Client()
    c.retrieve(
        'insitu-gridded-observations-global-and-regional',
        {
            'origin': 'cru',
            'region': 'global',
            'variable': [
                'precipitation', 'temperature',
            ],
            'statistic': [
                'maximum', 'mean',
            ],
            'time_aggregation': 'monthly',
            'horizontal_aggregation': '0_5_x_0_5',
            'year': f'{year}',
            'version': 'v4.03',
            'format': 'zip',
        },
        f'../data/climate/climate_{year}.zip')
       
    return unpack_data()

In [4]:
start_year = 2001
end_year = 2019

for i in range(start_year, end_year + 1):
    data_download(i)

2023-06-08 10:05:50,623 INFO Welcome to the CDS
2023-06-08 10:05:50,624 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/insitu-gridded-observations-global-and-regional
2023-06-08 10:05:50,847 INFO Request is completed
2023-06-08 10:05:50,849 INFO Downloading https://download-0017.copernicus-climate.eu/cache-compute-0017/cache/data8/dataset-insitu-gridded-observations-global-and-regional-212601fb-200a-406a-8813-133972b4a848.zip to ../data/climate/climate_2001.zip (3.9M)
2023-06-08 10:05:51,407 INFO Download rate 7M/s     
2023-06-08 10:05:51,688 INFO Welcome to the CDS
2023-06-08 10:05:51,689 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/insitu-gridded-observations-global-and-regional
2023-06-08 10:05:51,897 INFO Request is completed
2023-06-08 10:05:51,898 INFO Downloading https://download-0006-clone.copernicus-climate.eu/cache-compute-0006/cache/data8/dataset-insitu-gridded-observations-global-and-regional-3de20bff-d7c6-48ff-adba

## Select data

In [5]:
import xarray as xr
import pandas as pd

lat_low = 32.75
lat_high = 52
lon_low = -10.25
lon_high = 50

def climate_preprocessing(file_path):
    frame = xr.open_dataset(file_path, engine='netcdf4').to_dataframe()
    frame.reset_index(inplace=True)
    frame['month'] = frame['time'].dt.month
    frame['year'] = frame['time'].dt.year
    frame.drop('time', inplace=True, axis=1)  
    frame = frame[(frame['lat']>=lat_low) & (frame['lat']<=lat_high) & (frame['lon']>=lon_low) & (frame['lon']<=lon_high)]
    return frame

In [6]:
def merge_years(climate_var, data_dir):
    dfs = []
    for file_name in os.listdir(data_dir):
        if file_name.endswith('.nc') and climate_var in file_name:
            file_path = os.path.join(data_dir, file_name)
            df = climate_preprocessing(file_path)
            dfs.append(df)
    final_df = pd.concat(dfs, ignore_index=True)
    return final_df

In [7]:
data_dir = '../data/climate'

df_mean_temperature = merge_years('CRU_mean_temperature', data_dir)
df_maximum_temperature = merge_years('CRU_maximum_temperature', data_dir)
df_total_precipitation = merge_years('CRU_total_precipitation', data_dir)

df_all = df_mean_temperature.copy()
df_all.drop('tas', inplace=True, axis=1)
df_all = pd.merge(df_all, df_mean_temperature, on=['lon','lat','month','year'])
df_all = pd.merge(df_all, df_maximum_temperature, on=['lon','lat','month','year'])
df_all = pd.merge(df_all, df_total_precipitation, on=['lon','lat','month','year'])

df_all

Unnamed: 0,lon,lat,month,year,tas,tasmax,pr
0,-10.25,32.75,1,2007,,,
1,-10.25,33.25,1,2007,,,
2,-10.25,33.75,1,2007,,,
3,-10.25,34.25,1,2007,,,
4,-10.25,34.75,1,2007,,,
...,...,...,...,...,...,...,...
1075927,49.75,49.75,12,2015,-2.0,0.9,37.100002
1075928,49.75,50.25,12,2015,-2.4,0.5,40.799999
1075929,49.75,50.75,12,2015,-2.2,0.7,44.799999
1075930,49.75,51.25,12,2015,-2.5,0.2,50.200001


## Create columns by month

In [8]:
pd.options.mode.chained_assignment = None
pd.options.display.float_format = "{:,.2f}".format

months_number = 4
first_month = 5

data_per_month = []

for i in range(first_month,first_month+months_number):
    frame = df_all[(df_all['month']==i)]
    frame.drop('month', inplace=True, axis=1)
    data_per_month.append(frame)
    
df_all_months = data_per_month[0].copy()

for i in range(0,months_number):
    df_all_months = pd.merge(df_all_months, data_per_month[i], on=['lon','lat','year'], suffixes=('', f'_{i+5}'))

df_all_months.drop(['tas','tasmax','pr'], inplace=True, axis=1)
df_all_months

Unnamed: 0,lon,lat,year,tas_5,tasmax_5,pr_5,tas_6,tasmax_6,pr_6,tas_7,tasmax_7,pr_7,tas_8,tasmax_8,pr_8
0,-10.25,32.75,2007,,,,,,,,,,,,
1,-10.25,33.25,2007,,,,,,,,,,,,
2,-10.25,33.75,2007,,,,,,,,,,,,
3,-10.25,34.25,2007,,,,,,,,,,,,
4,-10.25,34.75,2007,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89656,49.75,49.75,2015,18.60,24.40,21.40,26.30,33.20,15.90,24.80,31.20,18.00,22.90,30.40,9.70
89657,49.75,50.25,2015,18.40,24.20,23.70,26.10,33.00,19.40,24.40,30.70,21.20,22.60,29.90,11.60
89658,49.75,50.75,2015,18.40,24.10,28.80,25.70,32.40,22.00,23.70,29.90,25.50,21.80,28.90,13.40
89659,49.75,51.25,2015,17.80,23.60,32.70,24.90,31.60,25.50,22.60,28.60,32.00,20.80,27.60,15.90


## Increase spatial resolution

In [9]:
df_all_months['constant'] = 0.25
df_all_bis = df_all_months.copy()
df_all_bis['lon'] = df_all_bis['lon'] + df_all_bis['constant']
df_all_tris = df_all_months.copy()
df_all_tris['lat'] = df_all_tris['lat'] + df_all_tris['constant']
df_all_quatris = df_all_months.copy()
df_all_quatris['lon'] = df_all_quatris['lon'] + df_all_tris['constant']
df_all_quatris['lat'] = df_all_quatris['lat'] + df_all_tris['constant']

dfs = [df_all_months, df_all_bis, df_all_tris, df_all_quatris]

df_final = pd.concat(dfs, ignore_index=True)
df_final = df_final.drop(['constant'], axis=1)
df_final = df_final[df_final.lon != lon_low]
df_final = df_final[df_final.lat != lat_low]
df_final = df_final.sort_values(['year','lon','lat'])
df_final = df_final.reset_index()
df_final = df_final.drop(['index'], axis=1)
df_final

Unnamed: 0,lon,lat,year,tas_5,tasmax_5,pr_5,tas_6,tasmax_6,pr_6,tas_7,tasmax_7,pr_7,tas_8,tasmax_8,pr_8
0,-10.00,33.00,2001,,,,,,,,,,,,
1,-10.00,33.25,2001,,,,,,,,,,,,
2,-10.00,33.50,2001,,,,,,,,,,,,
3,-10.00,33.75,2001,,,,,,,,,,,,
4,-10.00,34.00,2001,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352578,50.00,51.00,2019,19.90,26.30,24.60,24.00,31.30,19.60,23.60,30.20,35.00,20.80,27.30,18.10
352579,50.00,51.25,2019,19.20,25.60,27.90,22.90,30.10,21.70,22.80,29.20,37.50,19.90,26.10,24.50
352580,50.00,51.50,2019,19.20,25.60,27.90,22.90,30.10,21.70,22.80,29.20,37.50,19.90,26.10,24.50
352581,50.00,51.75,2019,18.80,25.20,29.50,22.40,29.30,22.80,22.40,28.60,40.50,19.60,25.50,30.20


## Save data in CSV file

In [10]:
df_final.to_csv('../data/climate.csv')

In [11]:
print('test')

test
