In [2]:
import sys
import os
# Put the parent directory on the import path so packages that live next to
# this notebook's folder can be imported. The membership check keeps a re-run
# of this cell from appending the same entry again.
if os.path.abspath('..') not in sys.path:
    sys.path.append(os.path.abspath('..'))

In [10]:
import json
import numpy as np
import pandas as pd

In [8]:
# Directory that holds the raw climate files, one file per location.
folder_path = '../data/raw/climate'

# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the combined data reproducible across machines and runs. Hidden
# entries (e.g. '.DS_Store') are skipped because they are not data files.
file_names = sorted(
    name for name in os.listdir(folder_path) if not name.startswith('.')
)


In [15]:
# Read every climate file and tag its rows with a location code taken from the
# file name: the third '_'-separated token of the name without its extension
# (assumes names shaped like '<a>_<b>_<location>.csv' -- TODO confirm).
dfs = []
for file in file_names:
    # splitext removes the extension whatever its length, unlike a fixed
    # [:-4] slice.
    loc = os.path.splitext(file)[0].split('_')[2]
    location_df = pd.read_csv(os.path.join(folder_path, file))
    location_df['Location'] = loc
    dfs.append(location_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(dfs, ignore_index=True)

In [16]:
# Remove columns that the later cells do not use. Reassigning (instead of
# inplace=True) and passing errors='ignore' makes this cell safe to re-run:
# columns that are already gone are simply skipped rather than raising KeyError.
df = df.drop(columns=['Unnamed: 0', '0', 'weather_code'], errors='ignore')

In [18]:
# Numeric climate columns that are rescaled by the scaler below and returned,
# in this order, by get_climate_series.
columns_to_normalize = [
    'temperature_2m_max', 'temperature_2m_min',
    'precipitation_sum', 'rain_sum',
    'shortwave_radiation_sum', 'et0_fao_evapotranspiration',
    'sunshine_duration', 'precipitation_hours',
    'wind_speed_10m_max',
]


In [19]:
# Scale into an independent (deep) copy so the unscaled values in `df` remain
# available.
df_normalized = df.copy(deep=True)

In [24]:
from sklearn.preprocessing import RobustScaler
from joblib import dump

In [22]:
# Fit the scaler on the selected columns of the unscaled frame (fit returns the
# scaler itself), then overwrite those columns in the copy with the scaled
# values.
scaler = RobustScaler().fit(df[columns_to_normalize])
df_normalized[columns_to_normalize] = scaler.transform(df[columns_to_normalize])

In [25]:
# Persist the fitted scaler next to the notebook (path is relative to the
# current working directory) so the same scaling can be re-applied later.
dump(value=scaler, filename='robust_scaler.joblib')

['robust_scaler.joblib']

In [27]:
# Write the scaled data for all locations to a single CSV; the row index is
# not written.
df_normalized.to_csv(
    '../data/processed/all_locations_normalized.csv',
    index=False,
)

In [28]:
import src.load as load

In [30]:
import src.load as load

# Load the processed dataset through the project's loader, which returns four
# objects unpacked here.
# NOTE(review): their exact contents are defined in src.load, not in this
# notebook -- `meta` is presumably the per-row table later passed to
# get_climate_series; confirm.
(meta, y, mappings, reverse_mappings) = load.load_data('../data/processed/')



In [40]:
# Parse the 'date' column as datetimes and drop any timezone information so it
# can be compared with timezone-naive timestamps. Bracket access is used so the
# column is addressed explicitly rather than via attribute lookup.
df_normalized['date'] = pd.to_datetime(df_normalized['date']).dt.tz_localize(None)

In [41]:
def get_climate_series(row, climate_df=None, columns=None, n_days=100):
    """Return the climate values recorded after a row's transplant date.

    Selects the rows of the climate frame whose ``Location`` equals the row's
    ``ProducerCode`` and whose ``date`` falls in the half-open window
    ``[TransplantDate, TransplantDate + n_days days)``.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``'TransplantDate'`` and ``'ProducerCode'``.
        climate_df: Frame with ``'Location'`` and ``'date'`` columns plus the
            requested value columns. Defaults to the notebook-level
            ``df_normalized``.
        columns: Names of the value columns to return. Defaults to the
            notebook-level ``columns_to_normalize``.
        n_days: Length of the window in days. Defaults to 100.

    Returns:
        A list with one inner list per matching climate row, each holding the
        values of ``columns`` in order. The list is empty when no climate row
        matches the location and window. Rows keep the order they have in the
        climate frame.
    """
    # Fall back to the notebook globals only when no explicit input is given,
    # so existing single-argument calls keep working unchanged.
    if climate_df is None:
        climate_df = df_normalized
    if columns is None:
        columns = columns_to_normalize

    location = row['ProducerCode']
    transplant_date = pd.to_datetime(row['TransplantDate'])

    # The notebook strips timezone information from the 'date' column; do the
    # same here, because pandas refuses to compare tz-aware timestamps with
    # tz-naive datetimes.
    if getattr(transplant_date, 'tzinfo', None) is not None:
        transplant_date = transplant_date.tz_localize(None)

    window_end = transplant_date + pd.Timedelta(days=n_days)

    # Restrict to the requested location first, then to the date window.
    location_data = climate_df[climate_df['Location'] == location]
    in_window = (location_data['date'] >= transplant_date) & (
        location_data['date'] < window_end
    )
    climate_series = location_data[in_window]

    return climate_series[columns].values.tolist()
