In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [2]:
import pandas as pd


In [3]:
from config import DATA_DIR

hazelnut_file = PROJECT_ROOT / DATA_DIR / "processed" / "hazelnut_price.csv"
weather_file = PROJECT_ROOT / DATA_DIR / "processed" / "weather_daily_1999_2025.csv"
currency_file = PROJECT_ROOT / DATA_DIR / "processed" / "usd_tl.csv"
trends_file = PROJECT_ROOT / DATA_DIR / "processed" / "trends_daily.csv"
features_file = PROJECT_ROOT / DATA_DIR / "processed" / "features.csv"
ml_file = PROJECT_ROOT / DATA_DIR / "processed" / "features_for_ml.csv"

In [4]:
# Load processed datasets
hazelnut = pd.read_csv(hazelnut_file, parse_dates=["Date"])
weather = pd.read_csv(weather_file, parse_dates=["date"])
currency = pd.read_csv(currency_file, parse_dates=["Date"])
trends = pd.read_csv(trends_file, parse_dates=["date"])

In [5]:
hazelnut.describe()

Unnamed: 0,Date,Value,iValue,rolling_14d,AdjustedValue,iAdjusted,rolling_14d_adjusted,USD_TL
count,9862,3777.0,8955.0,9128.0,3777.0,8955.0,9128.0,9862.0
mean,2012-07-01 12:00:00,16.579554,22.9271,22.525604,2.727991,2.786247,2.784686,6.343277
min,1999-01-01 00:00:00,0.536056,0.536056,0.587859,0.690343,0.690343,0.730755,0.31522
25%,2005-10-01 06:00:00,2.42,2.907,2.943941,1.797479,1.955942,1.946889,1.414143
50%,2012-07-01 12:00:00,4.62,5.11,5.321429,2.665042,2.721972,2.721873,1.798403
75%,2019-04-01 18:00:00,12.75,16.653333,16.586964,3.24819,3.301866,3.304607,5.698383
max,2025-12-31 00:00:00,328.31,328.31,323.232619,7.89556,7.89556,7.769791,42.9395
std,,37.820052,46.624648,45.760564,1.228727,1.21468,1.205156,9.955254


In [6]:
weather.describe()

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,wind_speed_10m_max,et0_fao_evapotranspiration
count,9862,9862.0,9862.0,9862.0,9862.0,9862.0,9862.0,9862.0,9862.0
mean,2012-07-01 12:00:00,17.466234,10.938674,17.583371,9.815747,3.393348,3.091199,13.893977,2.444288
min,1999-01-01 00:00:00,-2.8,-10.8,-8.3,-15.4,0.0,0.0,4.3,0.18
25%,2005-10-01 06:00:00,12.4,5.7,10.4,2.8,0.0,0.0,10.7,1.3
50%,2012-07-01 12:00:00,18.0,10.8,17.8,9.2,0.3,0.3,12.85,2.25
75%,2019-04-01 18:00:00,23.2,16.8,25.5,17.5,3.9,3.5,16.0,3.5
max,2025-12-31 00:00:00,35.8,25.5,40.7,30.7,78.7,78.7,53.7,6.74
std,,6.950824,6.91028,9.366634,8.99672,6.416817,6.04154,4.755491,1.343506


In [7]:
currency.describe()

Unnamed: 0,Date,USD_TL
count,9862,9862.0
mean,2012-07-01 12:00:00,6.343277
min,1999-01-01 00:00:00,0.31522
25%,2005-10-01 06:00:00,1.414143
50%,2012-07-01 12:00:00,1.798403
75%,2019-04-01 18:00:00,5.698383
max,2025-12-31 00:00:00,42.9395
std,,9.955254


In [8]:
trends.describe()

Unnamed: 0,date,trend
count,5510,5510.0
mean,2018-06-16 12:00:00,64.51364
min,2010-12-01 00:00:00,0.0
25%,2014-09-08 06:00:00,26.315634
50%,2018-06-16 12:00:00,47.909007
75%,2022-03-24 18:00:00,81.671113
max,2025-12-31 00:00:00,1087.689728
std,,68.207681


In [9]:
# Define lag periods (days)
lag_days = [30, 60, 90, 120, 180]

weather_features = ['temperature_2m_max', 'temperature_2m_min', 
                    'precipitation_sum', 'rain_sum', 'wind_speed_10m_max', 'et0_fao_evapotranspiration']

for feature in weather_features:
    for lag in lag_days:
        col_name = f"{feature}_{lag}d_lag"
        weather[col_name] = weather[feature].rolling(lag).mean().shift(lag)

In [None]:
# Define seasonal windows
def assign_season(month):
    if month in [3, 4, 5]:  # Spring: flowering
        return "spring"
    elif month in [6, 7, 8]:  # Nut development
        return "development"
    elif month in [9, 10]:  # Harvest
        return "harvest"
    else:
        return "off_season"

weather['season'] = weather['date'].dt.month.apply(assign_season)

# Aggregate seasonal averages/sums
seasonal_aggs = weather.groupby('season').agg({
    'temperature_2m_max': 'mean',
    'precipitation_sum': 'sum',
    'et0_fao_evapotranspiration': 'sum'
}).rename(columns={
    'temperature_2m_max': 'season_temp_max_avg',
    'precipitation_sum': 'season_precip_sum',
    'et0_fao_evapotranspiration': 'season_et0_sum'
})

print(seasonal_aggs)

             season_temp_max_avg  season_precip_sum  season_et0_sum
season                                                             
development            24.728945             6918.6         9680.45
harvest                21.149059             7572.9         3788.98
off_season             11.669510            11400.7         4104.89
spring                 15.338929             7573.0         6531.25


In [11]:
# Flagging the extremities  
weather['heatwave_flag'] = (weather['temperature_2m_max'] > 35).astype(int)
weather['coldwave_flag'] = (weather['temperature_2m_min'] < 0).astype(int)
weather['heavy_rain_flag'] = (weather['precipitation_sum'] > 50).astype(int)

In [None]:
# Adding features
hazelnut['price_available'] = (~hazelnut['rolling_14d_adjusted'].isna()).astype(int)
weather['temp_missing'] = weather['temperature_2m_max'].isna().astype(int)
trends['trends_missing'] = trends['trend'].isna().astype(int)
trends['is_trend_zero'] = (trends['trend'] == 0).astype(int) 

In [None]:
weather.rename(columns={"date": "Date"}, inplace=True)
trends.rename(columns={"date": "Date"}, inplace=True)

In [None]:
df = hazelnut[['Date','rolling_14d_adjusted','price_available']].merge(weather, on='Date', how='left')
df = df.merge(currency[['Date','USD_TL']], on='Date', how='left')
df = df.merge(trends, on='Date', how='left')

In [15]:
df.describe()

Unnamed: 0,Date,rolling_14d_adjusted,price_available,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,wind_speed_10m_max,...,et0_fao_evapotranspiration_120d_lag,et0_fao_evapotranspiration_180d_lag,heatwave_flag,coldwave_flag,heavy_rain_flag,temp_missing,USD_TL,trend,trends_missing,is_trend_zero
count,9862,9128.0,9862.0,9862.0,9862.0,9862.0,9862.0,9862.0,9862.0,9862.0,...,9623.0,9503.0,9862.0,9862.0,9862.0,9862.0,9862.0,5510.0,5510.0,5510.0
mean,2012-07-01 12:00:00,2.784686,0.925573,17.466234,10.938674,17.583371,9.815747,3.393348,3.091199,13.893977,...,2.444704,2.436482,0.000101,0.051916,0.000811,0.0,6.343277,64.51364,0.0,0.186025
min,1999-01-01 00:00:00,0.730755,0.0,-2.8,-10.8,-8.3,-15.4,0.0,0.0,4.3,...,1.011083,1.246722,0.0,0.0,0.0,0.0,0.31522,0.0,0.0,0.0
25%,2005-10-01 06:00:00,1.946889,1.0,12.4,5.7,10.4,2.8,0.0,0.0,10.7,...,1.566833,1.760167,0.0,0.0,0.0,0.0,1.414143,26.315634,0.0,0.0
50%,2012-07-01 12:00:00,2.721873,1.0,18.0,10.8,17.8,9.2,0.3,0.3,12.85,...,2.389167,2.433278,0.0,0.0,0.0,0.0,1.798403,47.909007,0.0,0.0
75%,2019-04-01 18:00:00,3.304607,1.0,23.2,16.8,25.5,17.5,3.9,3.5,16.0,...,3.311042,3.102917,0.0,0.0,0.0,0.0,5.698383,81.671113,0.0,0.0
max,2025-12-31 00:00:00,7.769791,1.0,35.8,25.5,40.7,30.7,78.7,78.7,53.7,...,4.290583,3.861833,1.0,1.0,1.0,0.0,42.9395,1087.689728,0.0,1.0
std,,1.205156,0.262478,6.950824,6.91028,9.366634,8.99672,6.416817,6.04154,4.755491,...,0.897511,0.696568,0.01007,0.22187,0.028471,0.0,9.955254,68.207681,0.0,0.389162


In [16]:
df.columns

Index(['Date', 'rolling_14d_adjusted', 'price_available', 'temperature_2m_max',
       'temperature_2m_min', 'apparent_temperature_max',
       'apparent_temperature_min', 'precipitation_sum', 'rain_sum',
       'wind_speed_10m_max', 'et0_fao_evapotranspiration',
       'temperature_2m_max_30d_lag', 'temperature_2m_max_60d_lag',
       'temperature_2m_max_90d_lag', 'temperature_2m_max_120d_lag',
       'temperature_2m_max_180d_lag', 'temperature_2m_min_30d_lag',
       'temperature_2m_min_60d_lag', 'temperature_2m_min_90d_lag',
       'temperature_2m_min_120d_lag', 'temperature_2m_min_180d_lag',
       'precipitation_sum_30d_lag', 'precipitation_sum_60d_lag',
       'precipitation_sum_90d_lag', 'precipitation_sum_120d_lag',
       'precipitation_sum_180d_lag', 'rain_sum_30d_lag', 'rain_sum_60d_lag',
       'rain_sum_90d_lag', 'rain_sum_120d_lag', 'rain_sum_180d_lag',
       'wind_speed_10m_max_30d_lag', 'wind_speed_10m_max_60d_lag',
       'wind_speed_10m_max_90d_lag', 'wind_speed_10m_

In [17]:
df.drop(columns=['apparent_temperature_max',
       'apparent_temperature_min'], inplace=True)
df.columns

Index(['Date', 'rolling_14d_adjusted', 'price_available', 'temperature_2m_max',
       'temperature_2m_min', 'precipitation_sum', 'rain_sum',
       'wind_speed_10m_max', 'et0_fao_evapotranspiration',
       'temperature_2m_max_30d_lag', 'temperature_2m_max_60d_lag',
       'temperature_2m_max_90d_lag', 'temperature_2m_max_120d_lag',
       'temperature_2m_max_180d_lag', 'temperature_2m_min_30d_lag',
       'temperature_2m_min_60d_lag', 'temperature_2m_min_90d_lag',
       'temperature_2m_min_120d_lag', 'temperature_2m_min_180d_lag',
       'precipitation_sum_30d_lag', 'precipitation_sum_60d_lag',
       'precipitation_sum_90d_lag', 'precipitation_sum_120d_lag',
       'precipitation_sum_180d_lag', 'rain_sum_30d_lag', 'rain_sum_60d_lag',
       'rain_sum_90d_lag', 'rain_sum_120d_lag', 'rain_sum_180d_lag',
       'wind_speed_10m_max_30d_lag', 'wind_speed_10m_max_60d_lag',
       'wind_speed_10m_max_90d_lag', 'wind_speed_10m_max_120d_lag',
       'wind_speed_10m_max_180d_lag', 'et0_fao_e

In [18]:
df.to_csv(features_file, index=False)

In [19]:
df = pd.get_dummies(df, columns=['season'], prefix='season')

In [20]:
df.shape

(9862, 51)

In [21]:
ml_df = df.dropna(subset=['rolling_14d_adjusted'] + [c for c in df.columns if '_lag' in c])

In [22]:
ml_df.shape

(8769, 51)

In [23]:
ml_df.to_csv(ml_file, index=False)

In [25]:
ml_df.columns

Index(['Date', 'rolling_14d_adjusted', 'price_available', 'temperature_2m_max',
       'temperature_2m_min', 'precipitation_sum', 'rain_sum',
       'wind_speed_10m_max', 'et0_fao_evapotranspiration',
       'temperature_2m_max_30d_lag', 'temperature_2m_max_60d_lag',
       'temperature_2m_max_90d_lag', 'temperature_2m_max_120d_lag',
       'temperature_2m_max_180d_lag', 'temperature_2m_min_30d_lag',
       'temperature_2m_min_60d_lag', 'temperature_2m_min_90d_lag',
       'temperature_2m_min_120d_lag', 'temperature_2m_min_180d_lag',
       'precipitation_sum_30d_lag', 'precipitation_sum_60d_lag',
       'precipitation_sum_90d_lag', 'precipitation_sum_120d_lag',
       'precipitation_sum_180d_lag', 'rain_sum_30d_lag', 'rain_sum_60d_lag',
       'rain_sum_90d_lag', 'rain_sum_120d_lag', 'rain_sum_180d_lag',
       'wind_speed_10m_max_30d_lag', 'wind_speed_10m_max_60d_lag',
       'wind_speed_10m_max_90d_lag', 'wind_speed_10m_max_120d_lag',
       'wind_speed_10m_max_180d_lag', 'et0_fao_e