## Importing libraries 

In [187]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
from xgboost import XGBRegressor
from feature_engine.transformation import LogTransformer
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import tensorflow as tf
pd.set_option("display.max_column", 999)
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

## Reading Data

In [188]:
df_train = pd.read_csv("GDZ_2024/train.csv")
df_test = pd.read_csv("GDZ_2024/test.csv")
df_sub = pd.read_csv("GDZ_2024/sample_submission.csv")
df_holidays = pd.read_csv("GDZ_2024/holidays.csv")
df_weather = pd.read_csv("GDZ_2024/weather.csv")

In [189]:
df_train.head()

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum
0,2021-01-01,izmir-aliaga,5,0
1,2021-01-02,izmir-aliaga,13,0
2,2021-01-03,izmir-aliaga,4,0
3,2021-01-04,izmir-aliaga,9,0
4,2021-01-05,izmir-aliaga,2,0


## Fill missing date

In [190]:
ilce_data = []
for ilce in df_train["ilce"].unique():
    ilce_dates = pd.date_range(start='2021-01-01', end='2024-01-03', freq='D')
    state_df = pd.DataFrame({
        'tarih': ilce_dates,
        'ilce': [ilce] * len(ilce_dates)

    })
    ilce_data.append(state_df)
    
merged_ilce_data = pd.concat(ilce_data)   
df_train["tarih"] = pd.to_datetime(df_train["tarih"])
df_train = pd.merge(df_train, merged_ilce_data, on=['ilce', 'tarih'], how='outer')
df_train = df_train.fillna(0)

In [191]:
df_train["bildirimsiz_sum"] = replace_with_thresholds(df_train, 'ilce', 'bildirimsiz_sum')

## concat train and test sets

In [192]:
df = pd.concat([df_train, df_test], ignore_index=True)

## Known features

In [193]:
df["tarih"] = pd.to_datetime(df["tarih"])
df["day"] = df["tarih"].dt.strftime("%A")
df["month"] = df["tarih"].dt.month
df["year"] = df["tarih"].dt.year
df["is_weekend"] = "No"
df.loc[(df["day"] == "Saturday"), "is_weekend"] = "Yes"
df.loc[(df["day"] == "Sunday"), "is_weekend"] = "Yes"
df["time_since"] = (df["tarih"]-df["tarih"].min()).dt.days
df["time_since_pow_2"] = df["time_since"]**2
df['week_of_year'] = df["tarih"].apply(lambda x: x.weekofyear)
df['quarter'] = df["tarih"].apply(lambda x: x.quarter)
df["month"] = df["month"].astype(str)
df["week_of_year"] = df["week_of_year"].astype(str)
df["quarter"] = df["quarter"].astype(str)

### Holidays feature

In [194]:
df_holidays['tarih'] = pd.to_datetime(df_holidays["Yıl"].astype(str) + '/' +df_holidays["Ay"].astype(str) + '/'+ df_holidays["Gün"].astype(str))
df = pd.merge(df, df_holidays, on='tarih', how='left')
df.drop(["Yıl", "Ay", "Gün"], axis=1, inplace=True)
df = df.rename(columns={'Tatil Adı': 'holidays'})
df["holidays"].fillna("No", inplace=True)
def holiday(text):
    if text=="No":
        return "No"
    else:
        return "Yes"
df["holidays"] = df["holidays"].apply(holiday)
df.reset_index(inplace=True, drop=True)

### Season feature

In [195]:
def add_season(num):
    if num in [3 ,4, 5]:
        return "Spring"
    elif num in [6, 7, 8]:
        return "Summer"
    elif num in [9, 10, 11]:
        return "Fall"
    else:
        return "winter"
    
df["season"] = df["month"].apply(add_season)

### The weather features

In [196]:
df_weather.rename(columns={'name': 'ilce'}, inplace=True)
df_weather["ilce"] = df_weather["ilce"].str.lower()
df_weather["date"] = pd.to_datetime(df_weather["date"])
df_weather["tarih"] = df_weather["date"].dt.date
daily_avg_data = df_weather.groupby(['ilce', 'tarih']).mean().reset_index()
daily_avg_data["tarih"] = pd.to_datetime(daily_avg_data["tarih"])
df = pd.merge(df, daily_avg_data, on=["ilce", "tarih"], how='left')
df.drop("date", axis=1, inplace=True)
df["prob_tc"] = df["prob_precip_1h:p"] * df["t_2m:C"]
df['apparent_humidity'] = df['relative_humidity_2m:p'] * np.exp((17.27 * df['t_2m:C']) / (237.7 + df['t_2m:C']))

## Unkow features

### lag and windows feature

In [197]:
lag = [29, 30 , 31, 36, 60, 90, 180, 365 ] # 395,515, 575
for i in lag:
    col_name = f'lag{i}'
    df[col_name] = df.groupby(['ilce'])['bildirimsiz_sum'].shift(i)
    df[col_name] = df[col_name].bfill()


  df[col_name] = df[col_name].fillna(method='bfill')
  df[col_name] = df[col_name].fillna(method='bfill')
  df[col_name] = df[col_name].fillna(method='bfill')
  df[col_name] = df[col_name].fillna(method='bfill')
  df[col_name] = df[col_name].fillna(method='bfill')
  df[col_name] = df[col_name].fillna(method='bfill')
  df[col_name] = df[col_name].fillna(method='bfill')
  df[col_name] = df[col_name].fillna(method='bfill')


In [198]:
df.tail()

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum,day,month,year,is_weekend,time_since,time_since_pow_2,week_of_year,quarter,holidays,season,lat,lon,t_2m:C,effective_cloud_cover:p,global_rad:W,relative_humidity_2m:p,wind_dir_10m:d,wind_speed_10m:ms,prob_precip_1h:p,t_apparent:C,prob_tc,apparent_humidity,lag29,lag30,lag31,lag36,lag60,lag90,lag180,lag365
54168,2024-02-29,izmir-beydag,,0.0,Thursday,2,2024,No,1154,1331716,9,1,Yes,winter,38.0847,28.2106,15.3875,16.316667,196.341667,64.316667,150.458333,1.141667,6.945833,16.654167,106.87901,183.794776,2.0,1.0,3.0,1.0,0.0,4.0,0.0,1.0
54169,2024-02-29,izmir-narlidere,,0.0,Thursday,2,2024,No,1154,1331716,9,1,Yes,winter,38.3967,26.997,16.283333,25.470833,188.491667,68.395833,156.383333,2.0125,15.533333,17.704167,252.934444,206.959477,2.0,1.0,1.0,2.0,2.0,2.0,0.0,4.0
54170,2024-02-29,izmir-selcuk,,0.0,Thursday,2,2024,No,1154,1331716,9,1,Yes,winter,37.9508,27.37,15.429167,19.9125,189.5125,69.895833,126.6375,1.670833,11.745833,16.679167,181.22842,200.2721,0.0,0.0,1.0,1.0,3.0,4.0,2.0,0.0
54171,2024-02-29,manisa-kula,,0.0,Thursday,2,2024,No,1154,1331716,9,1,Yes,winter,38.5466,28.6441,13.2375,11.529167,197.520833,61.458333,166.983333,1.2125,4.120833,14.583333,54.549531,152.839687,4.0,7.0,6.0,1.0,5.0,7.0,1.0,1.0
54172,2024-02-29,manisa-ahmetli,,0.0,Thursday,2,2024,No,1154,1331716,9,1,Yes,winter,38.6184,28.6712,13.2875,10.95,196.6625,63.275,145.4,1.370833,3.754167,14.483333,49.88349,157.871177,0.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0


In [199]:
# window_cols = [col for col in df.columns if "window" in str(col)]
lag_cols = [col for col in df.columns if "lag" in str(col)]

## ML method

In [200]:
df2 = df.copy()

### encoding

In [201]:
cols_to_encoding_one_hot = ["ilce","season","day", "month", "week_of_year", "quarter"]

encoded = pd.get_dummies(df2[cols_to_encoding_one_hot], drop_first=True)
df_new = pd.concat([df2, encoded], axis=1)
df_new.drop(columns=cols_to_encoding_one_hot, inplace=True) 
    
# label encoding:
cols_to_label_encoding = ["holidays", "is_weekend"]
label_encoder = LabelEncoder()
for col in cols_to_label_encoding:
    df_new[col]= label_encoder.fit_transform(df_new[col]) 

### Scaling

In [202]:
num_cols_scaling =['t_2m:C','relative_humidity_2m:p','wind_dir_10m:d','wind_speed_10m:ms', 'bildirimli_sum',"effective_cloud_cover:p", "global_rad:W", "prob_precip_1h:p"
                   ,"prob_tc", "apparent_humidity"] + lag_cols
            
# scaler = MinMaxScaler()
scaler = StandardScaler()
for col in num_cols_scaling:
    # df_new[col] = replace_with_thresholds(df_new, col)
    scaled_values = scaler.fit_transform(df_new[col].values.reshape(-1,1))
    df_new[col] = scaled_values
        

### Split to train and val

In [203]:
df_train = df_new[~(df_new["bildirimsiz_sum"].isnull())].reset_index(drop=True)
df_test = df_new[(df_new["bildirimsiz_sum"].isnull())].reset_index(drop=True)

In [204]:
train =  df_train[df_train["tarih"]<'2024-01-01']
val =  df_train[df_train["tarih"]>='2024-01-01']

In [205]:
X_train = train.drop(["bildirimsiz_sum", "tarih"], axis=1)
y_train = train["bildirimsiz_sum"]

In [206]:
X_val = val.drop(["bildirimsiz_sum", 'tarih'], axis=1)
y_val = val["bildirimsiz_sum"]

### model

In [207]:
model = XGBRegressor()

model = XGBRegressor(
    colsample_bytree=0.07,
    learning_rate=0.07,
    max_depth=7,
    min_child_weight=4,
    n_estimators=1000,
    n_jobs=4,  
    objective='reg:linear',
    verbosity=1,  
    subsample=0.7)


In [213]:
df_train = df_new[df_new["tarih"]<='2024-01-31'].reset_index(drop=True)
df_test = df_new[df_new["tarih"]>'2024-01-31'].reset_index(drop=True)

In [214]:
X_train = df_train.drop("bildirimsiz_sum", axis=1)
y_train = df_train["bildirimsiz_sum"]

In [215]:
X_test = df_test.drop("bildirimsiz_sum", axis=1)

In [216]:
X_train.drop(["tarih"], axis=1, inplace=True)
X_test.drop(["tarih"], axis=1, inplace=True)

In [218]:
model.fit(X_train, y_train)



## Submession

In [219]:
y_pre_test = model.predict(X_test)

In [224]:
submession = pd.DataFrame(columns=["tarih","ilce", "bildirimsiz_sum"])
submession["tarih"] = df2[-1363:]["tarih"]
submession["ilce"] = df2[-1363:]["ilce"]
submession["bildirimsiz_sum"] =y_pre_test #  np.around(y_pre_test)
submession["bildirimsiz_sum"] = submession["bildirimsiz_sum"].astype(int)
submession["unique_id"] = ((submession["tarih"].astype(str))+'-'+ (submession["ilce"].astype(str)))
submession = submession[["unique_id", "bildirimsiz_sum"]]
submession.reset_index(drop=True, inplace=True)
submession['bildirimsiz_sum'] = submession['bildirimsiz_sum'].apply(lambda x: 0 if x < 0 else x)
submession.to_csv("sub23.csv", index=False)

In [228]:
# sub 1.79457
# sub23

In [225]:
submession['bildirimsiz_sum'].describe()

count    1363.000000
mean        3.876009
std         2.715330
min         0.000000
25%         2.000000
50%         4.000000
75%         6.000000
max        15.000000
Name: bildirimsiz_sum, dtype: float64

##  funcs

In [1]:
def outlier_thresholds(dataframe, group_column, target_column, q1=0.25, q3=0.75):
    grouped = dataframe.groupby(group_column)[target_column]
    quartile1 = grouped.quantile(q1)
    quartile3 = grouped.quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + (interquantile_range)
    low_limit = quartile1 - (interquantile_range)
    return low_limit, up_limit

def replace_with_thresholds(dataframe, group_column, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, group_column, variable)
    
    for group_name, group_data in dataframe.groupby(group_column):
        low_limit_group, up_limit_group = low_limit[group_name], up_limit[group_name]
        dataframe.loc[(dataframe[group_column] == group_name) & (dataframe[variable] < low_limit_group), variable] = low_limit_group
        dataframe.loc[(dataframe[group_column] == group_name) & (dataframe[variable] > up_limit_group), variable] = up_limit_group
    
    return dataframe[variable]
