# Writing all_data CSV

In [None]:
import pandas as pd
from core import Config

# Load the filtered historic and static tables from disk and move both to
# pandas' NA-aware dtypes.
config = Config()
historic_frame: pd.DataFrame = pd.read_csv(
    config.filtered_dir / 'filtered_historic.csv',
    index_col=[0, 1],
).convert_dtypes()
static_dtypes: pd.DataFrame = pd.read_csv(config.filtered_dir / 'eda_filtered_static_dtypes.csv')
# NOTE(review): `dtype` normally takes a single dtype or a column -> dtype
# mapping; a raw 2-D `.values` array is probably not applied per column.
# Confirm the layout of the dtypes file and pass a dict if that was the intent.
static_frame: pd.DataFrame = pd.read_csv(
    config.filtered_dir / 'eda_filtered_static.csv',
    dtype=static_dtypes.values,
    index_col=0,
).convert_dtypes()

# Separate each table into one sub-frame per key of its outermost index level
# (presumably the instrument identifier -- verify against the CSV layout).
histordict: dict[str, pd.DataFrame] = {}
for instrument, df in historic_frame.groupby(level=0):
    histordict[instrument] = df
statdict: dict[str, pd.DataFrame] = {}
for instrument, df in static_frame.groupby(level=0):
    statdict[instrument] = df

# Attach the static columns to the historic rows of the same instrument.
# An instrument that has historic rows but no static entry raises KeyError here.
all_dataframes: dict[str, pd.DataFrame] = {}
for instrument, historic_df in histordict.items():
    all_dataframes[instrument] = historic_df.join(statdict[instrument])

# Stack the per-instrument results and persist them for the modelling section.
all_data: pd.DataFrame = pd.concat(list(all_dataframes.values()))
all_data.to_csv(config.dataset_dir / 'all_data.csv')

In [None]:
import gc

# Release the per-instrument intermediates created while assembling all_data.
# The names are popped from the notebook namespace instead of `del`-ed so the
# cell is safe to re-run, and safe when an empty groupby never bound the loop
# variables (`instrument`, `df`, `historic_df`); a bare `del` raises NameError
# in both situations.
for _stale_name in (
    'histordict',
    'statdict',
    'instrument',
    'df',
    'all_dataframes',
    'historic_df',
):
    globals().pop(_stale_name, None)
del _stale_name
gc.collect()

# Loading All Data

In [None]:
import pandas as pd
from core import Config
config = Config()
# Reload the combined table; the first two CSV columns become the row index.
all_data: pd.DataFrame = pd.read_csv(
    config.dataset_dir / 'all_data.csv',
    index_col=[0, 1],
)

# Split Training and Validation Set

In [None]:
# Turn the index levels back into ordinary columns and move every column to
# pandas' NA-aware dtypes.
training_data: pd.DataFrame = all_data.reset_index().convert_dtypes()

# The regression target is a single column; every remaining column is a feature.
target_column = 'TR.UpstreamScope3PurchasedGoodsAndServices'
X: pd.DataFrame = training_data.drop(columns=target_column)
y: pd.DataFrame = training_data[target_column].to_frame()

In [None]:
# Bucket the feature columns by dtype name so that every bucket can receive a
# placeholder of the matching type for its missing values.
group_types = X.columns.to_series().groupby(X.dtypes.apply(lambda dtype: dtype.name))
columns_by_dtype = {dtype_name: columns for dtype_name, columns in group_types}

# A dtype that does not occur in X yields an empty bucket instead of the
# KeyError that `get_group` raises.  This also covers re-running the cell,
# when the former 'string' columns have already been turned into 'category'.
no_columns = X.columns.to_series().iloc[:0]
string_columns = columns_by_dtype.get('string', no_columns)
boolean_columns = columns_by_dtype.get('boolean', no_columns)
float_columns = columns_by_dtype.get('Float64', no_columns)
int_columns = columns_by_dtype.get('Int64', no_columns)

# Replace missing feature values bucket by bucket; empty buckets are skipped.
for dtype_columns, fill_value in (
    (string_columns, 'missing'),
    (boolean_columns, False),
    (float_columns, 0),
    (int_columns, 0),
):
    if not dtype_columns.empty:
        X[dtype_columns] = X[dtype_columns].fillna(fill_value)

# Text columns become pandas categoricals.  Their positions are collected from
# the resulting dtypes (rather than from `string_columns`) so the list is still
# correct when the conversion already happened in an earlier run of this cell.
# On a first run the two are the same set of columns, in the same order,
# provided X held no categorical columns beforehand.
if not string_columns.empty:
    X[string_columns] = X[string_columns].astype('category')
cat_features = [
    position for position, dtype in enumerate(X.dtypes) if dtype.name == 'category'
]

# NOTE(review): missing target values are replaced by 0 and will be used as
# training labels -- confirm that an unreported value should count as zero
# rather than the row being dropped.
y = y.fillna(0)

In [None]:
from catboost import Pool
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable between runs.
data = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validation, y_train, y_validation = data

# Wrap each partition in a CatBoost Pool, passing the same categorical
# feature list to both.
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
validation_pool = Pool(X_validation, label=y_validation, cat_features=cat_features)

In [None]:
import gc

# Release the frames and helper objects that are no longer referenced by name
# after the Pool objects have been built.  `historic_frame`,
# `static_frame` and `static_dtypes` are only bound when the CSV-writing
# section was executed in this kernel, so a bare `del` raises NameError when
# the notebook is started from "Loading All Data" or when this cell is re-run.
# Popping the names from the notebook namespace tolerates both cases.
for _stale_name in (
    'all_data',
    'boolean_columns',
    'float_columns',
    'int_columns',
    'string_columns',
    'historic_frame',
    'static_frame',
    'static_dtypes',
    'group_types',
    'config',
    'training_data',
):
    globals().pop(_stale_name, None)
del _stale_name
gc.collect()

In [None]:
from catboost import CatBoostRegressor
# Gradient-boosted regressor capped at 1000 boosting iterations.
model = CatBoostRegressor(
    iterations=1000,
    nan_mode='Min',
)
# Train on the training pool, passing the validation pool as the evaluation
# set; verbose logging and the live plot are switched on.
model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=True,
    plot=True,
)