# Writing all_data CSV

In [None]:
import pandas as pd

from core import Config

# Project configuration object; every input path below is built from
# its `eda_filtered_dir` attribute.
config = Config()

# Static table and the CSV describing its column dtypes.
STATIC_PATH = config.eda_filtered_dir / "eda_filtered_static.csv"
STATIC_DTYPES_PATH = config.eda_filtered_dir / "eda_filtered_static_dtypes.csv"

# Historic table and the CSV describing its column dtypes.
HISTORIC_PATH = config.eda_filtered_dir / "eda_filtered_historic.csv"
HISTORIC_DTYPES_PATH = config.eda_filtered_dir / "eda_filtered_historic_dtypes.csv"

def _read_dtype_map(path) -> dict[str, str]:
    dtypes_df: pd.DataFrame = pd.read_csv(path, index_col=1)
    return dtypes_df.iloc[:, 1].to_dict()

# Column -> dtype mappings that tell read_csv how to parse each table.
static_dtypes: dict[str, str] = _read_dtype_map(STATIC_DTYPES_PATH)
historic_dtypes: dict[str, str] = _read_dtype_map(HISTORIC_DTYPES_PATH)

# The historic table is indexed by its first two columns, the static
# table by its first column only.
historic_df: pd.DataFrame = pd.read_csv(HISTORIC_PATH, dtype=historic_dtypes, index_col=[0, 1])
static_df: pd.DataFrame = pd.read_csv(STATIC_PATH, dtype=static_dtypes, index_col=0)

# Look up one static row per historic row using the first level of the
# historic index, then give the result the full two-level historic index
# so both frames line up row for row.
instrument_index = historic_df.index.get_level_values(0)
static_aligned = static_df.reindex(instrument_index)
static_aligned.index = historic_df.index

# Place the static columns next to the historic columns.
all_data: pd.DataFrame = pd.concat([historic_df, static_aligned], axis="columns")

# Only `all_data` is kept; release every intermediate object and the
# path constants, then ask the garbage collector to reclaim the memory.
del (
    historic_df,
    static_aligned,
    static_df,
    instrument_index,
    static_dtypes,
    historic_dtypes,
    HISTORIC_DTYPES_PATH,
    STATIC_PATH,
    HISTORIC_PATH,
    STATIC_DTYPES_PATH,
)
import gc
gc.collect()

In [None]:
# Persist the combined table, including its row index, under the dataset directory.
all_data.to_csv(path_or_buf=config.dataset_dir / 'all_data.csv')

# Loading All Data

In [5]:
import pandas as pd
from core import Config
config = Config()
# Read the saved table back, rebuilding the two-level row index from the
# first two CSV columns.
all_data = pd.read_csv(
    filepath_or_buffer=config.dataset_dir / 'all_data.csv',
    index_col=[0, 1],
)

# Prepare Features and Split Training and Validation Sets

In [8]:
# Column the model is trained to predict.
TARGET_COLUMN = 'TR.UpstreamScope3PurchasedGoodsAndServices'

# Turn the row index into ordinary columns, switch to pandas' nullable
# dtypes, and discard every row that has no target value. Built as one
# chain so re-running the cell always starts again from `all_data`.
training_data: pd.DataFrame = (
    all_data
    .reset_index()
    .convert_dtypes()
    .dropna(subset=[TARGET_COLUMN])
)

# y is kept as a one-column DataFrame; X is everything except the target.
y: pd.DataFrame = training_data[TARGET_COLUMN].to_frame()
X: pd.DataFrame = training_data.drop(columns=TARGET_COLUMN)



In [9]:
# Group the feature column names by the name of each column's dtype.
group_types = X.columns.to_series().groupby(X.dtypes.apply(lambda x: x.name))


def _columns_of_dtype(dtype_name: str) -> list:
    """Return the names of the columns of `X` whose dtype is named `dtype_name`.

    An empty list is returned when no column has that dtype; calling
    `group_types.get_group` directly raises KeyError in that situation.
    """
    if dtype_name not in group_types.groups:
        return []
    return group_types.get_group(dtype_name).tolist()


string_columns = _columns_of_dtype('string')
boolean_columns = _columns_of_dtype('boolean')
float_columns = _columns_of_dtype('Float64')
int_columns = _columns_of_dtype('Int64')

# Replace missing values with a placeholder chosen per dtype. Each step
# is skipped when the dataset has no column of that dtype.
if string_columns:
    X[string_columns] = X[string_columns].fillna('missing')
if boolean_columns:
    X[boolean_columns] = X[boolean_columns].fillna(False)
if float_columns:
    X[float_columns] = X[float_columns].fillna(0)
if int_columns:
    X[int_columns] = X[int_columns].fillna(0)

# String columns become categoricals; their positions in X are recorded
# so they can be passed on as the categorical feature indices.
if string_columns:
    X[string_columns] = X[string_columns].astype('category')
cat_features = [X.columns.get_loc(c) for c in string_columns]

# Safeguard only: rows with a missing target were already removed when
# X and y were built.
y = y.fillna(0)

In [None]:
from catboost import Pool
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps
# the split the same on every run.
data = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validation, y_train, y_validation = data

# Wrap each part in a CatBoost Pool, passing the positions of the
# categorical columns alongside the features and labels.
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
validation_pool = Pool(X_validation, label=y_validation, cat_features=cat_features)

In [None]:
# Release the large intermediates now that the training and validation
# pools exist. Names are removed with pop() rather than `del` because
# some of them are not defined on every execution path: `historic_frame`
# and `static_frame` are never assigned in this notebook, and
# `static_dtypes` is already deleted by the cell that writes the CSV.
# A plain `del` raises NameError at the first missing name, which leaves
# the later names alive and skips the garbage collection.
for _name in (
    "all_data",
    "boolean_columns",
    "float_columns",
    "int_columns",
    "string_columns",
    "historic_frame",
    "static_frame",
    "static_dtypes",
    "group_types",
    "config",
    "training_data",
):
    globals().pop(_name, None)
del _name
import gc
gc.collect()

In [None]:
from catboost import CatBoostRegressor
# Regressor limited to 1000 boosting iterations; nan_mode selects how
# CatBoost treats missing numeric values.
model = CatBoostRegressor(nan_mode='Min', iterations=1000)
# Train on the training pool, evaluate on the validation pool, and show
# both the textual log and the interactive plot.
model.fit(train_pool, eval_set=validation_pool, plot=True, verbose=True)