In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from preprocessing import preprocess

In [None]:
# Load the training set, using the first CSV column as the DataFrame index.
data = pd.read_csv("train.csv", index_col=0)
# Disabled: optionally append the rows of "training_extra.csv" to the training set.
#data_plus = pd.read_csv("training_extra.csv", index_col=0)
#data = pd.concat([data, data_plus], axis=0)

In [None]:
# Show the last rows of the raw training data.
data.tail()

In [None]:
# Run the project's `preprocess` helper (from the local `preprocessing` module) on the training data.
# NOTE(review): `data` is rebound to the result, so re-running this cell passes
# already-preprocessed data back into `preprocess` — confirm that is safe, or restart the kernel first.
data = preprocess(data)
data.head()

In [None]:
# List each column's dtype after preprocessing.
data.dtypes

---

FIRST SANDBOX - testing ideas here before implementing them in the preprocessing function

In [None]:
# Partition the column labels by dtype: numeric columns vs. everything else.
numerical_columns = data.select_dtypes(include='number').columns
# "Categorical" here means every column whose dtype is not numeric.
categorical_columns = data.select_dtypes(exclude='number').columns

In [None]:
# Show every column label of the preprocessed frame.
data.columns

In [None]:
# Show the labels that were classified as non-numeric.
categorical_columns

END OF THE FIRST SANDBOX

---

In [None]:
# Sanity check: the numeric and non-numeric label sets must together account for
# every column of `data`. The check is enforced with an assertion (not only printed)
# so a stale split, e.g. from out-of-order cell execution, stops the run.
n_split_columns = len(numerical_columns) + len(categorical_columns)
assert n_split_columns == len(data.columns), (
    f"dtype split covers {n_split_columns} columns, expected {len(data.columns)}"
)
print(f"{n_split_columns} = {len(data.columns)}")

In [None]:
# Column labels of `data`, shown again after the sandbox section.
data.columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
import xgboost as xgb

In [None]:
# Column labels of `data`, for reference when choosing the encoder column lists.
data.columns

In [None]:
# Features that are expanded into one-hot indicator columns.
# ("Compartments" is not one-hot encoded; it is listed among the scaled columns instead.)
ohe_columns = ["Brand", "Material", "Size", "Style", "Color"]

# Features to standardize: two fixed columns plus every column of `data` whose
# name contains one of the markers below (case-sensitive substring match).
_aggregate_markers = ("mean", "Count", "STD", "VAR")
columns_to_scale = ["Compartments", "Weight Capacity (kg)"]
columns_to_scale += [
    col
    for col in data.columns
    if any(marker in col for marker in _aggregate_markers)
]

In [None]:
# One-hot encoder: categories not seen during fit are ignored at transform time
# (handle_unknown="ignore"), and dense output is requested (sparse_output=False).
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
# Standard scaler with default settings.
std = StandardScaler()

In [None]:
# Route each column group to its transformer; columns named in neither list are
# passed through unchanged (remainder="passthrough").
_transformer_steps = [
    ("OneHotEncoding", ohe, ohe_columns),
    ("Standardization", std, columns_to_scale),
]
column_transformer = ColumnTransformer(
    transformers=_transformer_steps,
    remainder="passthrough",
    n_jobs=-1,
    verbose=True,
    verbose_feature_names_out=False,
)
# Ask for pandas DataFrame output from transform / fit_transform.
# `set_output` returns the estimator itself, so rebinding keeps the same object
# and avoids echoing its repr as cell output.
column_transformer = column_transformer.set_output(transform="pandas")

In [None]:
# Feature pipeline. Only the column transformer is active; two further steps
# are currently disabled:
#   ("Preprocessing", preprocess)
#   ("Model", xgb.XGBRegressor())
_pipeline_steps = [("ColumnTransformer", column_transformer)]
pipeline = Pipeline(steps=_pipeline_steps)

In [None]:
# The target is the 'Price' column; every other column is a feature.
y = data['Price']
X = data.drop(columns=['Price'])

In [None]:
# Fit the feature pipeline on the training features and transform them.
# The input is derived from `data` rather than read back from `X`, so this cell is
# safe to re-run: feeding an already-transformed `X` into the pipeline again would
# fail, because the raw categorical columns it selects no longer exist.
X = pipeline.fit_transform(data.drop(columns='Price'))

In [None]:
# Preview the first three rows of the transformed feature frame.
X.head(3)

---
SECOND SANDBOX

END OF THE SECOND SANDBOX

---

In [None]:
import json

# Load the stored mapping whose keys name column pairs; each key is later split
# on "xxx" into two column names.
with open('test_scores_sorted_cv=2_150k_lines.json', 'r') as json_file:
    relevant_combinations = json.load(json_file)

# Keep the first 50 keys, in the order they appear in the file.
combinations_to_add = list(relevant_combinations)[:50]

# Add one product column per selected pair, named after the full key.
for pair_key in combinations_to_add:
    parts = pair_key.split("xxx")
    X[pair_key] = X[parts[0]] * X[parts[1]]

In [None]:
# Preview the feature frame, now including the added product columns.
X.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# 5-fold splitter with shuffling and a fixed seed, so fold assignment is reproducible.
KF = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Estimator under evaluation: an XGBoost regressor with default hyperparameters.
# A LinearRegression alternative is kept here, disabled.
#model = LinearRegression()
model = xgb.XGBRegressor()

---

CROSS-VALIDATION

In [None]:
#cv_results = cross_validate(model, X, y, cv=KF, verbose=3, return_train_score=True, scoring='neg_root_mean_squared_error', n_jobs=-1)
#cv_results

In [None]:
#cv_results['test_score'].mean()

In [None]:
# only with the first twenty : -38.880299988987474

In [None]:
# Train the estimator on the full feature-engineered training set.
# This step has to run: the prediction cell further down calls `model.predict`,
# which raises on an unfitted XGBRegressor after a fresh Restart & Run All.
# `fit` returns the estimator itself; rebinding avoids echoing its repr as cell output.
model = model.fit(X, y)

---

GRID SEARCH

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the XGBoost regressor.
# Seven parameters have two candidate values each and three have one,
# giving 2**7 = 128 combinations (each fitted once per CV fold by a grid search).
xgboost_param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [6, 10],
    'learning_rate': [0.01, 0.2],
    'subsample': [0.85],
    'colsample_bytree': [0.6, 0.8],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0.1, 1.0],
    # Was 'logloss', which is a classification metric. RMSE matches a regression
    # target and the 'neg_root_mean_squared_error' scoring used for model selection.
    'eval_metric': ['rmse'],
    'booster': ['gbtree'],
    'min_child_weight': [1, 3]
}

# Exhaustive search over `xgboost_param_grid`, ranked by negative RMSE with
# 3-fold cross-validation, using all available cores.
_grid_search_options = dict(
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=True,
    n_jobs=-1,
)
xgboost_grid = GridSearchCV(model, xgboost_param_grid, **_grid_search_options)

In [None]:
#xgboost_grid.fit(X, y)
#print(f"Best XGBoost Params: {xgboost_grid.best_params_}")
#print(f"Best model's score : {xgboost_grid.best_score_}")
#xgboost_best_model = xgboost_grid.best_estimator_

---

TEST DATA

In [None]:
# Load the test set, using the first CSV column as the DataFrame index.
test = pd.read_csv('test.csv', index_col=0)
test.head()

In [None]:
# Apply the project's `preprocess` helper to the test data with `submission=True`.
# NOTE(review): the effect of `submission=True` is defined in the `preprocessing`
# module and is not visible here — presumably it handles the missing target column; confirm.
# `test` is rebound, so re-running this cell preprocesses already-preprocessed data.
test = preprocess(test, submission=True)
test.head()

In [None]:
# Transform the test features with the already-fitted pipeline: `transform` (not
# `fit_transform`) reuses the encoder categories and scaling statistics learned earlier.
# `test` is rebound to the transformed frame, so this cell is not safe to re-run on its own.
test = pipeline.transform(test)
test.head()

In [None]:
# Add the same product columns to the test frame that were added to the training
# features: each key is split on "xxx" into the two columns to multiply.
for pair_key in combinations_to_add:
    parts = pair_key.split("xxx")
    test[pair_key] = test[parts[0]] * test[parts[1]]

In [None]:
# Build the submission frame: one row per test row, pairing the test index
# values ('id') with the model's predicted 'Price'.
preds = pd.DataFrame({'id': test.index, 'Price': model.predict(test)})
print(len(preds))
preds.head()

In [None]:
# Write the submission file; index=False leaves out the DataFrame's row index,
# so only the 'id' and 'Price' columns are written.
preds.to_csv('submission.csv', index=False)