### Model Selection,Training and Evaluation.

In [2]:
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error


)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
import sys, os
sys.path.append(os.path.abspath(os.path.join("../scripts")))

In [4]:
root_path = os.path.abspath("..")
if root_path not in sys.path:
    sys.path.insert(0, root_path)

from data_preprocessing import DataPreprocessing


In [5]:
dp = DataPreprocessing()
data =  dp.read_data('../data/labeled_data/synthetic_product_listings_gpt_4o_mini_encoded_labeled.csv')

In [6]:
data

Unnamed: 0,category,brand,condition,seller_reputation,log_price
0,0,0,1,4,4.330733
1,1,1,1,2,7.170881
2,2,10,1,5,7.090910
3,2,8,0,4,5.860786
4,2,32,0,3,8.071219
...,...,...,...,...,...
990,2,32,1,3,7.467942
991,1,12,0,1,7.090910
992,0,0,0,5,4.330733
993,1,5,0,3,7.090910


#### Linear Regression

In [7]:
def split_and_prepare_data(df: DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 1):

    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input data must be a pandas DataFrame.")

    if target_column not in df.columns:
        raise ValueError(
            f"The target column '{target_column}' is not in the DataFrame."
        )

    # Split the dataset into training and testing sets
    X_train, X_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # Reset the index for both splits
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)

    # Extract target values
    y_train = X_train[target_column].values
    y_test = X_test[target_column].values

    # Remove the target column from features
    X_train = X_train.drop(columns=[target_column])
    X_test = X_test.drop(columns=[target_column])

    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = split_and_prepare_data(data, target_column="log_price", test_size=0.2, random_state=1)

In [9]:
def train_and_evaluate_lr(X_train, X_test, y_train, y_test):
    # Train Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print evaluation metrics
    print("\n--- Linear Regression Model Evaluation ---")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.2f}")

    return model

In [10]:

train_and_evaluate_lr(X_train, X_test, y_train, y_test)


--- Linear Regression Model Evaluation ---
Mean Squared Error (MSE): 0.39
Mean Absolute Error (MAE): 0.51
R² Score: 0.79



#### Decision Tree Regressor

In [11]:
def evaluate_model_dtr(X_train, X_test, y_train, y_test):
    # Train Decision Tree model
    model = DecisionTreeRegressor(max_depth=20, min_samples_split=10)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate regression metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    #rmse = root_mean_squared_error(y_test, y_pred)
    #rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Print metrics
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    #print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R² Score: {r2:.2f}")
    print(f"MAPE scores: {mape:.2f}")

    return model


In [12]:
evaluate_model_dtr(X_train, X_test, y_train, y_test)

Mean Absolute Error (MAE): 0.20
Mean Squared Error (MSE): 0.08
R² Score: 0.96
MAPE scores: 0.03


In [31]:
# Define the parameter grid
param_grid = {
    "max_features": ["sqrt", "log2", None],
    "max_depth": [None, 10, 20, 30], 
    "min_samples_split": [2, 5, 10], 
}

# Initialize the DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=1)

# Set up the GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="r2",  
    cv=5,  
    n_jobs=-1,
    verbose=2,  
    error_score="raise", 
)

# Perform the grid search
print("Starting grid search...")
grid_search.fit(X_train, y_train)
print("Grid search completed.")

# Display the best parameters and corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
mae =  mean_absolute_error(y_test, best_model.predict(X_test)) 
mse = mean_squared_error(y_test, best_model.predict(X_test))
r2 = r2_score(y_test, best_model.predict(X_test))
print("Test R2 Score:", test_score)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)

print("R² Score:", r2)

Starting grid search...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_split=5; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=5; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=5; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=5; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=5; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=10; total ti

#### RandomForestRegressor

In [14]:
def evaluate_model_rfr(X_train, X_test, y_train, y_test):
    # Train Random Forest Regressor
    model = RandomForestRegressor(n_estimators=100, max_depth=20, min_samples_split=10, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate regression metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Print metrics
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R² Score: {r2:.2f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}")

    return model

In [15]:
evaluate_model_rfr(X_train, X_test, y_train, y_test)

Mean Absolute Error (MAE): 0.19
Mean Squared Error (MSE): 0.07
R² Score: 0.96
Mean Absolute Percentage Error (MAPE): 0.03


In [16]:
# Define the parameter grid
param_grid = {
    "max_features": ["sqrt", "log2", None],
    "max_depth": [None, 10,15, 20, 30], 
    "min_samples_split": [2, 3,5, 10], 
}

# Initialize the DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=1)

# Set up the GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="r2",  
    cv=5,  
    n_jobs=-1,
    verbose=2,  
    error_score="raise", 
)

# Perform the grid search
print("Starting grid search...")
grid_search.fit(X_train, y_train)
print("Grid search completed.")

# Display the best parameters and corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test R2 Score:", test_score)

Starting grid search...
Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=3; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=3; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=3; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=3; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=3; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=5; total tim

#### XGBoost for Regression

In [17]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    max_depth=6,                  
    learning_rate=0.01,            
    colsample_bytree=0.8,         
    random_state=1                
)

# Train the model
xgb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test_scaled)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Absolute Error (MAE): 0.52
Mean Squared Error (MSE): 0.34
R² Score: 0.81


In [18]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Set up parameter grid
param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [3, 6, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    random_state=42
)

# Perform the search
random_search.fit(X_train_scaled, y_train)

# Best parameters found
print("Best parameters found: ", random_search.best_params_)

# Use the best model
best_model = random_search.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test_scaled)
mae_best = mean_absolute_error(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best Model - Mean Absolute Error (MAE): {mae_best:.2f}")
print(f"Best Model - Mean Squared Error (MSE): {mse_best:.2f}")
print(f"Best Model - R² Score: {r2_best:.2f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.8; total time=   1.4s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.8; total time=   1.5s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=3, n_estimators=1000, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=3, n_estimators=1000, subsample=0.8; total time=   0.8s
[CV] END

In [19]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

# Example dataset
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1]
}

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Best Model - Mean Absolute Error (MAE): {mae:.2f}")
print("Mean Squared Error:", mse)
print(f"Best Model - R² Score: {r2:.2f}")


Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 9, 'min_child_weight': 5, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0, 'subsample': 1.0}
Best Model - Mean Absolute Error (MAE): 0.20
Mean Squared Error: 0.07134533249437074
Best Model - R² Score: 0.96


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor

In [21]:
# Define base models
base_models = [
    ('rf', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)),
    ('dt', DecisionTreeRegressor(max_depth=5, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, min_child_weight=5,
                         reg_alpha=0.1, reg_lambda=0.1, subsample=1.0, colsample_bytree=1.0, random_state=42))
]

In [22]:
# Define Stacking Regressor with Linear Regression as meta-model
stacking_model = StackingRegressor(estimators=base_models, final_estimator=LinearRegression(), cv=5)

In [23]:
# Train the stacking model
stacking_model.fit(X_train, y_train)

# Predict on test set
y_pred = stacking_model.predict(X_test)

# Evaluate performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Stacking Model - Mean Absolute Error (MAE): {mae:.2f}")
print(f"Stacking Model - Mean Squared Error (MSE): {mse:.2f}")
print(f"Stacking Model - R² Score: {r2:.2f}")

Stacking Model - Mean Absolute Error (MAE): 0.19
Stacking Model - Mean Squared Error (MSE): 0.07
Stacking Model - R² Score: 0.96


In [24]:
from sklearn.linear_model import Ridge

# Use Ridge Regression as the final estimator
stacking_model = StackingRegressor(estimators=base_models, final_estimator=Ridge(alpha=1.0), cv=5)

# Train & Evaluate
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)

print(f"Stacking Model (Ridge) - R² Score: {r2_score(y_test, y_pred):.2f}")

Stacking Model (Ridge) - R² Score: 0.96


In [25]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    xgb_params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
    }

    model = XGBRegressor(**xgb_params, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="r2").mean()
    return score

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best parameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

[I 2025-02-24 12:50:25,146] A new study created in memory with name: no-name-83415c39-165f-4857-b172-5e6bb88b1c73
[I 2025-02-24 12:50:27,267] Trial 0 finished with value: 0.9502330824191484 and parameters: {'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.21521446906514857, 'min_child_weight': 9, 'subsample': 0.7421169678876207, 'colsample_bytree': 0.575094064263824, 'reg_alpha': 0.7211541590665105, 'reg_lambda': 0.2983674334579016}. Best is trial 0 with value: 0.9502330824191484.
[I 2025-02-24 12:50:29,241] Trial 1 finished with value: 0.9486511267274688 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.19169697288711363, 'min_child_weight': 10, 'subsample': 0.5989496004319462, 'colsample_bytree': 0.9745749037854476, 'reg_alpha': 0.1500661761937374, 'reg_lambda': 0.4050817254001653}. Best is trial 0 with value: 0.9502330824191484.
[I 2025-02-24 12:50:34,026] Trial 2 finished with value: 0.9408487736799644 and parameters: {'n_estimators': 800, 'max_depth':

Best Hyperparameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.07978140138335663, 'min_child_weight': 9, 'subsample': 0.6887611446098232, 'colsample_bytree': 0.8528536548870566, 'reg_alpha': 0.5185370737591963, 'reg_lambda': 0.6612089942938043}


In [26]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Use best parameters from Optuna
best_xgb = XGBRegressor(**study.best_params, random_state=42)

# Train model
best_xgb.fit(X_train, y_train)

# Predictions
y_pred = best_xgb.predict(X_test)

# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Optimized XGBoost - MAE: {mae:.2f}")
print(f"Optimized XGBoost - MSE: {mse:.2f}")
print(f"Optimized XGBoost - R² Score: {r2:.2f}")

Optimized XGBoost - MAE: 0.19
Optimized XGBoost - MSE: 0.07
Optimized XGBoost - R² Score: 0.96


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Sample dataset (Replace with your own dataset)
X, y = your_features, your_target  # Ensure you have defined your features & target

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBRegressor with best parameters
xgb_regressor = xgb.XGBRegressor(
    colsample_bytree=1.0,
    gamma=0.1,
    learning_rate=0.2,
    max_depth=9,
    min_child_weight=5,
    n_estimators=100,
    reg_alpha=0.1,
    reg_lambda=0,
    subsample=1.0,
    objective="reg:squarederror",  # Regression objective
    random_state=42
)

# Train the model
xgb_regressor.fit(X_train, y_train)

# Make predictions
y_pred = xgb_regressor.predict(X_test)

# Evaluate performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.6f}")
print(f"R² Score: {r2:.2f}")

In [32]:
X_test

Unnamed: 0,category,brand,condition,seller_reputation
0,0,0,0,2
1,0,0,1,4
2,1,1,0,3
3,0,0,1,4
4,0,0,0,1
...,...,...,...,...
194,2,28,0,3
195,1,1,0,3
196,0,0,0,5
197,0,0,0,2


In [33]:
y_test

array([4.33073334, 4.79579055, 7.17088079, 4.4543473 , 4.79579055,
       4.33073334, 8.07121854, 4.79579055, 7.49609735, 6.74641213,
       7.28000825, 4.79579055, 7.24493441, 7.09090982, 4.4543473 ,
       7.09090982, 5.86078622, 7.17088079, 7.46794233, 7.31388683,
       4.33073334, 4.79579055, 7.28000825, 4.33073334, 8.07121854,
       4.4543473 , 4.33073334, 8.07121854, 4.33073334, 8.07121854,
       7.17088079, 4.33073334, 7.49609735, 4.4543473 , 4.79579055,
       7.09090982, 7.46794233, 7.17088079, 7.17088848, 7.28000825,
       7.80425138, 4.79579055, 4.79579055, 7.13169851, 7.17088079,
       7.13169851, 7.17088079, 7.28000825, 7.31388017, 4.33073334,
       7.24493441, 7.49609735, 4.51085951, 8.07121854, 4.79579055,
       7.17088079, 7.0909015 , 7.82444593, 4.33073334, 7.17088079,
       4.39444915, 4.79579055, 7.31388017, 7.28000825, 4.79579055,
       8.16080392, 7.17088079, 7.17088079, 4.79579055, 7.17088079,
       7.13169851, 7.09090982, 4.4543473 , 4.79579055, 7.31388

In [34]:
X_train

Unnamed: 0,category,brand,condition,seller_reputation
0,0,18,0,2
1,0,18,1,1
2,0,18,0,1
3,0,18,0,5
4,1,1,0,3
...,...,...,...,...
791,0,0,0,5
792,0,0,1,3
793,0,0,1,3
794,1,12,0,2


In [35]:
y_train

array([4.79579055, 4.79579055, 4.79579055, 4.79579055, 7.17088079,
       7.49609735, 7.17088079, 7.17088079, 8.16080392, 8.07121854,
       8.07121854, 4.79579055, 4.79579055, 7.17088079, 7.17088079,
       7.20860034, 4.51085951, 4.79579055, 4.4543473 , 4.79579055,
       7.46794233, 8.07121854, 7.09090982, 5.86078622, 4.33073334,
       4.79579055, 6.11146734, 4.33073334, 7.17088079, 6.62140565,
       8.07121854, 7.09090982, 4.4543473 , 7.17088079, 7.09090982,
       7.28000825, 7.17088079, 4.79579055, 4.79579055, 7.17088079,
       7.46794233, 7.09090982, 7.17088079, 7.09090982, 7.28000825,
       5.86078622, 4.39444915, 7.09090982, 6.62140565, 4.79579055,
       7.28000825, 7.09090982, 7.09090982, 7.31388683, 7.49609735,
       7.09090982, 4.79579055, 4.79579055, 7.31388683, 4.26267988,
       4.33073334, 4.79579055, 8.07121854, 7.20860034, 7.28000825,
       7.28000825, 4.79579055, 7.24493441, 4.33073334, 7.0909015 ,
       7.09090982, 7.17088079, 7.13169851, 7.09090982, 4.79579

In [36]:
y_train.shape

(796,)

In [37]:
y_test.shape

(199,)

In [38]:
X_train.shape

(796, 4)