In [124]:
#Importing libraries
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [125]:
# Read the raw dataset from its location relative to this notebook.
# Adjust the path below if the CSV lives elsewhere.
df = pd.read_csv(filepath_or_buffer='../dataset/data.csv')

In [151]:
# Drop columns that are not needed for the analysis.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`,
# re-running it after the columns are already gone would otherwise raise a
# KeyError. Trade-off: a misspelled column name is skipped silently, so keep
# this list in sync with the CSV header.
columns_to_drop = ['id', 'sourceURLs', 'imageURLs', 'ean', 'upc', 'manufacturerNumber', 'keys']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [152]:
# Show the first five rows as a quick sanity check of the loaded columns
print(df.head(5))

  prices.availability prices.condition prices.currency  \
0                 Yes              New             USD   
1                 Yes              New             USD   
2                 Yes              New             USD   
3                 Yes              New             USD   
4     More on the Way              New             USD   

                                     prices.dateSeen  prices.isSale  \
0          2017-05-10T20:00:00Z,2017-05-09T15:00:00Z            NaN   
1  2017-10-10T02:00:00Z,2017-08-12T03:00:00Z,2017...            NaN   
2  2017-10-10T19:00:00Z,2017-09-12T14:00:00Z,2017...            NaN   
3  2017-09-08T05:00:00Z,2017-09-18T13:00:00Z,2017...            NaN   
4                               2017-12-05T13:00:00Z            NaN   

    prices.merchant                                   prices.shipping  \
0       Bestbuy.com                                               NaN   
1       Bestbuy.com                                               NaN   
2    

In [153]:
# Print the DataFrame's column index.
# By execution order this runs after the column drop, so it confirms which
# columns remain rather than which ones are about to be dropped.
print(df.columns)

Index(['prices.availability', 'prices.condition', 'prices.currency',
       'prices.dateSeen', 'prices.isSale', 'prices.merchant',
       'prices.shipping', 'prices.sourceURLs', 'asins', 'brand', 'categories',
       'dateAdded', 'dateUpdated', 'manufacturer', 'name', 'primaryCategories',
       'weight', 'price', 'year_added', 'month_added', 'day_added',
       'days_since_added'],
      dtype='object')


In [154]:
# Encode 'prices.isSale' as 1 (on sale) / 0 (not on sale); anything
# unrecognised, including missing values, becomes NaN.
#
# Why not a plain {'TRUE': 1, 'FALSE': 0} mapping: pandas.read_csv normally
# parses TRUE/FALSE text into real booleans, so string-only keys match nothing
# and silently turn the whole column into NaN. The lookup below accepts
# booleans, their text spellings in any letter case, and already-encoded 1/0
# values (True == 1 and False == 0 share dictionary keys), which also makes
# the cell safe to re-run.
_SALE_FLAGS = {True: 1, False: 0, 'true': 1, 'false': 0}


def _sale_flag(value):
    """Return 1/0 for a truthy/falsy sale marker, or NaN if it is not recognised."""
    if isinstance(value, str):
        value = value.strip().lower()
    return _SALE_FLAGS.get(value, np.nan)


df['prices.isSale'] = df['prices.isSale'].map(_sale_flag)

In [155]:
# Normalise 'dateAdded' to timezone-naive timestamps expressed in UTC.
# pd.to_datetime(..., utc=True) accepts raw strings, naive datetimes and
# tz-aware datetimes alike, so this cell works on a fresh kernel (where the CSV
# column has not been parsed yet) and can be re-run safely. Calling `.dt`
# directly on an unparsed column raises AttributeError.
date_added_utc = pd.to_datetime(df['dateAdded'], utc=True)
df['dateAdded'] = date_added_utc.dt.tz_localize(None)

# Age of each record in whole days. The reference instant is the current UTC
# time with its tz label removed, so both sides of the subtraction are on the
# same clock (a local-time "today" would be off by the machine's UTC offset).
# NOTE: this feature depends on the day the notebook is executed.
now_utc_naive = pd.Timestamp.now(tz='UTC').tz_localize(None)
df['days_since_added'] = (now_utc_naive - df['dateAdded']).dt.days


In [156]:
# Make 'dateAdded' timezone-aware (UTC).
# pd.to_datetime(..., utc=True) localises naive values to UTC and converts
# already-aware values to UTC, so the cell can be re-run; a bare
# `.dt.tz_localize('UTC')` raises TypeError once the column is tz-aware.
df['dateAdded'] = pd.to_datetime(df['dateAdded'], utc=True)

# The current instant as a genuine UTC timestamp. Localising a local-time
# "today" to UTC would merely relabel the machine's wall-clock time and be
# wrong by its UTC offset.
today = pd.Timestamp.now(tz='UTC')

# Age of each record in whole days.
# NOTE: this feature depends on the day the notebook is executed.
df['days_since_added'] = (today - df['dateAdded']).dt.days

In [None]:
# Force 'weight' to a numeric dtype; entries that cannot be parsed become NaN.
# NOTE(review): if the raw values carry unit text they would all be coerced to
# NaN here - confirm the column's raw format.
df['weight'] = df['weight'].pipe(pd.to_numeric, errors='coerce')


In [158]:
# Separate the prediction target ('price') from the remaining feature columns
y = df['price']
X = df.drop('price', axis=1)

In [159]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [160]:
# Scaler instance and the numerical columns it will be applied to.
# Nothing is fitted here; the scaler is only wired into the preprocessing step.
scaler = StandardScaler()
numerical_cols = ['weight', 'days_since_added']

In [161]:
# Preprocessing for the numerical features: fill missing values with the
# column median, then standardise.
# The imputer is needed because the weight cell coerces unparsable entries to
# NaN (errors='coerce'); without it those NaNs reach the regressor, which
# raises on scikit-learn versions without native NaN support. On data with no
# missing values the imputer is a no-op.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', scaler),
])

# Only the columns listed in `numerical_cols` are passed on; every other
# column of X is dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        # Add more transformations for other features (if needed)
    ])


In [162]:
# Chain the preprocessing step with a 100-tree random forest (fixed seed)
# so both are fitted and applied together.
model_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42, n_estimators=100)),
    ]
)

In [163]:
# Fit the pipeline on the training split, then predict the held-out split.
# (`fit` returns the fitted pipeline, so the two calls can be chained.)
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Root mean squared error on the test split.
# numpy and mean_squared_error come from the import cell at the top.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 575.5965817290239


Model-Making

In [184]:
# Restrict the model inputs to the weight and date-derived columns
X = df.loc[:, ['weight', 'year_added', 'month_added', 'day_added', 'days_since_added']]
y = df['price']

# 80/20 train-test split with a fixed seed
# (train_test_split comes from the import cell at the top)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the size of each resulting piece
for caption, subset in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, subset.shape)


X_train shape: (4348, 5)
X_test shape: (1088, 5)
y_train shape: (4348,)
y_test shape: (1088,)


Training

In [185]:
# Baseline: 100-tree random forest (fixed seed) on the unscaled features.
# RandomForestRegressor, mean_squared_error and numpy come from the import
# cell at the top; `fit` returns the fitted estimator.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out split
y_pred_rf = rf_model.predict(X_test)

# Root mean squared error on the test split
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print(f'Random Forest RMSE: {rmse_rf}')

Random Forest RMSE: 562.2273900297887


In [187]:
# Standardise the features: the scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
# (All names used here come from the import cell at the top.)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 100-tree random forest (fixed seed) trained on the scaled features
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

# Predictions for the held-out split
y_pred = model.predict(X_test_scaled)

# Test-set RMSE and coefficient of determination
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Report both metrics
print(f"RMSE: {rmse}")
print(f"R²: {r2}")


RMSE: 562.62573090101
R²: 0.4067512342757382


Model Evaluation (hyperparameter tuning is not performed yet)

In [198]:
# These two metrics are not part of the import cell at the top.
from sklearn.metrics import explained_variance_score, max_error

# Every metric is recomputed here from the current y_test / y_pred.
# `y_pred` is assigned in more than one earlier cell, so printing `rmse` and
# `r2` left over from a previous cell could silently describe a different
# model than the MAE/MSE computed below.

# MSE (Mean Squared Error) and its square root, RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# R-squared (coefficient of determination)
r2 = r2_score(y_test, y_pred)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)

# Explained Variance Score
evs = explained_variance_score(y_test, y_pred)

# Max Error (largest absolute residual)
max_err = max_error(y_test, y_pred)

# Display the evaluation metrics
print(f"RMSE: {rmse}")
print(f"R²: {r2}")
print(f"MAE (Mean Absolute Error): {mae}")
print(f"MSE (Mean Squared Error): {mse}")
print(f"Explained Variance Score: {evs}")
print(f"Max Error: {max_err}")



RMSE: 562.62573090101
R²: 0.4067512342757382
MAE (Mean Absolute Error): 304.5838728312959
MSE (Mean Squared Error): 316547.7130718957
Explained Variance Score: 0.40795126263630166
Max Error: 4269.253732803532


Additional evaluation metrics:

In [199]:
# These two metrics are not part of the import cell at the top.
from sklearn.metrics import explained_variance_score, max_error

# NOTE(review): this cell repeats the preceding metrics cell; consider keeping
# only one of them.
# All metrics are recomputed from the current y_test / y_pred so the printed
# RMSE and R² cannot be stale values left over from an earlier cell.

# MSE (Mean Squared Error) and its square root, RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# R-squared (coefficient of determination)
r2 = r2_score(y_test, y_pred)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)

# Explained Variance Score
evs = explained_variance_score(y_test, y_pred)

# Max Error (largest absolute residual)
max_err = max_error(y_test, y_pred)

# Display the evaluation metrics
print(f"RMSE: {rmse}")
print(f"R²: {r2}")
print(f"MAE (Mean Absolute Error): {mae}")
print(f"MSE (Mean Squared Error): {mse}")
print(f"Explained Variance Score: {evs}")
print(f"Max Error: {max_err}")


RMSE: 562.62573090101
R²: 0.4067512342757382
MAE (Mean Absolute Error): 304.5838728312959
MSE (Mean Squared Error): 316547.7130718957
Explained Variance Score: 0.40795126263630166
Max Error: 4269.253732803532


Calculate these metrics (including adjusted R²)

In [204]:
# These two metrics are not part of the import cell at the top.
from sklearn.metrics import explained_variance_score, max_error

# Inputs expected from earlier cells:
#   y_test: true target values
#   y_pred: values predicted by the model
#   X_test: test-set features (only its column count is used)

# MSE and RMSE are computed here rather than reused from an earlier cell; the
# print statements below need both, and relying on leftover kernel state makes
# the cell fail (or report stale numbers) when run on its own.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# R-squared (R²)
r2 = r2_score(y_test, y_pred)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)

# Explained Variance Score
evs = explained_variance_score(y_test, y_pred)

# Max Error (largest absolute residual)
max_err = max_error(y_test, y_pred)

# Adjusted R²: 1 - (1 - R²) * (n - 1) / (n - p - 1)
n = len(y_test)  # number of samples
p = X_test.shape[1]  # number of predictors
dof = n - p - 1
# The formula is undefined when there are not more samples than predictors + 1
# (zero or negative denominator), so report NaN in that case.
adj_r2 = 1 - (1 - r2) * (n - 1) / dof if dof > 0 else float('nan')

# Displaying all evaluation metrics
print(f"RMSE: {rmse}")
print(f"R²: {r2}")
print(f"Adjusted R²: {adj_r2}")
print(f"MAE (Mean Absolute Error): {mae}")
print(f"MSE (Mean Squared Error): {mse}")
print(f"Explained Variance Score: {evs}")
print(f"Max Error: {max_err}")


RMSE: 562.62573090101
R²: 0.4067512342757382
Adjusted R²: 0.4040097889627796
MAE (Mean Absolute Error): 304.5838728312959
MSE (Mean Squared Error): 316547.7130718957
Explained Variance Score: 0.40795126263630166
Max Error: 4269.253732803532
