# In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load scraped data (assuming it's saved as a CSV)
df = pd.read_csv("scraped_amazon_data.csv")

# Rows without a price have no regression target, and the estimators fitted
# later in this script fail on NaN targets, so drop them before any statistics
# (e.g. the rating median) are computed. `.copy()` detaches the filtered frame
# so the column assignments below do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['price']).copy()


# Feature Engineering
def _extract_brand(title):
    """Return the first whitespace-separated word of *title*, or 'Unknown'.

    Non-string values (such as NaN for a missing title) and titles that are
    empty or whitespace-only have no first word, so they map to 'Unknown'
    instead of raising IndexError.
    """
    if not isinstance(title, str):
        return 'Unknown'
    words = title.split()
    return words[0] if words else 'Unknown'


df['brand'] = df['product_title'].apply(_extract_brand)  # First word of the title stands in for the brand
df['average_rating'] = df['average_rating'].fillna(df['average_rating'].median())  # Missing ratings -> median rating
df['review_count'] = df['review_count'].fillna(0).astype(int)  # Missing review counts -> 0

# Encoding categorical variables
# Each distinct brand string is mapped to an integer code; `le` is kept so the
# codes can be mapped back to brand names with `le.inverse_transform`.
le = LabelEncoder()
df['brand_encoded'] = le.fit_transform(df['brand'])

# Model inputs are three numeric columns; the target is the listing price.
features = ['brand_encoded', 'average_rating', 'review_count']
X, y = df[features], df['price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest: fit 100 trees on the scaled training rows, then predict the
# held-out rows. `fit` returns the estimator, so it is chained on the constructor.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)
rf_preds = rf_model.predict(X_test_scaled)

# XGBoost: same data, same number of boosting rounds and seed.
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)
xgb_preds = xgb_model.predict(X_test_scaled)

# Evaluation
def _mae_and_rmse(actual, predicted):
    """Return the (MAE, RMSE) pair for one set of predictions."""
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return mae, rmse


# Score both models against the same held-out targets.
rf_mae, rf_rmse = _mae_and_rmse(y_test, rf_preds)
xgb_mae, xgb_rmse = _mae_and_rmse(y_test, xgb_preds)

print(f"Random Forest - MAE: {rf_mae}, RMSE: {rf_rmse}")
print(f"XGBoost - MAE: {xgb_mae}, RMSE: {xgb_rmse}")
