In [31]:
import pandas as pd

# Load your dataset
df = pd.read_csv('amz_uk_processed_data.csv')

# Preview the first few rows
print(df.head())

         asin                                              title  \
0  B09B96TG33  Echo Dot (5th generation, 2022 release) | Big ...   
1  B01HTH3C8S  Anker Soundcore mini, Super-Portable Bluetooth...   
2  B09B8YWXDF  Echo Dot (5th generation, 2022 release) | Big ...   
3  B09B8T5VGV  Echo Dot with clock (5th generation, 2022 rele...   
4  B09WX6QD65  Introducing Echo Pop | Full sound compact Wi-F...   

                                              imgUrl  \
0  https://m.media-amazon.com/images/I/71C3lbbeLs...   
1  https://m.media-amazon.com/images/I/61c5rSxwP0...   
2  https://m.media-amazon.com/images/I/61j3SEUjMJ...   
3  https://m.media-amazon.com/images/I/71yf6yTNWS...   
4  https://m.media-amazon.com/images/I/613dEoF9-r...   

                               productURL  stars  reviews  price  \
0  https://www.amazon.co.uk/dp/B09B96TG33    4.7    15308  21.99   
1  https://www.amazon.co.uk/dp/B01HTH3C8S    4.7    98099  23.99   
2  https://www.amazon.co.uk/dp/B09B8YWXDF    4.7  

In [32]:
# Check dataset info
df.info()

# Check for missing values
print(df.isnull().sum())

# Summary statistics
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2222742 entries, 0 to 2222741
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   asin               object 
 1   title              object 
 2   imgUrl             object 
 3   productURL         object 
 4   stars              float64
 5   reviews            int64  
 6   price              float64
 7   isBestSeller       bool   
 8   boughtInLastMonth  int64  
 9   categoryName       object 
dtypes: bool(1), float64(2), int64(2), object(5)
memory usage: 154.7+ MB
asin                 0
title                0
imgUrl               0
productURL           0
stars                0
reviews              0
price                0
isBestSeller         0
boughtInLastMonth    0
categoryName         0
dtype: int64
              stars       reviews         price  boughtInLastMonth
count  2.222742e+06  2.222742e+06  2.222742e+06       2.222742e+06
mean   2.031870e+00  3.821617e+02  9.425737e+01      

In [33]:
# Remove duplicates based on ASIN
df = df.drop_duplicates(subset=['asin'])

# Remove unnecessary columns (like imgUrl and productURL)
df = df.drop(['imgUrl', 'productURL'], axis=1)

# Strip leading/trailing spaces in textual columns
df['title'] = df['title'].str.strip()

# Convert boolean columns to integers (e.g., 'isBestSeller')
df['isBestSeller'] = df['isBestSeller'].astype(int)

# Preview cleaned data
print(df.head())

         asin                                              title  stars  \
0  B09B96TG33  Echo Dot (5th generation, 2022 release) | Big ...    4.7   
1  B01HTH3C8S  Anker Soundcore mini, Super-Portable Bluetooth...    4.7   
2  B09B8YWXDF  Echo Dot (5th generation, 2022 release) | Big ...    4.7   
3  B09B8T5VGV  Echo Dot with clock (5th generation, 2022 rele...    4.7   
4  B09WX6QD65  Introducing Echo Pop | Full sound compact Wi-F...    4.6   

   reviews  price  isBestSeller  boughtInLastMonth    categoryName  
0    15308  21.99             0                  0  Hi-Fi Speakers  
1    98099  23.99             1                  0  Hi-Fi Speakers  
2    15308  21.99             0                  0  Hi-Fi Speakers  
3     7205  31.99             0                  0  Hi-Fi Speakers  
4     1881  17.99             0                  0  Hi-Fi Speakers  


In [34]:
# Check for missing values
print(df.isnull().sum())

# If there are missing values, you can decide to fill them or drop the rows
# For example, fill missing 'stars' with mean value
df['stars'] = df['stars'].fillna(df['stars'].mean())

# Similarly, you can drop rows where critical columns like 'asin' or 'title' are missing
df = df.dropna(subset=['asin', 'title'])

# Check again to ensure no critical missing values
print(df.isnull().sum())

asin                 0
title                0
stars                0
reviews              0
price                0
isBestSeller         0
boughtInLastMonth    0
categoryName         0
dtype: int64
asin                 0
title                0
stars                0
reviews              0
price                0
isBestSeller         0
boughtInLastMonth    0
categoryName         0
dtype: int64


In [35]:
import numpy as np

# Apply log transformation to the 'reviews' column
df['log_reviews'] = np.log1p(df['reviews'])  # log(1 + reviews) to avoid issues with zero values


In [36]:
# Create price bands
df['price_band'] = pd.cut(df['price'], bins=[0, 20, 50, 100, 200, 500], labels=['Low', 'Mid', 'High', 'Premium', 'Luxury'])


In [37]:
# Create rating categories
df['rating_category'] = pd.cut(df['stars'], bins=[0, 3, 4, 5], labels=['Low', 'Medium', 'High'])


In [38]:
# Interaction metric based on reviews, price, and stars
df['interaction_score'] = (df['reviews'] * df['stars']) / df['price']


In [39]:
# Encode the categorical columns 'price_band', 'categoryName', and 'rating_category'
df = pd.get_dummies(df, columns=['price_band', 'categoryName', 'rating_category'], drop_first=True)

# Check the new structure after encoding
print(df.head())


         asin                                              title  stars  \
0  B09B96TG33  Echo Dot (5th generation, 2022 release) | Big ...    4.7   
1  B01HTH3C8S  Anker Soundcore mini, Super-Portable Bluetooth...    4.7   
2  B09B8YWXDF  Echo Dot (5th generation, 2022 release) | Big ...    4.7   
3  B09B8T5VGV  Echo Dot with clock (5th generation, 2022 rele...    4.7   
4  B09WX6QD65  Introducing Echo Pop | Full sound compact Wi-F...    4.6   

   reviews  price  isBestSeller  boughtInLastMonth  log_reviews  \
0    15308  21.99             0                  0     9.636196   
1    98099  23.99             1                  0    11.493743   
2    15308  21.99             0                  0     9.636196   
3     7205  31.99             0                  0     8.882669   
4     1881  17.99             0                  0     7.540090   

   interaction_score  price_band_Mid  ...  categoryName_Vases  \
0        3271.832651            True  ...               False   
1       19219.06

In [40]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Apply normalization to numerical columns: 'reviews', 'price', 'interaction_score', etc.
df[['reviews', 'price', 'interaction_score']] = scaler.fit_transform(df[['reviews', 'price', 'interaction_score']])

# Check the normalized values
print(df.head())


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
from sklearn.model_selection import train_test_split

# For content-based filtering or general purpose splitting
train, test = train_test_split(df, test_size=0.25, random_state=42)

# Display train and test sizes
print(f"Train set: {train.shape}")
print(f"Test set: {test.shape}")


In [None]:
# Save the processed dataset to a new CSV file
df.to_csv('processed_ecommerce_data.csv', index=False)

print("Preprocessing complete and file saved!")
