# Part 2: Data Cleaning & Preprocessing
This notebook fills missing brand values, converts the numeric columns to numeric types, removes duplicate rows (by `asin`), standardizes brand names, drops unused columns, and saves the cleaned dataset for analysis.

In [None]:
import pandas as pd

# Read the raw items CSV into a DataFrame and preview the first rows.
raw_items_path = '20191226-items.csv'
df = pd.read_csv(raw_items_path)
df.head()

## 1. Handle Missing Values

In [None]:
# Replace missing brand values with the placeholder 'Unknown'.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['brand']: that form is chained assignment,
# which raises a FutureWarning on pandas 2.x and does not modify df at all
# once Copy-on-Write is enabled (the default from pandas 3.0).
df['brand'] = df['brand'].fillna('Unknown')
# Count of nulls still present in 'brand'; this should be 0 after the fill.
df['brand'].isnull().sum()

## 2. Clean Numeric Columns

In [None]:
# Convert each numeric field to a numeric dtype. Values that cannot be parsed
# become NaN (errors='coerce') and are then replaced with 0.
# NOTE(review): after this step a 0 can mean "missing/unparseable" as well as
# a genuine zero, which affects the summary statistics below — confirm this is intended.
numeric_columns = ['price', 'originalPrice', 'rating', 'totalReviews']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)
# Summary statistics for the converted columns.
df[numeric_columns].describe()

## 3. Check and Remove Duplicates

In [None]:
# Keep only the first row for each 'asin' value and discard later repeats.
df = df.drop_duplicates(subset=['asin'], keep='first')
# (rows, columns) remaining after de-duplication.
df.shape

## 4. Standardize Brand Names

In [None]:
# Title-case every brand so that case variants of the same name share one spelling.
# Note that str.title() also lowercases the tail of all-caps names (e.g. 'ABC' -> 'Abc').
title_cased_brand = df['brand'].str.title()
df['brand'] = title_cased_brand
# Preview the first ten distinct brand values.
df['brand'].unique()[:10]

## 5. Drop Unnecessary Columns

In [None]:
# Build the cleaned frame by removing the link/image columns; df itself is left unchanged.
unused_columns = ['url', 'image', 'reviewUrl']
df_cleaned = df.drop(columns=unused_columns)
df_cleaned.head()

## 6. Save Cleaned Dataset

In [None]:
# Write the cleaned frame to disk without the index column, then confirm.
cleaned_items_path = 'cleaned_items.csv'
df_cleaned.to_csv(cleaned_items_path, index=False)
print("Data cleaning complete. Cleaned dataset saved as 'cleaned_items.csv'")