# Part 1: Introduction & Dataset Overview

In [None]:

import pandas as pd
import numpy as np

df = pd.read_csv("20191226-items.csv")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()


# Part 2: Data Cleaning & Preprocessing

In [None]:

df['brand'] = df['brand'].fillna("Unknown")
df['category'] = df['category'].fillna("Uncategorized")
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['price'] = df['price'].fillna(df['price'].median())
df = df.drop_duplicates()
df.info()


# Part 3: Exploratory Data Analysis

In [None]:

print(df.describe())
print("\nTop categories:\n", df['category'].value_counts().head(10))
print("\nTop brands:\n", df['brand'].value_counts().head(10))


# Part 4: Data Visualization

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8,5))
sns.histplot(df['price'], bins=50, kde=True)
plt.title("Price Distribution")
plt.show()

df['category'].value_counts().head(10).plot(kind='bar', figsize=(10,5))
plt.title("Top 10 Categories")
plt.show()

top_brands = df['brand'].value_counts().head(5).index
sns.boxplot(data=df[df['brand'].isin(top_brands)], x="brand", y="price")
plt.title("Price Distribution by Top Brands")
plt.show()


# Part 5: Predictive Modeling

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

model_df = df[['price', 'category', 'brand']].copy()
le_cat = LabelEncoder()
le_brand = LabelEncoder()
model_df['category_encoded'] = le_cat.fit_transform(model_df['category'])
model_df['brand_encoded'] = le_brand.fit_transform(model_df['brand'])

X = model_df[['category_encoded','brand_encoded']]
y = model_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))


# Part 6: Ethical Considerations


- Price prediction models risk reinforcing inequality  
- Bias in brand representation may favor global brands  
- Transparency in feature encoding and modeling is critical  
- User data expansion requires privacy-preserving techniques  


# Part 7: Social Justice Implications


- Skewed dataset towards certain brands/categories raises fairness concerns  
- Local brands may be underrepresented  
- Findings must be communicated responsibly  
- Inclusive datasets ensure accessibility across populations  


# Part 8: Conclusions & Recommendations


**Key Findings**  
- Cleaning resolved missing values  
- EDA revealed skew in prices and brand imbalance  
- Predictive models achieved moderate performance  
- Ethical and social justice risks identified  

**Recommendations**  
- Perform fairness audits  
- Apply differential privacy if user data is added  
- Ensure transparency in model use  
- Regularly update datasets with diverse sources  
