# E-commerce Return Rate Reduction Analysis

This notebook performs the following steps:
- Load and clean order and return data
- Merge data to label returns
- Analyze return rates
- Build a logistic regression model
- Export high-risk products (predicted return probability above 0.7) to `high_risk_products.csv` for the Power BI dashboard


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load data: raw order and return tables, read relative to the notebook's
# working directory.
orders = pd.read_csv('orders.csv')
returns = pd.read_csv('returns.csv')

# Collapse returns to one row per order before joining. A left merge emits one
# output row per matching right-hand row, so a repeated order_id in returns.csv
# would otherwise duplicate that order and skew every rate computed below.
# max() keeps the flag set if any record for the order has it set
# (assumes return_status is a numeric 0/1 flag, as the int cast below already
# requires -- TODO confirm against returns.csv).
returns_per_order = returns.groupby('order_id', as_index=False)['return_status'].max()

# Merge datasets: orders without a return record get NaN, which is mapped to 0.
df = orders.merge(returns_per_order, on='order_id', how='left', validate='many_to_one')
assert len(df) == len(orders), "merge changed the number of order rows"
df['is_returned'] = df['return_status'].fillna(0).astype(int)

# Analyze return rates: mean of is_returned per (category, supplier).
# NOTE: despite the column name, the value is the plain mean of the flag
# (a fraction when the flag is 0/1); it is not scaled to 0-100.
return_rates = (
    df.groupby(['category', 'supplier'])['is_returned']
    .mean()
    .reset_index(name='return_percentage')
)
display(return_rates.sort_values(by='return_percentage', ascending=False))

In [None]:
# One-hot encoding for categorical variables (drop_first=True keeps k-1 dummy
# columns per variable).
df_encoded = pd.get_dummies(
    df,
    columns=['category', 'region', 'marketing_channel', 'supplier'],
    drop_first=True,
)

# Features and target. The target itself, identifiers, the order date and the
# raw return status are excluded from the feature matrix.
non_feature_cols = ['is_returned', 'order_id', 'product_id', 'customer_id', 'order_date', 'return_status']
# If `df` already carries a `return_probability` column (e.g. from an earlier
# scoring run in this kernel), keep it out of the features: it is the model's
# own output and would leak into training when this cell is re-run.
stale_cols = [col for col in ('return_probability',) if col in df_encoded.columns]
X = df_encoded.drop(columns=non_feature_cols + stale_cols)
y = df_encoded['is_returned']

# Split and model. stratify=y keeps the returned / not-returned ratio the same
# in the train and test partitions, so the report below is not distorted by an
# unlucky split of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# classification_report returns a plain-text table, hence print().
print(classification_report(y_test, y_pred))

In [None]:
# Cut-off above which a row is exported as high risk.
HIGH_RISK_THRESHOLD = 0.7

# Predict return probabilities for every row of X; column 1 of predict_proba is
# the probability of the second class in model.classes_ (the "returned" class
# when the labels are 0/1).
# NOTE(review): X contains the rows the model was trained on, so most of these
# scores are in-sample.
# The scores are attached to a new frame instead of being written onto `df`:
# mutating `df` here would put `return_probability` into the feature matrix
# the next time the encoding/training cell is re-run.
orders_scored = df.assign(return_probability=model.predict_proba(X)[:, 1])

# Filter high-risk rows and export them. Each row is a row of `df` (order
# level), even though the output file name says "products".
high_risk = orders_scored[orders_scored['return_probability'] > HIGH_RISK_THRESHOLD]
high_risk.to_csv('high_risk_products.csv', index=False)
high_risk[['order_id', 'category', 'supplier', 'return_probability']].head()