In [1]:
import os
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gzip
import shutil

In [164]:
# loading the dataset
df = pd.read_csv("C:/Users/ADMIN/return_risk/data/raw_data/amazon_co-ecommerce_sample.csv")

df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   index                                        10000 non-null  int64  
 1   uniq_id                                      10000 non-null  object 
 2   product_name                                 10000 non-null  object 
 3   manufacturer                                 9993 non-null   object 
 4   price                                        8565 non-null   object 
 5   number_available_in_stock                    7500 non-null   object 
 6   number_of_reviews                            9982 non-null   object 
 7   number_of_answered_questions                 9235 non-null   float64
 8   average_review_rating                        9982 non-null   object 
 9   amazon_category_and_sub_category             9310 non-null   object 
 10 

Unnamed: 0,index,number_of_answered_questions
count,10000.0,9235.0
mean,4999.5,1.834976
std,2886.89568,2.517268
min,0.0,1.0
25%,2499.75,1.0
50%,4999.5,1.0
75%,7499.25,2.0
max,9999.0,39.0


In [165]:
print(df.columns.tolist())

['index', 'uniq_id', 'product_name', 'manufacturer', 'price', 'number_available_in_stock', 'number_of_reviews', 'number_of_answered_questions', 'average_review_rating', 'amazon_category_and_sub_category', 'customers_who_bought_this_item_also_bought', 'description', 'product_information', 'product_description', 'items_customers_buy_after_viewing_this_item', 'customer_questions_and_answers', 'customer_reviews', 'sellers']


In [166]:
# drop fully empty rows, columns and duplicates
df.dropna(how = "all" ,inplace = True)
df.dropna(axis = 1, how = "all", inplace = True)
df.drop_duplicates(inplace=True)

In [167]:
#convert price to numeric (remove currency symbols)
df['price'] = (
    pd.to_numeric(
        df['price'].astype(str).str.replace(r'[^\d.]', '', regex=True),
        errors='coerce'
    )
)

#convert rating and review-related columns to numeric
df['number_of_reviews'] = pd.to_numeric(df['number_of_reviews'], errors='coerce')
df['number_of_answered_questions'] = pd.to_numeric(df['number_of_answered_questions'], errors='coerce')
df['average_review_rating'] = (
    df['average_review_rating']
    .astype(str)
    .str.extract(r'([\d.]+)')[0]  # get first capture group
    .astype(float)
)

#clean stock values
df['number_available_in_stock'] = (
    df['number_available_in_stock']
    .astype(str)
    .str.extract(r'(\d+)')[0]   # get first capture group
    .astype(float)
)

#input median values for missing values
for col in ['price', 'number_of_reviews', 'number_of_answered_questions',
            'average_review_rating', 'number_available_in_stock']:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

In [168]:
# Price bins — categorize price into bands
df['price_bin'] = pd.cut(
    df['price'],
    bins=[0, 20, 50, 100, 200, 500, np.inf],
    labels=['£0–20', '£20–50', '£50–100', '£100–200', '£200–500', '£500+']
)

In [177]:
# add return status based on average review rating
rating_threshold = 3.5
price_bin =
df['return_status'] = (df['average_review_rating']<rating_threshold).astype(int)

In [169]:
nltk.download('stopwords')
default_stopwords = set(stopwords.words('english'))
custom_words = {
    'him', 'her', 'his', 'hers', 'this', 'that', 'these', 'those',
    'it', 'its', 'my', 'your', 'yours', 'our', 'ours', 'their', 'theirs',
    'he', 'she', 'they', 'them', 'we', 'us'
}

all_stopwords = default_stopwords.union(custom_words)

def clean_review_custom(text):
    if pd.isnull(text):
        return ""
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize and remove stopwords + custom words
    words = text.split()
    words = [word for word in words if word not in all_stopwords]
    # Join back to string
    return " ".join(words)

# Apply it
df['customer_reviews_clean'] = df['customer_reviews'].apply(clean_review_custom)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [170]:
# sentiment analysis in customer reviews
def get_sentiment(text):
    if pd.isna(text):
        return 0
    return TextBlob(text).sentiment.polarity
df['review_sentiment'] = df['customer_reviews_clean'].apply(get_sentiment)

def label_sentiment(score):
    if score < -0.2:
        return 'Negative'
    elif score > 0.2:
        return 'Positive'
    else:
        return 'Neutral'

df['review_sentiment_label'] = df['review_sentiment'].apply(label_sentiment)

#rating bin
def rating_bin(rating):
    if pd.isna(rating):
        return 'Unknown'
    elif rating < 2.5:
        return 'Low'
    elif rating < 4.0:
        return 'Medium'
    else:
        return 'High'

df['rating_bin'] = df['average_review_rating'].apply(rating_bin)

#category risk flag
def category_risk(cat):
    if pd.isna(cat):
        return 'Unknown'
    cat = cat.lower()
    high_risk_keywords = ['dress', 'jeans', 'shirt', 'blouse', 'shoes', 'pants', 'trousers', 'skirt']
    for word in high_risk_keywords:
        if word in cat:
            return 'High'
    return 'Low'

df['category_risk'] = df['amazon_category_and_sub_category'].apply(category_risk)

# description length
df['description_length'] = df['description'].astype(str).apply(len)

def description_bin(length):
    if length < 50:
        return 'Short'
    elif length < 200:
        return 'Medium'
    else:
        return 'Long'

df['description_bin'] = df['description_length'].apply(description_bin)

#known brand flag
trusted_brands = ['Nike', 'Adidas', 'Zara', 'H&M', 'Levi', 'Uniqlo', 'Puma', 'North Face']

def is_known_brand(manu):
    if pd.isna(manu):
        return 0
    for brand in trusted_brands:
        if brand.lower() in manu.lower():
            return 1
    return 0

df['known_brand'] = df['manufacturer'].apply(is_known_brand)

#engagement bin
def engagement_bin(num):
    if num == 0:
        return 'None'
    elif num < 3:
        return 'Low'
    elif num < 10:
        return 'Medium'
    else:
        return 'High'

df['engagement'] = df['number_of_answered_questions'].apply(engagement_bin)

#cross-sell length
df['cross_sell_length'] = df['customers_who_bought_this_item_also_bought'].astype(str).apply(len)

In [172]:
# downloading updated csv to manual QA return labels
output_dir = r'C:/Users/ADMIN/return_risk/data/processed_data'
os.makedirs(output_dir, exist_ok=True)

final_name = "to_qa_return_labels_amazon.csv"

output_path = os.path.join(output_dir, final_name)

df.to_csv(output_path, index=False)

print(f"✅ File saved at: {output_path}")
print(f"📏 File size: {os.path.getsize(output_path) / (1024**2):.2f} MB")


✅ File saved at: C:/Users/ADMIN/return_risk/data/processed_data\to_qa_return_labels_amazon.csv
📏 File size: 38.99 MB


In [None]:
#dummy return label for training
np.random.seed(42)  # for reproducibility
df['return_label'] = np.random.choice([0, 1], size=len(df))

In [151]:
df

Unnamed: 0,index,uniq_id,product_name,manufacturer,price,number_available_in_stock,number_of_reviews,number_of_answered_questions,average_review_rating,amazon_category_and_sub_category,...,review_sentiment,review_sentiment_label,rating_bin,category_risk,description_length,description_bin,known_brand,engagement,cross_sell_length,return_label
0,0,eac7efa5dbd3d667f26eb3d3ab504464,Hornby 2014 Catalogue,Hornby,3.42,5.0,15.0,1.0,4.9,Hobbies > Model Trains & Railway Sets > Rail V...,...,0.372154,Positive,High,Low,72,Medium,0,Low,367,0
1,1,b17540ef7e86e461d37f3ae58b7b72ac,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,16.99,5.0,2.0,1.0,4.5,Hobbies > Model Trains & Railway Sets > Rail V...,...,0.800000,Positive,High,Low,134,Medium,0,Low,489,1
2,2,348f344247b0c1a935b1223072ef9d8a,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,9.99,2.0,17.0,2.0,3.9,Hobbies > Model Trains & Railway Sets > Rail V...,...,0.276351,Positive,Medium,Low,357,Long,0,Low,445,0
3,3,e12b92dbb8eaee78b22965d2a9bbbd9f,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hornby,39.99,5.0,1.0,2.0,5.0,Hobbies > Model Trains & Railway Sets > Rail V...,...,0.500000,Positive,High,Low,57,Medium,0,Low,3,0
4,4,e33a9adeed5f36840ccc227db4682a36,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,32.19,5.0,3.0,2.0,4.7,Hobbies > Model Trains & Railway Sets > Rail V...,...,0.309583,Positive,High,Low,398,Long,0,Low,422,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,44d6967f083825a5de36ad4865a65bcd,Batman 1966 TV Series Action Figures - The Rid...,Mattel,22.95,5.0,3.0,3.0,5.0,Hobbies > Collectible Figures & Memorabilia > ...,...,0.182593,Neutral,High,Low,59,Medium,0,Medium,419,1
9996,9996,08f0747b6fc6687215ffb994c3a6fb32,"Star Wars Costume, Kids Stormtrooper Costume S...",Star Wars,39.99,5.0,1.0,3.0,4.0,Characters & Brands > Star Wars > Toys,...,0.200000,Neutral,High,Low,3,Short,0,Medium,346,0
9997,9997,bf6cc073f8f24e6e338190fa16f6ee9d,Defiance Lawkeeper Metal Badge Prop Replica,Olde Scotland Yard Ltd.,43.99,3.0,1.0,3.0,5.0,Novelty & Special Use > Novelty > Accessories ...,...,1.000000,Positive,High,Low,161,Medium,0,Medium,3,1
9998,9998,cd783d0b8b44e631b9788b203eaaefae,Justice League of America Series 3 Green Lante...,DC Comics,49.81,3.0,1.0,3.0,5.0,Hobbies > Collectible Figures & Memorabilia > ...,...,0.036182,Neutral,High,Low,434,Long,0,Medium,3,1


In [152]:
df.columns

Index(['index', 'uniq_id', 'product_name', 'manufacturer', 'price',
       'number_available_in_stock', 'number_of_reviews',
       'number_of_answered_questions', 'average_review_rating',
       'amazon_category_and_sub_category',
       'customers_who_bought_this_item_also_bought', 'description',
       'product_information', 'product_description',
       'items_customers_buy_after_viewing_this_item',
       'customer_questions_and_answers', 'customer_reviews', 'sellers',
       'price_bin', 'customer_reviews_clean', 'review_sentiment',
       'review_sentiment_label', 'rating_bin', 'category_risk',
       'description_length', 'description_bin', 'known_brand', 'engagement',
       'cross_sell_length', 'return_label'],
      dtype='object')

In [153]:
# Ordinal encode rating bin
rating_map = {'Low': 0, 'Medium': 1, 'High': 2, 'Unknown': -1}
df['rating_bin_num'] = df['rating_bin'].map(rating_map)

# Ordinal encode category risk
category_map = {'Low': 0, 'High': 1, 'Unknown': -1}
df['category_risk_num'] = df['category_risk'].map(category_map)

# Ordinal encode description bin
desc_map = {'Short': 0, 'Medium': 1, 'Long': 2}
df['description_bin_num'] = df['description_bin'].map(desc_map)

# One-hot encode sentiment label
df = pd.get_dummies(df, columns=['review_sentiment_label'], prefix='sentiment')

# Ordinal encode engagement
engagement_map = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3}
df['engagement_num'] = df['engagement'].map(engagement_map)

In [154]:
features = [
    'price',
    'number_available_in_stock',
    'number_of_reviews',
    'average_review_rating',
    'known_brand',
    'cross_sell_length',
    'description_length',
    'review_sentiment',  # numeric polarity
    'rating_bin_num',
    'category_risk_num',
    'description_bin_num',
    'engagement_num',
    # One-hot sentiment columns:
    'sentiment_Negative',
    'sentiment_Neutral',
    'sentiment_Positive'
]

X = df[features]
y = df['return_label']

In [155]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [156]:
# train the logistic regression
model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [157]:
# Get probability of return
df['return_probability'] = model.predict_proba(X)[:, 1]

# Map to Low, Medium, High
df['return_risk'] = pd.cut(
    df['return_probability'],
    bins=[0, 0.3, 0.7, 1.0],
    labels=['Low', 'Medium', 'High']
)

In [158]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(f"AUC: {roc_auc_score(y_test, model.predict_proba(X_test)[:,1]):.2f}")

              precision    recall  f1-score   support

           0       0.48      0.54      0.51      1006
           1       0.47      0.41      0.44       994

    accuracy                           0.48      2000
   macro avg       0.48      0.48      0.48      2000
weighted avg       0.48      0.48      0.48      2000

[[548 458]
 [582 412]]
AUC: 0.48


In [159]:
print(df[['uniq_id', 'return_probability', 'return_risk']].head(10))

                            uniq_id  return_probability return_risk
0  eac7efa5dbd3d667f26eb3d3ab504464            0.491082      Medium
1  b17540ef7e86e461d37f3ae58b7b72ac            0.483299      Medium
2  348f344247b0c1a935b1223072ef9d8a            0.534438      Medium
3  e12b92dbb8eaee78b22965d2a9bbbd9f            0.498545      Medium
4  e33a9adeed5f36840ccc227db4682a36            0.502118      Medium
5  cb34f0a84102c1ebc3ef6892d7444d36            0.491603      Medium
6  f74b562470571dfb689324adf236f82c            0.510452      Medium
7  87bbb472ef9d90dcef140a551665c929            0.503717      Medium
8  7e2aa2b4596a39ba852449718413d7cc            0.507747      Medium
9  5afbaf65680c9f378af5b3a3ae22427e            0.510751      Medium


In [160]:
output_dir = r'C:/Users/ADMIN/return_risk/data/processed_data'
os.makedirs(output_dir, exist_ok=True)

# Save compressed
gz_path = os.path.join(output_dir, 'processed_data_amazon.csv.gz')
df.to_csv(
    gz_path,
    index=False,
    compression='gzip'
)

# Decompress
csv_path = os.path.join(output_dir, 'processed_data_amazon.csv')
with gzip.open(gz_path, 'rb') as f_in:
    with open(csv_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# ✅ Check size of uncompressed CSV
print(f"File size: {os.path.getsize(csv_path) / (1024**2):.2f} MB")


File size: 39.42 MB


In [161]:
print(f"File size: {os.path.getsize(output_path) / (1024**2):.2f} MB")

File size: 39.42 MB
