In [2]:
import pandas as pd
import requests
import os

# Load the "Online Retail" workbook into a pandas DataFrame, downloading it
# first when no local copy exists.
file_path = 'Online Retail.xlsx'
if not os.path.exists(file_path):
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
    # Bound the request so a stalled connection cannot hang the kernel.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error: otherwise the error page would be saved
    # under file_path, and every later run would skip the download (the file
    # exists) and then fail inside read_excel.
    response.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(response.content)

# Read the Excel file (no encoding parameter needed)
df = pd.read_excel(file_path)
df.head()



Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
# Persist a copy of the loaded dataset under <home>/data1.
user_dir = os.path.expanduser("~")  # current user's home directory
data_dir = os.path.join(user_dir, "data1")
output_path = os.path.join(data_dir, "online retail.xlsx")

# Make sure the target folder exists (no error if it is already there).
os.makedirs(data_dir, exist_ok=True)

# Write the DataFrame without its index column, then report the location.
df.to_excel(output_path, index=False)
print(f"Dataset saved as 'online retail.xlsx' in {data_dir}")

Dataset saved as 'online retail.xlsx' in C:\Users\user\data1


In [None]:
# Data cleaning and transformation
# Drop rows that are missing CustomerID or InvoiceNo.
df = df.dropna(subset=['CustomerID', 'InvoiceNo'])

# Remove exact duplicate rows
df = df.drop_duplicates()

# Remove cancelled orders (those with InvoiceNo starting with 'C')
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

# Convert the cleaned dataset to basket format:
# one row per invoice, one column per item description, values 0/1.
basket = (
    df.groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()  # In case same item appears multiple times in same invoice
    .unstack(fill_value=0)  # Pivot: InvoiceNo as rows, Description as columns
    # Binarize: 1 = item in basket (summed quantity > 0), 0 = not.
    # A vectorized comparison replaces the per-cell applymap(lambda ...),
    # which is deprecated in recent pandas and slow on a wide matrix.
    .gt(0)
    .astype('int64')
)


  .applymap(lambda x: 1 if x > 0 else 0)  # Binarize: 1 = item in basket, 0 = not


In [8]:
# Save the basket matrix as a CSV file in the data directory and report
# where it was written along with its dimensions.
output_path = os.path.join(data_dir, "online_retail_basket.csv")
basket.to_csv(output_path)
print(
    f"Clean, transaction-based basket dataset saved to: {output_path}",
    f"Shape of basket: {basket.shape}",
    sep="\n",
)

Clean, transaction-based basket dataset saved to: C:\Users\user\data1\online_retail_basket.csv
Shape of basket: (18536, 3877)
