# Import Dependencies and Dataset

In [1]:
#import dependencies
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np

In [None]:
# Download the UCI repository dataset with id 352 (needs network access)
online_retail = fetch_ucirepo(id=352)

# Unpack the two parts used below: the feature columns (X) and the id columns (y)
X, y = online_retail.data.features, online_retail.data.ids

# Initial Analysis

In [None]:
# metadata 
#online_retail.metadata

In [None]:
#Show the variable table without truncating long text cells
pd.options.display.max_colwidth = None
online_retail.variables

In [None]:
#Place the feature columns and the id columns side by side in one frame
online_retail_trans = pd.concat([X, y], axis='columns')

In [None]:
#Preview the first five rows of the combined frame
online_retail_trans.head(n=5)

In [None]:
#Report how many rows and columns the combined frame has
num_rows = online_retail_trans.shape[0]
num_columns = online_retail_trans.shape[1]
print(f"The number of rows in the dataset is {num_rows} and the number of columns is {num_columns}")


# Data Analysis and Cleanup


In [None]:
#Per the dataset metadata, an InvoiceNo that starts with 'C' marks a cancelled order.
#Build that flag once and split the transactions into two independent frames.
_is_cancelled = online_retail_trans['InvoiceNo'].str.startswith('C')

#Regular orders
orders_df = online_retail_trans.loc[~_is_cancelled].copy()
#Cancelled orders
cancelled_orders_df = online_retail_trans.loc[_is_cancelled].copy()

#Report the size of each frame, orders first
print(orders_df.shape, cancelled_orders_df.shape, sep='\n')

In [None]:
orders_df.head(n=5)  # preview the first five order rows

In [None]:
#Descriptive statistics for every column of the orders frame
#(include='all' asks describe() to summarise the non-numeric columns as well)
orders_df.describe(include='all')


# Assessing Null Values / Errors

In [None]:
#The descriptive stats show Quantity and UnitPrice values that are 0 or less.
#Keep a row only when BOTH values are strictly positive: the two tests are joined with `&`
#(with `|` a row passed as soon as either column looked valid) and use `>` so zeros are removed too.
filtered_df = orders_df[(orders_df['Quantity'] > 0) & (orders_df['UnitPrice'] > 0)]

#filtered_df now contains only rows where both Quantity and UnitPrice are greater than 0

In [None]:
#Count the missing values in each column
orders_df.isna().sum()

In [None]:
#Some StockCodes appear in both upper and lower case.
#Record how many distinct codes exist before the case is normalised.
unique_stock_ids_count = orders_df['StockCode'].nunique(dropna=True)
print(f'Number of unique Stock IDs: {unique_stock_ids_count}')

In [None]:
# Upper-case every StockCode so codes that differ only by letter case become the same value
orders_df['StockCode'] = orders_df['StockCode'].str.upper()


In [None]:
#Distinct StockCode count, for comparison with the count taken before upper-casing
count_ids2 = orders_df['StockCode'].nunique(dropna=True)
print(f'Number of unique Stock IDs: {count_ids2}')


In [None]:
#Replace typos and null values in the Description column with the most popular description of the row's StockCode

def _most_common_description(descriptions):
    """Return the most frequent non-null description, or "No Description" when there is none."""
    modes = descriptions.mode()  # computed once; mode() skips nulls by default
    return modes.iat[0] if not modes.empty else "No Description"

# Find the most popular description for each StockCode
most_popular_descriptions = orders_df.groupby('StockCode')['Description'].agg(_most_common_description)

# Create a dictionary to map StockCode to the most popular description
description_mapping = most_popular_descriptions.to_dict()

# Give every row the most popular description of its OWN StockCode.
# A vectorised map replaces the row-by-row apply, which scanned every mapping value for each row
# and also kept a wrong description whenever it happened to be the popular one of another StockCode.
canonical_descriptions = orders_df['StockCode'].map(description_mapping)

# Rows whose StockCode is missing from the mapping (e.g. a null code) keep their current description
orders_df['Description'] = canonical_descriptions.fillna(orders_df['Description'])


In [None]:
#Compare the number of distinct StockCodes with the number of distinct descriptions
unique_stock_ids_count = orders_df['StockCode'].nunique(dropna=True)
unique_descriptions_count = orders_df['Description'].nunique(dropna=True)

print(f'Number of unique Stock IDs: {unique_stock_ids_count}')
print(f'Number of unique descriptions: {unique_descriptions_count}')

In [None]:
# For each Description, collect the distinct StockCodes that carry it
grouped = orders_df.groupby('Description')['StockCode'].unique()

# Keep only the descriptions that are shared by more than one StockCode
duplicates = grouped.loc[grouped.map(len).gt(1)]

# duplicates is indexed by Description; each value is the array of StockCodes sharing it


In [None]:
# Rows whose Quantity is below zero.
# NOTE(review): the variable name says "price" but the filter is on Quantity - confirm which was intended.
negative_price_df = orders_df.loc[orders_df['Quantity'].lt(0)]

In [None]:
# Display the rows selected into negative_price_df
negative_price_df

In [None]:
#All orders with a blank CustomerID have it blank for the entire invoice, so the value cannot be
#recovered from other rows of the same invoice. Remove the rows where CustomerID is null instead.
#subset= limits the drop to that column; a bare dropna() would also discard rows that are null elsewhere.
orders_df_cleaned = orders_df.dropna(subset=['CustomerID'])

In [None]:
# (rows, columns) of the frame after the null rows were dropped
orders_df_cleaned.shape

## Assessing Descriptions for typos

In [None]:
#Count the remaining missing values per column
orders_df_cleaned.isna().sum()

In [None]:
# Step 1: for every StockCode, take the most frequent Description (first mode)
popular_descriptions = (
    orders_df_cleaned
    .groupby('StockCode')['Description']
    .agg(lambda descs: descs.mode()[0])
    .reset_index()
)

# Step 2: attach that description to every row with the same StockCode.
# The joined column arrives as 'Description_most_popular'.
orders_df_filled = pd.merge(
    orders_df_cleaned,
    popular_descriptions,
    on='StockCode',
    suffixes=('', '_most_popular'),
)

# Step 3: overwrite Description with the popular one; pop() also removes the helper column
orders_df_filled['Description'] = orders_df_filled.pop('Description_most_popular')

# Show the result
print("\nDataFrame after filling in descriptions with the most popular ones:")
print(orders_df_filled)


In [None]:
# Distinct StockCodes versus distinct descriptions in the filled frame
unique_stock_ids_count = orders_df_filled['StockCode'].nunique(dropna=True)
unique_descriptions_count = orders_df_filled['Description'].nunique(dropna=True)

print(f'Number of unique Stock IDs: {unique_stock_ids_count}')
print(f'Number of unique descriptions: {unique_descriptions_count}')

In [None]:
# Same two counts on orders_df_filled again (this cell repeats the one before it)
unique_stock_ids_count = orders_df_filled['StockCode'].nunique(dropna=True)
unique_descriptions_count = orders_df_filled['Description'].nunique(dropna=True)

print(f'Number of unique Stock IDs: {unique_stock_ids_count}')
print(f'Number of unique descriptions: {unique_descriptions_count}')

In [None]:
# Display the frame whose descriptions were replaced by the most popular one per StockCode
orders_df_filled

In [None]:
def _has_varied_descriptions(stock_rows):
    """True when the rows of one StockCode carry more than one distinct Description."""
    return stock_rows['Description'].nunique() > 1

# Step 1: keep only the rows of StockCodes that have several distinct descriptions
groups_with_varied_descriptions = orders_df.groupby('StockCode').filter(_has_varied_descriptions)

# Step 2: order by StockCode, then Description, so differing descriptions sit next to each other
sorted_varied_descriptions = groups_with_varied_descriptions.sort_values(by=['StockCode', 'Description'])

# Print a heading followed by the offending rows
print("\nRows with the same 'StockCode' but different 'Descriptions':", sorted_varied_descriptions, sep='\n')

In [None]:
unique_descriptions_count = df_cleaned['Description'].nunique()
print(f'Number of unique descriptions: {unique_descriptions_count}')


In [None]:
#Drop rows with NA
df_cleaned = orders_df.dropna(subset=['CustomerID'].isnull())

In [None]:
# Shapes of the value-filtered frame and the CustomerID-cleaned frame, one per line
print(filtered_df.shape, df_cleaned.shape, sep='\n')

In [None]:
# Ensure 'InvoiceDate' is in datetime format; the .dt accessors used in later cells need a datetime column
df_cleaned['InvoiceDate'] = pd.to_datetime(df_cleaned['InvoiceDate'])

In [None]:
# Confirm the dtype of InvoiceDate after the conversion
print(df_cleaned.dtypes['InvoiceDate'])

In [None]:
# Split the invoice timestamp into a date-only column and a time-only column
_invoice_ts = df_cleaned['InvoiceDate']
df_cleaned['InvoiceDate_only'] = _invoice_ts.dt.date
df_cleaned['InvoiceTime_only'] = _invoice_ts.dt.time

In [None]:
#Turn the date-only column back into a datetime column so the .dt accessor works on it
_invoice_day = pd.to_datetime(df_cleaned['InvoiceDate_only'])
df_cleaned['InvoiceDate_only'] = _invoice_day

# Derive the calendar 'Year' and 'Month' columns from it
df_cleaned['Year'] = _invoice_day.dt.year
df_cleaned['Month'] = _invoice_day.dt.month


In [None]:
# Add the TotalPrice column: unit price multiplied by quantity for each row
df_cleaned['TotalPrice'] = df_cleaned['UnitPrice'].mul(df_cleaned['Quantity'])


In [None]:
# (rows, columns) of df_cleaned after the derived columns were added
df_cleaned.shape


In [None]:
df_cleaned.head(n=5)  # preview the first five rows, including the derived columns

In [None]:
# Columns whose dtype is object or category
categorical_columns = df_cleaned.select_dtypes(include=['object', 'category']).columns

# Value counts for each of those columns, keyed by column name
frequencies = {name: df_cleaned[name].value_counts() for name in categorical_columns}

# Print each column's frequency table
for column, freq in frequencies.items():
    print(f"Frequency for {column}:\n{freq}\n")

# Show Sales by Top 5 Customers

In [None]:
# Total sales per customer, largest first
sales_by_customer = (
    df_cleaned
    .groupby('CustomerID')['TotalPrice']
    .sum()
    .sort_values(ascending=False)
)


In [None]:
# CustomerIDs of the five largest totals (sales_by_customer is already sorted in descending order)
top_5_customers = sales_by_customer.index[:5].tolist()


In [None]:
# Filter the dataset for the top 5 customers.
# .copy() gives an independent frame: the next cell assigns to one of its columns, and doing that
# on a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
top_5_customers_data = df_cleaned[df_cleaned['CustomerID'].isin(top_5_customers)].copy()


In [None]:
#Make sure InvoiceDate_only is a datetime column, as pd.Grouper needs one to bucket by period
top_5_customers_data.loc[:, 'InvoiceDate_only'] = pd.to_datetime(top_5_customers_data['InvoiceDate_only'])

#Sum TotalPrice per customer and per month-end bucket, then flatten the result to plain columns
_month_end = pd.Grouper(key='InvoiceDate_only', freq='ME')
monthly_sales_top_5 = (
    top_5_customers_data
    .groupby(['CustomerID', _month_end])['TotalPrice']
    .sum()
    .reset_index()
)



In [None]:
#Plot Trend lines
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))

# One line per top-5 customer: monthly sales total against the month-end date
for customer in top_5_customers:
    _is_customer = monthly_sales_top_5['CustomerID'].eq(customer)
    customer_data = monthly_sales_top_5.loc[_is_customer]
    plt.plot(
        customer_data['InvoiceDate_only'],
        customer_data['TotalPrice'],
        label=customer,
    )

# Titles, axis labels and legend
plt.title('Sales Trends for Top 5 Customers')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend(title='CustomerID')
plt.xticks(rotation=45)  # slanted date labels are easier to read
plt.show()


# Clean the Description Field

In [None]:
# How many distinct descriptions df_cleaned contains
unique_descriptions_count = df_cleaned['Description'].nunique(dropna=True)
print(f'Number of unique descriptions: {unique_descriptions_count}')


In [None]:
# How many distinct StockCodes df_cleaned contains
unique_stock_ids_count = df_cleaned['StockCode'].nunique(dropna=True)
print(f'Number of unique Stock IDs: {unique_stock_ids_count}')


In [None]:
#The number of unique descriptions and the number of unique stock IDs do not match!



In [None]:
def _description_varies(stock_rows):
    """True when the rows of one StockCode carry more than one distinct Description."""
    return stock_rows['Description'].nunique() > 1

# Steps 1 & 2: keep only the rows of StockCodes that have several distinct descriptions
df_filtered = df_cleaned.groupby('StockCode').filter(_description_varies)

# Step 3: order by StockCode, then Description, so the variants of a code sit together
df_sorted = df_filtered.sort_values(by=['StockCode', 'Description'])

# Show just the two columns under inspection
print(df_sorted.loc[:, ['StockCode', 'Description']])
