In [None]:
pip install pandasql missingno pydantic-settings pandas_profiling plotly

In [None]:
pip install --upgrade pandas-profiling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandasql as ps
import missingno as msno
from pyspark.sql import SparkSession
# from pandas_profiling import ProfileReport
import plotly.graph_objects as go
# from pydantic_settings import BaseSettings

# Plot styling: seaborn defaults, then the 'deep' palette and the 'notebook' context.
sns.set()
sns.set_palette('deep')
sns.set_context('notebook')
# NOTE(review): this runs before any figure has been created, so pyplot has to create an
# implicit figure to adjust — presumably it has no effect on figures created later; confirm
# whether the call is still needed.
plt.subplots_adjust(left=0, right=2, top=0.9, bottom=0.1)  # Adjust the values as needed

# Module-level Spark session (local master, app name 'online_retail') used by sql() and
# pre_processing() below.
SPARK = SparkSession.builder.appName('online_retail').master("local").getOrCreate()


def sql(df: pd.DataFrame, query: str, table_name = "online_retail", create_temp_view = False):
    """
    Run a Spark SQL query through the module-level SPARK session.

    Parameters:
        df: pandas frame to expose to Spark; only read when create_temp_view is True.
        query: SQL text handed to SPARK.sql.
        table_name: name of the temp view created from df.
        create_temp_view: when True, df is converted to a Spark DataFrame and
            registered (or re-registered) as table_name before the query runs.

    Returns:
        The Spark DataFrame produced by SPARK.sql(query).
    """
    # Nothing to register: run the query against whatever views already exist.
    if not create_temp_view:
        return SPARK.sql(query)

    SPARK.createDataFrame(df).createOrReplaceTempView(table_name)
    return SPARK.sql(query)

def load_data(path = "../datasets/online_retail/Online_Retail_Data_Set.csv"):
    """
    Read the Online Retail CSV into a pandas DataFrame.

    Parameters:
        path: location of the CSV file; defaults to the dataset path used by this notebook.

    Returns:
        The DataFrame produced by pd.read_csv, decoded as ISO-8859-1.
    """
    return pd.read_csv(path, encoding="ISO-8859-1")


def pre_processing(df):
    """
    Clean the raw Online Retail frame and return the aggregated transactions.

    Steps:
        1. Parse InvoiceDate as day-first datetimes.
        2. Keep only rows with strictly positive Quantity and UnitPrice.
        3. Replace every NaN in the frame with 0.
        4. Drop exact duplicate rows.
        5. In Spark SQL: sum the quantity of rows that agree on every other column,
           add a Profit column (quantity * UnitPrice, rounded to 2 decimals) and keep
           only rows whose InvoiceDate is on or after 2011-01-01.

    Parameters:
        df: pandas DataFrame as returned by load_data(). It is not modified.

    Returns:
        A new pandas DataFrame with the columns InvoiceNo, StockCode, description,
        quantity, InvoiceDate (datetime), UnitPrice, CustomerID, Country and Profit.
    """
    # Work on a copy so the caller's frame keeps its original InvoiceDate column.
    df = df.copy()

    # The file stores dates day-first ("01-12-2010 08:26" is 1 December 2010).
    # format='mixed' infers the format of each value on its own, so without
    # dayfirst=True ambiguous values such as 01-12-2010 are read as 12 January.
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='mixed', dayfirst=True)

    # Remove transactions with non-positive quantities or non-positive prices
    df = df[(df.Quantity > 0) & (df.UnitPrice > 0)]

    # Replace missing values with 0. This applies to every column, not only CustomerID;
    # a CustomerID of 0 is what the later queries treat as "unknown customer".
    df = df.replace(np.nan, 0)

    # Remove exact duplicates (the first occurrence is kept)
    df = df.drop_duplicates()

    # Using Spark to:
    #   1. Aggregate duplicated transactions with different quantities
    #   2. Add a profit column
    #   3. Remove transactions before 2011
    # The dates are handed over as strings; assign() builds a new frame instead of
    # writing into the filtered one, which avoids SettingWithCopyWarning.
    df = df.assign(InvoiceDate=df["InvoiceDate"].astype(str))
    spark_df = SPARK.createDataFrame(df)
    spark_df.createOrReplaceTempView("spark_df")

    # InvoiceDate is a string at this point, so the date filter below is a string comparison.
    query = """
    select 
        aaa.*,
        round(aaa.quantity * aaa.UnitPrice, 2) as Profit
    from
    (
        select 
            InvoiceNo,
            StockCode,
            description,
            sum(quantity) as quantity,
            InvoiceDate,
            UnitPrice,
            CustomerID,
            Country
        from spark_df
        group by
            InvoiceNo,
            StockCode,
            CustomerID,
            description,
            InvoiceDate,
            UnitPrice,
            Country
    ) as aaa
    where
        aaa.InvoiceDate >= '2011-01-01'
    """
    spark_df = SPARK.sql(query)
    df = spark_df.toPandas()
    # Back to real datetimes for the pandas side of the notebook.
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

    return df


if __name__ == "__main__":
    #df = load_data()
    #print(df.columns)
    #clean_df = pre_processing(df)
    pass

# Data Quality Check and Cleaning

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


df = pd.read_csv("../datasets/online_retail/Online_Retail_Data_Set.csv", encoding = "ISO-8859-1") 
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


In [None]:
df.info()

In [None]:
df["Country"].describe()

In [4]:
df[["Quantity", "UnitPrice", "CustomerID"]].describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


- Notes:

- The quantity column is problematic. I need to remove transactions with Negative values
- Remove rows with negative unit price


## Data Types

In [None]:
df.dtypes

Notes:
- Change InvoiceDate to datetime

In [None]:
# The raw dates are day-first ("01-12-2010 08:26" is 1 December 2010). format='mixed'
# infers each value separately, so dayfirst=True is needed to stop ambiguous values
# from being read month-first.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='mixed', dayfirst=True)

In [None]:
df.dtypes

## Quantity and UnitPrice columns

In [None]:
negative_quantity = df[df.Quantity <= 0]
negative_quantity

In [None]:
negative_quantity.Description.value_counts()

- I don't see a reason why these transactions would be legitimate, so I will remove them from the dataset.

In [None]:
df = df[df.Quantity > 0]

- Now lets check unitPrice

- Same logic, I will drop transactions with unit price lower than 0

In [None]:
df[df.UnitPrice <= 0]

In [None]:
df = df[df.UnitPrice > 0]

In [None]:
df[df.UnitPrice <= 0]

## Missing Values

In [None]:
df = load_data()
"""df = pre_processing(df)"""

In [None]:
df.isna().sum()

- Looks like there are 132186 rows with a missing customer id

In [None]:
df

In [None]:
df.describe()

- This is about 1 out of every 5 rows.
- I will try not to delete them

In [None]:
df[df.CustomerID.isna()].head()

- They look like legit transactions.
- I will not remove these transactions. This is too much.
- Instead I will replace the null customerIds with a 0. This will indicate that they are missing

In [None]:
df = df.replace(np.nan, 0)

## Duplicates

In [None]:
len(df)

In [None]:
len(df[df.duplicated()])

- It looks like, out of 541909 rows, there are 5268 duplicates. Let's dive deeper.

In [None]:
df[(df.InvoiceNo == '536409') & (df.StockCode == '22111')]

- Looks like the duplicates are essentially a copy of the same transaction. We can drop them.

In [None]:
len(df[df.Quantity > 1]) # 383058

In [None]:
df = df[~df.duplicated()]

- Now let's check if there are duplicates in the subset: [InvoiceNo, StockCode and CustomerID]

In [None]:
column_names = ["InvoiceNo", "StockCode" , "CustomerID", "InvoiceDate"]
df[df.duplicated(subset=column_names)].head()

In [None]:
df[(df.InvoiceNo == '536381') & (df.StockCode == '71270') & (df.CustomerID == 15311)]

- So this is weird. There are two otherwise identical transactions, made at the same time by the same customer, but with different quantities.

- This is actually problematic. I have no way of knowing if these are legit transactions. I need to make some assumptions about the data.

Assumptions:
- When a customer buys several products in a single purchase, each product is registered as its own transaction (row).
- When a customer buys a batch of the same product (like 4 PHOTO CLIP LINE in the same transaction), the batch might be split across several rows.

- Solution: Aggregate the quantities of these transactions

In [None]:
from pyspark.sql import SparkSession

# Reload the raw CSV and run the cleaning pipeline defined at the top of the notebook.
df = load_data()
df = pre_processing(df)
# Hand the timestamps to Spark as strings; a later cell converts them back with
# pd.to_datetime after toPandas().
df["InvoiceDate"] = df["InvoiceDate"].astype(str)

# NOTE(review): same builder settings as the module-level SPARK session; presumably
# getOrCreate() returns that existing session rather than a new one — confirm.
spark = SparkSession.builder.appName('online_retail').master("local").getOrCreate()

spark_df = spark.createDataFrame(df)

# Register the frame under the name used by the aggregation query in the next cell.
spark_df.createOrReplaceTempView("spark_df")

In [None]:
query = """
select 
    InvoiceNo,
    StockCode,
    description,
    sum(quantity) as quantity,
    InvoiceDate,
    UnitPrice,
    CustomerID,
    Country
from spark_df
group by
    InvoiceNo,
    StockCode,
    CustomerID,
    description,
    InvoiceDate,
    UnitPrice,
    Country
"""

spark_df = spark.sql(query)
spark_df.show(5)




In [None]:
df = spark_df.toPandas()
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

In [None]:
df.head()

- Let's check if there are duplicate transactions again

In [None]:
column_names = ["InvoiceNo", "StockCode" , "CustomerID", "InvoiceDate", "Country", "UnitPrice", "description"]
df[df.duplicated(subset=column_names)]

## Checking for other issues

In [None]:
df = load_data()
df = pre_processing(df)

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.isnull().sum()

# EDA

In [None]:
df = load_data()
df = pre_processing(df)

In [None]:
df.head()

- Let's start with an overall report of the data

Notes:

- There are 3922 unique products but 4026 descriptions, meaning some products share a StockCode but have different descriptions. I will leave this as it is: the differences between the descriptions come from commas, spaces, etc., and are not important.

- About 98% of the transactions occurred in the second half of the range of InvoiceDate values

- About 80% of the transactions occurred in the UK

In [None]:
## Disabled for now because it is not working (translated from the original note — confirm wording)
#profile = ProfileReport(df, title="Data Report")
#profile

In [None]:
## 

# Creating a Dashboard

Let's export the data to a CSV file and upload it to Tableau

- I will add a column called profits

In [None]:
df = load_data()
df = pre_processing(df)
df.to_csv("clean_data.csv")

In [None]:
df

# Answering Business Questions

    Sales Analysis:
        What is the total revenue for each invoice?
        What is the total revenue for each customer?
        Which products have the highest total sales revenue?
        What is the overall sales trend over time?

    Product Analysis:
        Which products are the best-sellers in terms of quantity sold?
        What is the average unit price for each product?
        What is the distribution of products across different categories?
        Are there any products that are frequently bought together?

    Customer Analysis:
        Who are the top customers based on total spending?
        What is the distribution of customers by country?
        What is the average order quantity per customer?
        How often do customers make repeat purchases?

    Time Analysis:
        What are the busiest months in terms of sales volume?
        Is there a pattern in sales based on days of the week?
        How does sales performance vary by different times of the day?

    Geographical Analysis:
        Which countries contribute the most to sales revenue?
        Are there any regional preferences in product purchases?
        How does customer behavior differ across countries?

    Inventory Analysis:
        What is the stock movement for each product (quantity bought vs. quantity sold)?
        Are there any products with consistently low inventory turnover?
        How does the inventory turnover differ for different product categories?

    Price Analysis:
        What is the average unit price for products in different categories?
        Are there any pricing strategies that lead to increased sales?
        How does the price affect the quantity of products sold?

    Customer Segmentation:
        Can customers be grouped into segments based on their purchasing behavior?
        What are the characteristics of high-value customers?
        Are there any trends or patterns specific to different customer segments?

    Invoice Patterns:
        What is the average number of products per invoice?
        How frequently do customers make repeat purchases within a short time frame?

    Returns and Refunds:
        What is the frequency of product returns?
        Are there any specific products that are more prone to returns?
        How do returns impact overall revenue?

In [None]:
SPARK.stop()

In [None]:
SPARK = SparkSession.builder.appName('online_retail').master("local").getOrCreate()

In [None]:
# Reload and clean the data, then register it as the "online_retail" temp view
# that the SQL queries in the following cells read from.
df = load_data()
df = pre_processing(df)

spark_df = SPARK.createDataFrame(df)

spark_df.createOrReplaceTempView("online_retail")

In [None]:
query = """
select *
from online_retail
limit 5
"""

results = SPARK.sql(query)
results.show()

##     Sales Analysis:

        What is the total revenue for each invoice?
        What is the total revenue for each customer?
        Which products have the highest total sales revenue?
        What is the overall sales trend over time?


### What is the total revenue for each invoice?

In [None]:
query = """
select 
    InvoiceNo,
    round(sum(Profit), 2) as InvoiceProfit,
    count(*) as num_of_transactions
from online_retail
group by
    InvoiceNo
order by sum(Profit) desc
"""

invoice_revenues = SPARK.sql(query).toPandas()

invoice_revenues.head()

In [None]:
# invoice_revenues comes back sorted by revenue (descending), so the first ten rows
# are the ten highest-revenue invoices.
top10_invoices = invoice_revenues.head(10)

# hue mirrors x (legend off) so the palette is applied per bar without relying on the
# deprecated "palette without hue" behaviour; same convention as the monthly plots.
sns.barplot(x="InvoiceNo", y="InvoiceProfit", hue="InvoiceNo", data=top10_invoices, palette="Blues_d", legend=False)

plt.title("Top 10 Invoices by Revenue")
plt.xlabel("Invoice Number")
plt.ylabel("Revenue")

plt.xticks(rotation=45)

plt.show()

### What is the total revenue for each customer?

In [None]:
query = """
with cte_test as
(
    select 
        cast(case
            when CustomerID = 0 then null else CustomerID
        end as int) as CustomerID,
        round(sum(Profit), 2) as CustomerProfit,
        count(*) as num_of_transactions
    from online_retail
    group by
        CustomerID
    order by sum(Profit) desc
)
select *
from cte_test
where
    CustomerID is not null;
"""

customer_revenues = SPARK.sql(query).toPandas()

customer_revenues.head()

In [None]:
# Bar plot sorted by profit.
# Sort explicitly instead of trusting the row order coming back from Spark, and turn the
# numeric CustomerID into a label: seaborn orders numeric categories by value, which would
# arrange the bars by ID rather than by profit.
top10_by_revenue = (
    customer_revenues
    .sort_values("CustomerProfit", ascending=False)
    .head(10)
    .assign(CustomerID=lambda frame: frame["CustomerID"].astype(str))
)

# hue mirrors x (legend off) so the palette is applied per bar without the deprecated
# "palette without hue" call form.
sns.barplot(x="CustomerID", y="CustomerProfit", hue="CustomerID", data=top10_by_revenue, palette="Blues_d", legend=False)

plt.title("Top 10 Customers by Revenue")

plt.xlabel("Customer ID")

plt.ylabel("Revenue")

plt.xticks(rotation = 45)
plt.show()

### Revenue Trend

In [None]:
# Build the cumulative-revenue frame: one row per transaction, ordered by calendar day.
new_df = (
    df
    .assign(InvoiceDate=df["InvoiceDate"].dt.date)  # drop the time of day
    .sort_values(by="InvoiceDate")
)

# Running total of Profit in date order, stored as float64.
new_df["Running_Sum"] = new_df["Profit"].cumsum().astype("float64")

# Keep only the columns the trend plot needs.
new_df = new_df[['InvoiceDate', 'Profit', 'Running_Sum']]

In [None]:
new_df.head()

In [None]:
# line plot
fig = go.Figure()

fig.add_trace(go.Scatter(x=new_df.InvoiceDate, y=new_df.Running_Sum,
                         
                            mode='lines',
                            name='lines'))

fig.update_layout(title='Cumulative Revenue',
                     xaxis_title='Date',
                        yaxis_title='Revenue')

fig.show()

## Product Analysis:
        Which products are the best-sellers in terms of quantity sold?
        What is the average unit price for each product?
        What is the distribution of products across different categories?
        Are there any products that are frequently bought together?

### Product Quantities

In [None]:
query = """
select 
    StockCode,
    description,
    count(StockCode) as products_sold
from online_retail
group by 
    StockCode,
    description
order by count(StockCode) desc
limit 10
"""

top10_most_sold_products = SPARK.sql(query)

top10_most_sold_products.show()

In [None]:
top10_most_sold_products = top10_most_sold_products.toPandas()

In [None]:

# Horizontal bars: one per StockCode. hue mirrors the categorical axis (legend off) so the
# palette is applied per bar without the deprecated "palette without hue" call form.
sns.barplot(data = top10_most_sold_products, x = "products_sold", y = "StockCode", hue = "StockCode", palette ='Blues_r', legend = False)

plt.title("Top 10 Products by Count")
plt.show()

### Average unit of price

In [None]:
# Mean UnitPrice over every row of the cleaned frame (a single overall figure, not one per product).
print(f"Average Price of a Unit: {round(np.average(df.UnitPrice), 2)}")

### Products by Category

In [None]:
descriptions = df.description
descriptions = descriptions.str.split().to_list()

In [None]:
# Keyword lists used to assign a rough category to each product description.
# The categorisation cell compares these entries against single lower-cased words, so
# multi-word entries (e.g. "tank top", "ice cream") can never match there; they are kept
# for reference.

# clothing types
clothing_types = [
    "t-shirt",
    "jeans",
    "dress",
    "sweater",
    "shorts",
    "skirt",
    "jacket",
    "hoodie",
    "blouse",
    "suit",
    "pants",
    "coat",
    "shirt",
    "tank top",
    "leggings",
    "sweatshirt",
    "trousers",
    "blazer",
    "polo shirt",
    "jumpsuit",
]

# electronic types
electronic_types = [
    "smartphone",
    "laptop",
    "tablet",
    "television",
    "headphones",
    "smartwatch",
    "camera",
    "speaker",
    "gaming console",
    "router",
    "monitor",
    "keyboard",
    "mouse",
    "printer",
    "earbuds",
    "fitness tracker",
    "drone",
    "projector",
    "external hard drive",
    "bluetooth earphones",
]

# food types ("sushi" was listed twice; the duplicate entry is removed)
food_types = [
    "pizza",
    "burger",
    "pasta",
    "sushi",
    "salad",
    "taco",
    "sandwich",
    "ramen",
    "steak",
    "ice cream",
    "chocolate",
    "cake",
    "coffee",
    "smoothie",
    "doughnut",
    "burrito",
    "pancake",
    "croissant",
    "noodles",
    "candy",
]

In [None]:
# Map every description (as a tuple of its words) to a category.
# The first word of a description that appears in one of the keyword lists decides the
# category; a description with no matching word, or with no words at all, is
# "Unidentified".

# word -> category lookup. Built in reverse priority order so that, should a word ever
# appear in several lists, clothing wins over electronics, which wins over food.
category_by_word = {}
for label, vocabulary in (
    ("Food", food_types),
    ("Electronics", electronic_types),
    ("Clothes", clothing_types),
):
    for term in vocabulary:
        category_by_word[term] = label

descriptions_categories = {}

for row in descriptions:
    key = tuple(row)
    # The same description occurs on many rows; classify each distinct one only once.
    if key in descriptions_categories:
        continue

    descriptions_categories[key] = next(
        (
            category_by_word[word.lower()]
            for word in row
            if word.lower() in category_by_word
        ),
        "Unidentified",  # also covers an empty word list, which previously got no entry
    )

In [None]:
new_df = df[["StockCode", "description"]]
new_df.head()

In [None]:
# new_df is a column subset of df; take an explicit copy so adding a column does not
# trigger SettingWithCopyWarning.
new_df = new_df.copy()

# Look each description up in the mapping built above. Iterating over the column directly
# avoids the much slower row-by-row iterrows(), and .get() falls back to "Unidentified"
# for a description that has no entry in the mapping instead of raising KeyError.
categories = [
    descriptions_categories.get(tuple(text.split()), "Unidentified")
    for text in new_df["description"]
]

new_df["Category"] = categories

new_df.head()

In [None]:
new_df = SPARK.createDataFrame(new_df)

In [None]:
new_df.createOrReplaceTempView("categories")

In [None]:
query = """
select 
    category,
    count(*) as num_of_products
from categories
where
    category != 'Unidentified'
group by
    category
order by 
    count(*) desc
"""

new_df = SPARK.sql(query)

new_df.show()

##     Customer Analysis:
        Who are the top customers based on total spending?
        What is the distribution of customers by country?
        What is the average order quantity per customer?
        How often do customers make repeat purchases?

In [None]:
df = load_data()
df = pre_processing(df)

spark_df = SPARK.createDataFrame(df).repartition(5)

In [None]:
spark_df = SPARK.createDataFrame(df)

spark_df.createOrReplaceTempView("online_retail")

### Who are the top customers based on total spending?

In [None]:
query = """
select 
    CustomerID,    
    round(sum(Profit), 2) as Total_Profit
from online_retail
where
    CustomerID != 0
group by
    CustomerID
order by
    round(sum(Profit), 2) desc
limit 10
"""

top10_customers = SPARK.sql(query).toPandas()

top10_customers.head()

In [None]:
# Order the bars by profit and turn CustomerID into a text label: as a number, seaborn
# would order the categories by ID (hence the explicit `order` that was needed before) and
# would print float IDs such as "14646.0".
top10_customers_plot = (
    top10_customers
    .sort_values("Total_Profit", ascending = False)
    .assign(CustomerID = lambda frame: frame["CustomerID"].astype(int).astype(str))
)

# hue mirrors x (legend off) so the palette is applied per bar without the deprecated
# "palette without hue" call form.
sns.barplot(data = top10_customers_plot, x = "CustomerID", y = "Total_Profit", hue = "CustomerID", palette ='Blues_r', legend = False)

plt.title("Top 10 Customers by Profit")
plt.ylabel("Profit")
plt.xticks(rotation = 45)

plt.show()

### What is the distribution of customers by country?


In [None]:
query = """
select 
    country,    
    count(country) as country_count
from online_retail
group by
    country
order by
    count(country) desc
limit 10
"""

top10_countries = SPARK.sql(query).toPandas()

top10_countries.head()

In [None]:
# One horizontal bar per country. hue mirrors the categorical axis (legend off) so the
# palette is applied per bar without the deprecated "palette without hue" call form.
sns.catplot(data = top10_countries, x = "country_count", y = "country", hue = "country", palette ='Blues_r', kind = "bar", legend = False)

plt.title("Top 10 Countries by Count")
plt.ylabel("Country")
plt.xticks(rotation = 45)

plt.subplots_adjust(left = 0, right = 1)

plt.show()


### What is the average order quantity per customer?

In [None]:
print(f"Average Quantity per Order: {round(np.average(df.quantity), 2)}")

### How often do customers make repeat purchases?

In [None]:
df = load_data()
df = pre_processing(df)

spark_df = SPARK.createDataFrame(df)

spark_df.createOrReplaceTempView("online_retail")

In [None]:
# Average gap between consecutive purchases, per customer. Reading the query inside-out:
#   1. innermost select: the distinct (CustomerID, InvoiceDate) pairs of known customers
#      (CustomerID != 0);
#   2. lag(...) attaches each customer's previous InvoiceDate to every row (null for the
#      customer's first one);
#   3. date_diff(InvoiceDate, lag1) is the gap to that previous purchase
#      (presumably in days — confirm against the Spark version in use);
#   4. the CTE averages the gaps per customer and counts that customer's rows;
#   5. the final select keeps customers whose average gap is greater than 0, smallest first.
query = """
with cte_temp as
(

    select 
        CustomerID,
        round(avg(diff), 2) as avg_days_between_purchase,
        count(CustomerID) as transaction_amount
    from
    (
        select 
            CustomerID,
            date_diff(InvoiceDate, lag1) as diff
        from
        (
            select 
                cast(CustomerID as integer) as CustomerID,
                InvoiceDate,
                lag(InvoiceDate, 1) over (partition by CustomerID order by InvoiceDate) as lag1
            from
            (
                select 
                    CustomerID,
                    InvoiceDate
                from online_retail
                where
                    CustomerID != 0
                group by
                    CustomerID,
                    InvoiceDate
                order by
                    CustomerID desc, InvoiceDate desc
            )
        ) 
    ) 
    group by CustomerID
)

select *
from cte_temp
where avg_days_between_purchase > 0
order by avg_days_between_purchase
"""

most_frequent_customers = SPARK.sql(query)

most_frequent_customers.show()


In [None]:
most_frequent_customers = most_frequent_customers.toPandas()

In [None]:
most_frequent_customers = most_frequent_customers[(most_frequent_customers["avg_days_between_purchase"] < 50) & (most_frequent_customers["transaction_amount"] >= 5)].sort_values("CustomerID")

sns.relplot(data = most_frequent_customers, 
            y = "avg_days_between_purchase", 
            x = "transaction_amount", palette ='Reds', 
            kind = "scatter", hue = "transaction_amount", size = "transaction_amount", sizes = (10, 250), alpha = 0.8)

# Annotate most frequent customers
plt.annotate("Dead Zone", xy=[10,50])

plt.annotate("Whales", xy=[125,5])

plt.title("Most Frequent Customers")

plt.xlabel("Number of Transactions")

plt.ylabel("Average Days Between Purchase")

plt.show()

In [None]:
print(f"The average number of days it takes a customer to buy another product is: {round(np.average(most_frequent_customers.avg_days_between_purchase), 2)}")

##    Time Analysis:
        What are the busiest months in terms of sales volume?
        Is there a pattern in sales based on days of the week?
        How does sales performance vary by different times of the day?

In [None]:
df = load_data()
df = pre_processing(df)

spark_df = SPARK.createDataFrame(df)
spark_df.createOrReplaceTempView("online_retail")

### What are the busiest months in terms of sales volume?

In [None]:
query = """
select 
    month(InvoiceDate) as month,
    round(sum(profit), 2) as Total_Profit,
    count(*) as num_of_transactions
from online_retail
group by
    month(InvoiceDate)
order by
    month(InvoiceDate)
"""

profits_per_month = SPARK.sql(query)

profits_per_month.show()

In [None]:
profits_per_month = profits_per_month.toPandas()

In [None]:
profits_per_month.to_csv("profits_per_month.csv")

In [None]:
# Each metric gets its own axes. Drawing both barplots on one axes with two different
# `order` arguments left the first set of bars under the tick labels of the second, and
# forced profit and transaction counts onto a single y-scale.
fig, (ax_profit, ax_transactions) = plt.subplots(2, 1, figsize=(10, 8))

# Months ranked by total profit, busiest first.
sns.barplot(data = profits_per_month, x = "month", y = "Total_Profit", hue="month", palette="Blues_r", order = profits_per_month.sort_values("Total_Profit", ascending = False).month, legend=False, ax=ax_profit)
# Months ranked by number of transactions, busiest first.
sns.barplot(data = profits_per_month, x = "month", y = "num_of_transactions", hue="month", palette="Reds", order = profits_per_month.sort_values("num_of_transactions", ascending = False).month, legend=False, ax=ax_transactions)

ax_profit.set_title("Total Profit + num of transactions per Month")
ax_profit.set_xlabel("Month")
ax_profit.set_ylabel("Total Profit")

ax_transactions.set_xlabel("Month")
ax_transactions.set_ylabel("Number of Transactions")

fig.tight_layout()

plt.show()

### Is there a pattern in sales based on days of the week?

In [None]:
query = """
select 
    cast(InvoiceDate as date) as InvoiceDate,
    round(sum(profit), 2) as daily_profit
from online_retail
group by
    cast(InvoiceDate as date)
order by
    InvoiceDate
"""

profits_per_day = SPARK.sql(query)

profits_per_day.show()

In [None]:
profits_per_day = profits_per_day.toPandas()

In [None]:
profits_per_day.to_csv("profits_per_day.csv")

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=profits_per_day.InvoiceDate, y=profits_per_day.daily_profit,
                         
                            mode='lines',
                            name='lines'))

fig.update_layout(title='Daily Revenue',

                        xaxis_title='Date',
                        yaxis_title='Revenue')

fig.show()

In [None]:
query = """
select 
    cast(InvoiceDate as date) as InvoiceDate,
    day(cast(InvoiceDate as date)) as day,
    month(cast(InvoiceDate as date)) as month,
    round(sum(profit), 2) as daily_profit,
    count(*) as num_of_transactions
from online_retail
group by
    cast(InvoiceDate as date),
    day(cast(InvoiceDate as date)),
    month(cast(InvoiceDate as date))
order by
    InvoiceDate
"""

revenue_density = SPARK.sql(query)

revenue_density.show()

In [None]:
revenue_density = revenue_density.toPandas()

In [None]:
revenue_density.to_csv("revenue_density.csv")

In [None]:
sns.kdeplot(data = revenue_density, x = "daily_profit", fill = True, alpha = 0.5)

plt.title("Daily Revenue Distribution")

plt.xlabel("Daily Revenue")

plt.ylabel("Density")

plt.show()

In [None]:
sns.kdeplot(data = revenue_density, x = "num_of_transactions", fill = True, alpha = 0.5)

plt.title("Daily Transactions Distribution")

plt.xlabel("Daily Transactions")

plt.ylabel("Density")

plt.show()