**Customer Purchases Data**

Dataset Columns:
* Customer ID
* First Name
* Last Name
* Email Address
* Phone Number
* Product ID
* Purchase Date
* Quantity
* Price per Item
* Total Purchase Value
* Discount Applied (Yes/No)

Features:
* Simulate random customer purchases for a store.
* Calculate total purchase value based on quantity and price per item.
* Apply random discounts to simulate promotional offers.

Manipulation Ideas:
* Visualize total purchases per customer over time.
* Calculate the total revenue and average purchase value.
* Identify frequent shoppers (customers who purchase often).
* Compare sales with and without discounts.

In [144]:
# import required libraries
import pandas as pd
from faker import Faker
import datetime
import numpy as np

In [145]:
# Create the shared Faker instance; the data-generation cell calls it for
# first/last names, e-mail addresses and purchase dates.
fake = Faker()


# List of product categories a simulated purchase can belong to.
# The name is plural because the data-generation cell samples from
# `product_categories` and stores the per-row picks in `product_category`.
product_categories = [
    "Groceries",
    "Clothing & Apparel",
    "Electronics",
    "Home & Kitchen",
    "Sports & Outdoors",
    "Office Supplies",
    "Automotive",
    "Video Games & Consoles",
]


# Pick a random purchase quantity whose range depends on the product category
def get_quantity_by_category(category):
    """Return a random integer quantity for a purchase in ``category``.

    ``np.random.randint`` excludes its upper bound, so the results are
    1-9 for "Groceries", 1-4 for "Clothing & Apparel" and 1-2 for every
    other category.
    """
    # Exclusive upper bounds; categories not listed fall back to 3.
    exclusive_upper = {"Groceries": 10, "Clothing & Apparel": 5}
    return np.random.randint(1, exclusive_upper.get(category, 3))


# Pick a random per-item price whose range depends on the product category
def get_price_by_category(category):
    """Return a random price for one item of ``category``.

    The price is drawn uniformly from the category's (low, high) range and
    rounded to 2 decimal places. Categories that are not listed use the
    default range of 1 to 50.
    """
    price_ranges = {
        "Groceries": (1, 50),
        "Clothing & Apparel": (20, 200),
        "Electronics": (50, 1000),
        "Home & Kitchen": (20, 500),
        "Sports & Outdoors": (20, 300),
        "Office Supplies": (5, 100),
        "Automotive": (30, 1000),
        "Video Games & Consoles": (10, 500),
    }
    # Default price range for any unspecified category
    low, high = price_ranges.get(category, (1, 50))
    return round(np.random.uniform(low, high), 2)


# Randomly decide whether a purchase is discounted and by how much
def apply_discount(quantity, price_per_item, category):
    """Simulate a promotional discount for a single purchase.

    The chance of a discount starts from a per-category value (20% for
    categories not listed) and is raised by 20 percentage points when the
    undiscounted total exceeds 50. When a discount is granted, its
    percentage is drawn uniformly between 5 and 30; otherwise it is 0.

    Returns a dict with the keys ``discount_applied``,
    ``discount_percentage``, ``discount_amount`` and ``final_price``; the
    last two are rounded to 2 decimal places.
    """
    category_odds = {
        "Groceries": 0.1,
        "Electronics": 0.5,
        "Clothing & Apparel": 0.3,
        "Books & Stationery": 0.2
    }

    # Undiscounted value of the purchase
    gross = quantity * price_per_item

    # Larger purchases are more likely to be discounted
    odds = category_odds.get(category, 0.2)
    if gross > 50:
        odds += 0.2

    # The percentage is only drawn when the discount is actually granted
    granted = np.random.random() < odds
    if granted:
        percent = np.random.uniform(5, 30)
    else:
        percent = 0

    reduction = round(gross * (percent / 100), 2)
    return {
        "discount_applied": granted,
        "discount_percentage": percent,
        "discount_amount": reduction,
        "final_price": round(gross - reduction, 2)
    }


In [146]:
# Generate the base data
# Number of simulated purchases (one DataFrame row per purchase).
num_records = 300

customer_id = [f"CT{np.random.randint(0, 9999999):07d}" for _ in range(num_records)]
first_name = [fake.first_name() for _ in range(num_records)]
last_name = [fake.last_name() for _ in range(num_records)]
email_address = [fake.email() for _ in range(num_records)]
# One category per row, sampled from the module-level `product_categories` list.
product_category = [np.random.choice(product_categories) for _ in range(num_records)]
product_id = [f"PR{np.random.randint(0, 999999):06d}" for _ in range(num_records)]
purchase_date = [fake.date_this_century() for _ in range(num_records)]
quantity = [get_quantity_by_category(category) for category in product_category]
price = [get_price_by_category(category) for category in product_category]


# Calculate dependent columns
# Round the total to 2 decimals so it carries no binary floating-point
# artefacts and matches the precision of the discount and final-price columns.
original_total_price = [
    round(qty * unit_price, 2) for qty, unit_price in zip(quantity, price)
]
discount_detail = [
    apply_discount(qty, unit_price, category)
    for qty, unit_price, category in zip(quantity, price, product_category)
]

# Split discount details into two columns: Discount Applied (£) and Final Price After Discount (£)
discount_applied_value = [detail["discount_amount"] for detail in discount_detail]
final_price_after_discount = [detail["final_price"] for detail in discount_detail]

# Create the data dictionary (keys become the DataFrame column names)
data = {
    "Customer ID": customer_id,
    "First Name": first_name,
    "Last Name": last_name,
    "Email Address": email_address,
    "Product ID": product_id,
    "Purchase Date": purchase_date,
    "Product Category": product_category,
    "Quantity": quantity,
    "Price Per Item £": price,
    "Original Total Price £": original_total_price,
    "Discount Applied £": discount_applied_value,
    "Final Price After Discount £": final_price_after_discount,
}


In [147]:
# Convert the data dictionary into a pandas DataFrame (one column per key)
df = pd.DataFrame(data)

In [148]:
# Display the first 10 rows of the DataFrame
print(df.head(10))

  Customer ID First Name Last Name             Email Address Product ID  \
0   CT9757586    William    Bishop        sevans@example.net   PR948511   
1   CT8644551      David  Mccarthy       qfisher@example.com   PR496984   
2   CT7204886   Benjamin      Hill       iortega@example.com   PR144071   
3   CT4814139     Jeremy   Higgins  claytonriley@example.org   PR866499   
4   CT4432476     Joshua    Franco       ehinton@example.net   PR767684   
5   CT7750886      Robin    Valdez      rcabrera@example.net   PR186325   
6   CT7241907     Krista    Turner  bowmansteven@example.com   PR271887   
7   CT0913186     Angela    Molina     brownmark@example.com   PR561951   
8   CT6560555    Valerie    Duncan       kevin02@example.org   PR042093   
9   CT1081806       Mark      King  stevensjames@example.org   PR894072   

  Purchase Date        Product Category  Quantity  Price Per Item £  \
0    2024-10-15             Electronics         1            626.09   
1    2016-09-02              Aut

In [149]:
# Save the DataFrame to a CSV file named 'customer-data.csv' with the index included
df.to_csv("customer-data.csv", index=True)