In [1]:
# Step 2: Import required libraries

import pandas as pd
import numpy as np
import random

# Set random seeds for reproducibility.
# This notebook draws from two independent generators: the stdlib `random`
# module (random.choice for the categorical fields) and NumPy's global
# generator (np.random.normal for the numeric fields). Seeding only one of
# them leaves the other different on every fresh kernel, so seed both.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [2]:
# Step 3: Define parameters
# Candidate values that the dataset generator samples from.

# Store brands
stores = ['SuperValu', 'Spar']

# Locations a record can be assigned to
locations = [
    'Dublin',
    'Cork',
    'Galway',
    'Limerick',
    'Waterford',
    'Belfast',
    'London',
    'Manchester',
    'Liverpool',
    'Birmingham',
]

# Age brackets
age_groups = ['18-25', '26-35', '36-50', '51-65', '65+']

# Income bands
income_groups = ['Low', 'Middle', 'High']

# 2018 through 2025 inclusive (the range's upper bound is exclusive)
years = [*range(2018, 2026)]


In [3]:
# Step 4: Create dataset

# Output column labels, in the same order as the values in each generated row.
columns = [
    'Store', 'Location', 'Year',
    'Sales (€)', 'Profit Margin', 'Profit (€)',
    'Customers', 'Age Group', 'Income Group',
]


def _simulate_row():
    """Draw one synthetic record and return its values ordered like `columns`.

    Categorical fields are picked with `random.choice`; numeric fields are
    drawn from normal distributions via `np.random.normal`. Profit is the
    product of the unrounded sales and margin draws; rounding is applied only
    to the values stored in the row.
    """
    chain = random.choice(stores)
    city = random.choice(locations)
    period = random.choice(years)

    # Sales: normal with mean 500,000 and standard deviation 100,000.
    revenue = np.random.normal(500000, 100000)

    # Margin distribution depends on the store: SuperValu is centred on 2.5%,
    # any other store on 3%.
    margin_mean, margin_sd = (0.025, 0.005) if chain == 'SuperValu' else (0.03, 0.007)
    margin = np.random.normal(margin_mean, margin_sd)

    earnings = revenue * margin

    # Customer count: normal draw truncated towards zero by int().
    footfall = int(np.random.normal(20000, 5000))

    age_bracket = random.choice(age_groups)
    income_band = random.choice(income_groups)

    return [
        chain,
        city,
        period,
        round(revenue, 2),
        round(margin, 4),
        round(earnings, 2),
        footfall,
        age_bracket,
        income_band,
    ]


# 500 independent records.
data = [_simulate_row() for _ in range(500)]

df = pd.DataFrame(data, columns=columns)

df.head()


Unnamed: 0,Store,Location,Year,Sales (€),Profit Margin,Profit (€),Customers,Age Group,Income Group
0,Spar,Birmingham,2025,549671.42,0.029,15958.14,23238,26-35,High
1,SuperValu,Cork,2023,652302.99,0.0238,15543.88,18829,65+,High
2,Spar,Birmingham,2023,657921.28,0.0354,23272.02,17652,65+,High
3,SuperValu,London,2020,554256.0,0.0227,12572.14,17671,51-65,High
4,SuperValu,Belfast,2023,524196.23,0.0154,8090.23,11375,65+,High


In [4]:
# Step 5: Dataset info

# Prints the frame's structure: column names, non-null counts, dtypes and
# memory usage.
df.info()

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; as the cell's last expression, this is the rendered table.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Store          500 non-null    object 
 1   Location       500 non-null    object 
 2   Year           500 non-null    int64  
 3   Sales (€)      500 non-null    float64
 4   Profit Margin  500 non-null    float64
 5   Profit (€)     500 non-null    float64
 6   Customers      500 non-null    int64  
 7   Age Group      500 non-null    object 
 8   Income Group   500 non-null    object 
dtypes: float64(3), int64(2), object(4)
memory usage: 35.3+ KB


Unnamed: 0,Year,Sales (€),Profit Margin,Profit (€),Customers
count,500.0,500.0,500.0,500.0,500.0
mean,2021.478,511458.18738,0.027257,13918.65262,20301.786
std,2.300457,93804.788683,0.00635,4138.404122,5141.351548
min,2018.0,210374.46,0.0073,3612.04,5757.0
25%,2019.0,444703.2875,0.022675,10972.005,17029.25
50%,2021.0,512739.885,0.02665,13221.75,20345.5
75%,2023.0,566664.245,0.0315,16317.74,23543.0
max,2025.0,757970.93,0.0482,30063.82,39263.0


In [5]:
# Step 6: Add new columns

# Each derived column is a monetary column divided by the customer count.
# Maps new column name -> source column, in the order the columns are added.
per_customer_sources = {
    'Revenue per Customer': 'Sales (€)',
    'Profit per Customer': 'Profit (€)',
}

for new_column, source_column in per_customer_sources.items():
    df[new_column] = df[source_column] / df['Customers']

df.head()


Unnamed: 0,Store,Location,Year,Sales (€),Profit Margin,Profit (€),Customers,Age Group,Income Group,Revenue per Customer,Profit per Customer
0,Spar,Birmingham,2025,549671.42,0.029,15958.14,23238,26-35,High,23.65399,0.686726
1,SuperValu,Cork,2023,652302.99,0.0238,15543.88,18829,65+,High,34.643528,0.825529
2,Spar,Birmingham,2023,657921.28,0.0354,23272.02,17652,65+,High,37.27177,1.318379
3,SuperValu,London,2020,554256.0,0.0227,12572.14,17671,51-65,High,31.365288,0.711456
4,SuperValu,Belfast,2023,524196.23,0.0154,8090.23,11375,65+,High,46.083185,0.711229


In [6]:
# Step 7: Group by store

# Unweighted mean of the per-row 'Profit Margin' values within each store
# (not total profit divided by total sales).
by_store = df.groupby('Store')
profit_by_store = by_store['Profit Margin'].mean()

profit_by_store


Store
Spar         0.029548
SuperValu    0.025038
Name: Profit Margin, dtype: float64

In [7]:
# Step 8: Average sales by age group and by income group

# The same aggregation (mean of 'Sales (€)') is applied per segment column.
sales_by_age, sales_by_income = (
    df.groupby(segment)['Sales (€)'].mean()
    for segment in ('Age Group', 'Income Group')
)

sales_by_age, sales_by_income


(Age Group
 18-25    512884.664340
 26-35    508774.723861
 36-50    509127.653152
 51-65    518535.894021
 65+      508070.637981
 Name: Sales (€), dtype: float64,
 Income Group
 High      517155.565724
 Low       511726.220559
 Middle    506050.024970
 Name: Sales (€), dtype: float64)

In [8]:
# Step 9: Save dataset

# Relative path, so the CSV is written to the current working directory.
# index=False keeps the row index out of the file.
output_file = 'supervalu_spar_sales_dataset.csv'
df.to_csv(output_file, index=False)

print("Dataset saved successfully")


Dataset saved successfully
