# Analysis Phase

In [None]:
import pandas as pd 

# Load the cleaned transactions table that every later cell reads from `df`.
TRANSACTIONS_CSV = '../../cleaned_data/cleaned_transactions.csv'
df = pd.read_csv(TRANSACTIONS_CSV)

## Golden Table calculation

In [9]:
# Golden table: one row per (LIFESTAGE, PREMIUM_CUSTOMER) combination with
# summed sales value, summed quantity, and distinct counts of transaction ids
# and loyalty card numbers.
segment_keys = ['LIFESTAGE', 'PREMIUM_CUSTOMER']
segment_metrics = {
    'TOTAL_VALUE': ('TOT_SALES', 'sum'),
    'TOTAL_VOLUME': ('PROD_QTY', 'sum'),
    'TOTAL_TRANSACTIONS': ('TXN_ID', 'nunique'),
    'TOTAL_CUSTOMERS': ('LYLTY_CARD_NBR', 'nunique'),
}

summary = df.groupby(segment_keys).agg(**segment_metrics).reset_index()

In [10]:
# Rank segments by total sales value (highest first), then derive three ratios:
#   AVG_EXP_PER_TRANSACTION = TOTAL_VALUE / TOTAL_TRANSACTIONS
#   AVG_PRICE_PER_UNIT_SALE = TOTAL_VALUE / TOTAL_VOLUME
#   AVG_PURCHASE_FREQ       = TOTAL_TRANSACTIONS / TOTAL_CUSTOMERS
summary = (
    summary
    .sort_values(by='TOTAL_VALUE', ascending=False)
    .assign(
        AVG_EXP_PER_TRANSACTION=lambda seg: seg['TOTAL_VALUE'] / seg['TOTAL_TRANSACTIONS'],
        AVG_PRICE_PER_UNIT_SALE=lambda seg: seg['TOTAL_VALUE'] / seg['TOTAL_VOLUME'],
        AVG_PURCHASE_FREQ=lambda seg: seg['TOTAL_TRANSACTIONS'] / seg['TOTAL_CUSTOMERS'],
    )
)

summary.head(10)

Unnamed: 0,LIFESTAGE,PREMIUM_CUSTOMER,TOTAL_VALUE,TOTAL_VOLUME,TOTAL_TRANSACTIONS,TOTAL_CUSTOMERS,AVG_EXP_PER_TRANSACTION,AVG_PRICE_PER_UNIT_SALE,AVG_PURCHASE_FREQ
6,OLDER FAMILIES,Budget,156096.75,41558,21173,4606,7.372444,3.756118,4.59683
19,YOUNG SINGLES/COUPLES,Mainstream,147244.2,36095,19411,7908,7.585606,4.079352,2.454603
13,RETIREES,Mainstream,144677.55,37488,19780,6345,7.314335,3.859303,3.117415
15,YOUNG FAMILIES,Budget,129151.15,34264,17517,3951,7.372903,3.769296,4.433561
9,OLDER SINGLES/COUPLES,Budget,127279.8,32670,16970,4839,7.500283,3.895923,3.506923
10,OLDER SINGLES/COUPLES,Mainstream,124089.5,32392,16846,4854,7.366111,3.830869,3.47054
11,OLDER SINGLES/COUPLES,Premium,123141.55,31543,16394,4679,7.511379,3.903926,3.50374
12,RETIREES,Budget,105586.1,26805,14079,4376,7.499545,3.939045,3.217322
7,OLDER FAMILIES,Mainstream,96059.95,25668,13059,2782,7.355843,3.742401,4.694105
14,RETIREES,Premium,91013.25,23157,12121,3808,7.508725,3.930269,3.183036


## Brand Analysis


### Revenue Generation per Segment
> Q: Which brand brings in the most revenue for the retailer in each segment?

In [11]:
# Build a single segment label ("<LIFESTAGE> - <PREMIUM_CUSTOMER>") on df;
# later cells group on this column as well.
lifestage = df['LIFESTAGE']
customer_tier = df['PREMIUM_CUSTOMER']
df['SEGMENT_NAME'] = lifestage + ' - ' + customer_tier

# Total sales value for every (segment, brand) pair.
brand_sales = df.groupby(['SEGMENT_NAME', 'BRAND'], as_index=False)['TOT_SALES'].sum()

brand_sales.head(10)

Unnamed: 0,SEGMENT_NAME,BRAND,TOT_SALES
0,MIDAGE SINGLES/COUPLES - Budget,Burger,193.2
1,MIDAGE SINGLES/COUPLES - Budget,CCs,430.5
2,MIDAGE SINGLES/COUPLES - Budget,Cheetos,337.6
3,MIDAGE SINGLES/COUPLES - Budget,Cheezels,612.3
4,MIDAGE SINGLES/COUPLES - Budget,Cobs,1311.0
5,MIDAGE SINGLES/COUPLES - Budget,Doritos,4291.0
6,MIDAGE SINGLES/COUPLES - Budget,French,153.0
7,MIDAGE SINGLES/COUPLES - Budget,GrnWves,880.7
8,MIDAGE SINGLES/COUPLES - Budget,Infuzions,1957.8
9,MIDAGE SINGLES/COUPLES - Budget,Kettle,6736.2


In [12]:
# Segments singled out for the brand analysis; these are the four segments with
# the highest TOTAL_VALUE in the summary table shown earlier.
high_value_segments = [
    'OLDER FAMILIES - Budget',
    'YOUNG SINGLES/COUPLES - Mainstream',
    'RETIREES - Mainstream',
    'YOUNG FAMILIES - Budget',
]

# Keep only the brand totals that belong to one of those segments.
is_high_value = brand_sales['SEGMENT_NAME'].isin(high_value_segments)
brand_sales_high_value = brand_sales.loc[is_high_value]

brand_sales_high_value

Unnamed: 0,SEGMENT_NAME,BRAND,TOT_SALES
120,OLDER FAMILIES - Budget,Burger,692.3
121,OLDER FAMILIES - Budget,CCs,1852.2
122,OLDER FAMILIES - Budget,Cheetos,1687.4
123,OLDER FAMILIES - Budget,Cheezels,3723.0
124,OLDER FAMILIES - Budget,Cobs,5658.2
...,...,...,...
395,YOUNG SINGLES/COUPLES - Mainstream,Thins,7217.1
396,YOUNG SINGLES/COUPLES - Mainstream,Tostitos,7238.0
397,YOUNG SINGLES/COUPLES - Mainstream,Twisties,7539.8
398,YOUNG SINGLES/COUPLES - Mainstream,Tyrrells,4800.6


In [13]:
# Order rows alphabetically by segment and, within each segment, from the
# highest-revenue brand to the lowest.
brand_sales_high_value = brand_sales_high_value.sort_values(
    by=['SEGMENT_NAME', 'TOT_SALES'],
    ascending=[True, False],
)

brand_sales_high_value

Unnamed: 0,SEGMENT_NAME,BRAND,TOT_SALES
129,OLDER FAMILIES - Budget,Kettle,32058.00
133,OLDER FAMILIES - Budget,Smiths,20150.90
125,OLDER FAMILIES - Budget,Doritos,18555.55
131,OLDER FAMILIES - Budget,Pringles,14300.50
132,OLDER FAMILIES - Budget,RRD,9379.20
...,...,...,...
382,YOUNG SINGLES/COUPLES - Mainstream,Cheetos,898.80
381,YOUNG SINGLES/COUPLES - Mainstream,CCs,850.50
386,YOUNG SINGLES/COUPLES - Mainstream,French,429.00
394,YOUNG SINGLES/COUPLES - Mainstream,Sunbites,391.00


In [26]:
# Keep the highest-revenue brands for each high-value segment.
# groupby().head(n) returns the first n rows of every group in their existing
# order, so this relies on brand_sales_high_value already being sorted by
# TOT_SALES in descending order within each segment.
TOP_N_BRANDS = 3
top_brands_each_segment = brand_sales_high_value.groupby('SEGMENT_NAME').head(TOP_N_BRANDS)

# Backward-compatible alias: the original name says "top 5", but only three
# brands per segment have ever been selected here.
top_5_brands_each_segment = top_brands_each_segment
top_brands_each_segment

Unnamed: 0,SEGMENT_NAME,BRAND,TOT_SALES
129,OLDER FAMILIES - Budget,Kettle,32058.0
133,OLDER FAMILIES - Budget,Smiths,20150.9
125,OLDER FAMILIES - Budget,Doritos,18555.55
269,RETIREES - Mainstream,Kettle,31652.4
265,RETIREES - Mainstream,Doritos,18531.55
273,RETIREES - Mainstream,Smiths,16287.6
309,YOUNG FAMILIES - Budget,Kettle,26369.6
313,YOUNG FAMILIES - Budget,Smiths,16171.8
305,YOUNG FAMILIES - Budget,Doritos,16078.05
389,YOUNG SINGLES/COUPLES - Mainstream,Kettle,35423.6


## Customer Preference
> Q: Which brands does each segment buy the most of?

In [27]:
# Units purchased (sum of PROD_QTY) for every (segment, brand) pair.
brand_volume = df.groupby(['SEGMENT_NAME', 'BRAND'], as_index=False)['PROD_QTY'].sum()

# Restrict to the high-value segments and rank brands by quantity within each
# segment, largest first.
in_high_value = brand_volume['SEGMENT_NAME'].isin(high_value_segments)
brand_volume_high_value = brand_volume.loc[in_high_value].sort_values(
    by=['SEGMENT_NAME', 'PROD_QTY'],
    ascending=[True, False],
)

# The first three rows of each (already sorted) segment are its most-bought brands.
most_bought_brand_each_segment = brand_volume_high_value.groupby('SEGMENT_NAME').head(3)
most_bought_brand_each_segment

Unnamed: 0,SEGMENT_NAME,BRAND,PROD_QTY
129,OLDER FAMILIES - Budget,Kettle,6492
133,OLDER FAMILIES - Budget,Smiths,5422
125,OLDER FAMILIES - Budget,Doritos,3953
269,RETIREES - Mainstream,Kettle,6428
273,RETIREES - Mainstream,Smiths,4233
271,RETIREES - Mainstream,Pringles,3995
309,YOUNG FAMILIES - Budget,Kettle,5328
313,YOUNG FAMILIES - Budget,Smiths,4322
305,YOUNG FAMILIES - Budget,Doritos,3442
389,YOUNG SINGLES/COUPLES - Mainstream,Kettle,7172


## Pack size preferences by segment

In [23]:
# Sales value and quantity for every (segment, pack size) pair.
pack_analysis = df.groupby(['SEGMENT_NAME', 'PACK_SIZE'], as_index=False).agg(
    TOTAL_SALES=('TOT_SALES', 'sum'),
    TOTAL_VOLUME=('PROD_QTY', 'sum'),
)

# Only the high-value segments are of interest here.
pack_pref = pack_analysis.loc[pack_analysis['SEGMENT_NAME'].isin(high_value_segments)]

# Segment ascending, metric descending: the leading rows of each segment are
# then its top pack sizes for that metric.
segment_then_metric_desc = [True, False]

# Pack size preferences by volume
pack_pref_vol = pack_pref.sort_values(
    by=['SEGMENT_NAME', 'TOTAL_VOLUME'],
    ascending=segment_then_metric_desc,
)
top_pack_by_vol = pack_pref_vol.groupby('SEGMENT_NAME').head(3)

# Pack size preferences by total sales
pack_pref_sales = pack_pref.sort_values(
    by=['SEGMENT_NAME', 'TOTAL_SALES'],
    ascending=segment_then_metric_desc,
)
top_pack_by_sales = pack_pref_sales.groupby('SEGMENT_NAME').head(3)

In [24]:
# Display the three pack sizes with the highest TOTAL_SALES in each high-value segment.
top_pack_by_sales

Unnamed: 0,SEGMENT_NAME,PACK_SIZE,TOTAL_SALES,TOTAL_VOLUME
130,OLDER FAMILIES - Budget,175.0,42204.7,11322
126,OLDER FAMILIES - Budget,150.0,24733.9,6680
124,OLDER FAMILIES - Budget,134.0,14300.5,3865
270,RETIREES - Mainstream,175.0,38242.7,9967
266,RETIREES - Mainstream,150.0,23188.9,6006
264,RETIREES - Mainstream,134.0,14781.5,3995
310,YOUNG FAMILIES - Budget,175.0,35634.8,9528
306,YOUNG FAMILIES - Budget,150.0,19942.6,5348
304,YOUNG FAMILIES - Budget,134.0,12065.7,3261
390,YOUNG SINGLES/COUPLES - Mainstream,175.0,37967.9,9237


In [25]:
# Display the three pack sizes with the highest TOTAL_VOLUME in each high-value segment.
top_pack_by_vol

Unnamed: 0,SEGMENT_NAME,PACK_SIZE,TOTAL_SALES,TOTAL_VOLUME
130,OLDER FAMILIES - Budget,175.0,42204.7,11322
126,OLDER FAMILIES - Budget,150.0,24733.9,6680
124,OLDER FAMILIES - Budget,134.0,14300.5,3865
270,RETIREES - Mainstream,175.0,38242.7,9967
266,RETIREES - Mainstream,150.0,23188.9,6006
264,RETIREES - Mainstream,134.0,14781.5,3995
310,YOUNG FAMILIES - Budget,175.0,35634.8,9528
306,YOUNG FAMILIES - Budget,150.0,19942.6,5348
304,YOUNG FAMILIES - Budget,134.0,12065.7,3261
390,YOUNG SINGLES/COUPLES - Mainstream,175.0,37967.9,9237
