In [1]:
import pandas as pd
import numpy as np
import re
import os
import sys
import csv
import json
from datetime import datetime
from pathlib import Path

df = pd.read_excel('rawdata_cope.xlsx', sheet_name='cope')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 59 columns):
 #   Column                                                                                                           Non-Null Count  Dtype  
---  ------                                                                                                           --------------  -----  
 0   Response ID                                                                                                      307 non-null    int64  
 1   Gender                                                                                                           307 non-null    int64  
 2   Age                                                                                                              307 non-null    int64  
 3   Age_range                                                                                                        307 non-null    int64  
 4   Ethnicity                             

In [2]:
df.columns = df.columns.str.strip()
df.columns

Index(['Response ID', 'Gender', 'Age', 'Age_range', 'Ethnicity', 'State',
       'Avg_monthly_income',
       'Anchor Smooth:Please select all the beer/stout brands that you are aware of.',
       'Apple Fox:Please select all the beer/stout brands that you are aware of.',
       'Asahi:Please select all the beer/stout brands that you are aware of.',
       'Budweiser:Please select all the beer/stout brands that you are aware of.',
       'Carlsberg Danish Pilsner:Please select all the beer/stout brands that you are aware of.',
       'Carlsberg Smooth Draught:Please select all the beer/stout brands that you are aware of.',
       'Connor's Stout:Please select all the beer/stout brands that you are aware of.',
       'Corona Extra:Please select all the beer/stout brands that you are aware of.',
       'Edelweiss:Please select all the beer/stout brands that you are aware of.',
       'Guinness Stout:Please select all the beer/stout brands that you are aware of.',
       'Heineken:Please 

In [3]:
df['Avg_monthly_income'] = df['Avg_monthly_income'].astype(str).replace({
    '1': 'Below RM2500',
    '2': 'RM2500 – RM4999',
    '3': 'RM2500 – RM4999',
    '4': 'RM2500 – RM4999',
    '5': 'RM5000 – RM6999',
    '6': 'RM5000 – RM6999',
    '7': 'RM7000 – RM10999',
    '8': 'RM7000 – RM10999'
})

df['State'] = df['State'].astype(str).replace({
    '1': 'Penang',
    '2': 'Selangor',
    '3': 'WP',
    '4': 'Johor'
})

df['Ethnicity'] = df['Ethnicity'].astype(str).replace({
    '2': 'Chinese',
    '3': 'Indian',
})

df['Gender'] = df['Gender'].astype(str).replace({
    '1': 'Male',
    '2': 'Female',
})

df['Age_range'] = df['Age_range'].astype(str).replace({
    '2': '18-25',
    '3': '26-35'
})


df.drop(columns=['Please select the beer/stout brand that you purchase most often?'], inplace=True)

# df = df.rename(columns={
#     'Please select the diapers pants brand that you used most often.': 'Purchase Likelihood'
# })


# df = df.drop(df.columns[-3:], axis=1)

In [4]:
print(df['Age_range'].value_counts())
print('--------------')
print(df['Age_range'].isna().sum()) # No missing values

Age_range
18-25    214
26-35     93
Name: count, dtype: int64
--------------
0


In [5]:
columns = ['Gender', 'Ethnicity', 'State']

for col in columns:
    print(f"Column: {col}")
    print(df[col].value_counts(dropna=False))  # includes NaN counts in value_counts
    print(f"Missing values: {df[col].isna().sum()}")
    print("-" * 30)


Column: Gender
Gender
Male      243
Female     64
Name: count, dtype: int64
Missing values: 0
------------------------------
Column: Ethnicity
Ethnicity
Chinese    213
Indian      94
Name: count, dtype: int64
Missing values: 0
------------------------------
Column: State
State
Johor       84
Penang      78
WP          75
Selangor    70
Name: count, dtype: int64
Missing values: 0
------------------------------


In [6]:
df.columns

Index(['Response ID', 'Gender', 'Age', 'Age_range', 'Ethnicity', 'State',
       'Avg_monthly_income',
       'Anchor Smooth:Please select all the beer/stout brands that you are aware of.',
       'Apple Fox:Please select all the beer/stout brands that you are aware of.',
       'Asahi:Please select all the beer/stout brands that you are aware of.',
       'Budweiser:Please select all the beer/stout brands that you are aware of.',
       'Carlsberg Danish Pilsner:Please select all the beer/stout brands that you are aware of.',
       'Carlsberg Smooth Draught:Please select all the beer/stout brands that you are aware of.',
       'Connor's Stout:Please select all the beer/stout brands that you are aware of.',
       'Corona Extra:Please select all the beer/stout brands that you are aware of.',
       'Edelweiss:Please select all the beer/stout brands that you are aware of.',
       'Guinness Stout:Please select all the beer/stout brands that you are aware of.',
       'Heineken:Please 

In [7]:
brand_mapping = {
    "Anchor Smooth": "Anchor Smooth Brand",
    "Apple Fox": "Apple Fox Brand",
    "Asahi": "Asahi Brand",
    "Budweiser": "Budweiser Brand",
    "Carlsberg Danish Pilsner": "Carlsberg Danish Pilsner Brand",
    "Carlsberg Smooth Draught": "Carlsberg Smooth Draught Brand",
    "Connor's Stout": "Connor's Stout Brand",
    "Corona Extra": "Corona Extra Brand",
    "Edelweiss": "Edelweiss Brand",
    "Guinness Stout": "Guinness Stout Brand",
    "Heineken": "Heineken Brand",
    "Kronenbourg 1664 Blanc": "Kronenbourg 1664 Blanc Brand",
    "SKOL": "SKOL Brand",
    "Somersby": "Somersby Brand",
    "Tiger Beer": "Tiger Beer Brand",
    "Tiger Crystal": "Tiger Crystal Brand",
    "Tsingtao": "Tsingtao Brand"
}

for keyword, new_name in brand_mapping.items():
    for col in df.columns:
        if keyword in col and "aware" in col:
            df = df.rename(columns={col: new_name})
            break  # stop once found

In [8]:
brand_mapping = {
    "Anchor Smooth": "Anchor Smooth Current",
    "Apple Fox": "Apple Fox Current",
    "Asahi": "Asahi Current",
    "Budweiser": "Budweiser Current",
    "Carlsberg Danish Pilsner": "Carlsberg Danish Pilsner Current",
    "Carlsberg Smooth Draught": "Carlsberg Smooth Draught Current",
    "Connor's Stout": "Connor's Stout Current",
    "Corona Extra": "Corona Extra Current",
    "Edelweiss": "Edelweiss Current",
    "Guinness Stout": "Guinness Stout Current",
    "Heineken": "Heineken Current",
    "Kronenbourg 1664 Blanc": "Kronenbourg 1664 Blanc Current",
    "SKOL": "SKOL Current",
    "Somersby": "Somersby Current",
    "Tiger Beer": "Tiger Beer Current",
    "Tiger Crystal": "Tiger Crystal Current",
    "Tsingtao": "Tsingtao Current"
}

for keyword, new_name in brand_mapping.items():
    for col in df.columns:
        if keyword in col and "purchased" in col:
            df = df.rename(columns={col: new_name})
            break  # stop once found

In [9]:
brand_mapping = {
    "Anchor Smooth": "Anchor Smooth Purchase Likelihood",
    "Apple Fox": "Apple Fox Purchase Likelihood",
    "Asahi": "Asahi Purchase Likelihood",
    "Budweiser": "Budweiser Purchase Likelihood",
    "Carlsberg Danish Pilsner": "Carlsberg Danish Pilsner Purchase Likelihood",
    "Carlsberg Smooth Draught": "Carlsberg Smooth Draught Purchase Likelihood",
    "Connor's Stout": "Connor's Stout Purchase Likelihood",
    "Corona Extra": "Corona Extra Purchase Likelihood",
    "Edelweiss": "Edelweiss Purchase Likelihood",
    "Guinness Stout": "Guinness Stout Purchase Likelihood",
    "Heineken": "Heineken Purchase Likelihood",
    "Kronenbourg 1664 Blanc": "Kronenbourg 1664 Blanc Purchase Likelihood",
    "SKOL": "SKOL Purchase Likelihood",
    "Somersby": "Somersby Purchase Likelihood",
    "Tiger Beer": "Tiger Beer Purchase Likelihood",
    "Tiger Crystal": "Tiger Crystal Purchase Likelihood",
    "Tsingtao": "Tsingtao Purchase Likelihood"
}

for keyword, new_name in brand_mapping.items():
    for col in df.columns:
        if keyword in col and "NEVER" in col:
            df = df.rename(columns={col: new_name})
            break  # stop once found

In [10]:
df.columns

Index(['Response ID', 'Gender', 'Age', 'Age_range', 'Ethnicity', 'State',
       'Avg_monthly_income', 'Anchor Smooth Brand', 'Apple Fox Brand',
       'Asahi Brand', 'Budweiser Brand', 'Carlsberg Danish Pilsner Brand',
       'Carlsberg Smooth Draught Brand', 'Connor's Stout Brand',
       'Corona Extra Brand', 'Edelweiss Brand', 'Guinness Stout Brand',
       'Heineken Brand', 'Kronenbourg 1664 Blanc Brand', 'SKOL Brand',
       'Somersby Brand', 'Tiger Beer Brand', 'Tiger Crystal Brand',
       'Tsingtao Brand', 'Anchor Smooth Current', 'Apple Fox Current',
       'Asahi Current', 'Budweiser Current',
       'Carlsberg Danish Pilsner Current', 'Carlsberg Smooth Draught Current',
       'Connor's Stout Current', 'Corona Extra Current', 'Edelweiss Current',
       'Guinness Stout Current', 'Heineken Current',
       'Kronenbourg 1664 Blanc Current', 'SKOL Current', 'Somersby Current',
       'Tiger Beer Current', 'Tiger Crystal Current', 'Tsingtao Current',
       'Anchor Smooth Purch

### Pivoting

In [11]:
columns_to_process = df.columns.tolist()
columns_to_process

['Response ID',
 'Gender',
 'Age',
 'Age_range',
 'Ethnicity',
 'State',
 'Avg_monthly_income',
 'Anchor Smooth Brand',
 'Apple Fox Brand',
 'Asahi Brand',
 'Budweiser Brand',
 'Carlsberg Danish Pilsner Brand',
 'Carlsberg Smooth Draught Brand',
 "Connor's Stout Brand",
 'Corona Extra Brand',
 'Edelweiss Brand',
 'Guinness Stout Brand',
 'Heineken Brand',
 'Kronenbourg 1664 Blanc Brand',
 'SKOL Brand',
 'Somersby Brand',
 'Tiger Beer Brand',
 'Tiger Crystal Brand',
 'Tsingtao Brand',
 'Anchor Smooth Current',
 'Apple Fox Current',
 'Asahi Current',
 'Budweiser Current',
 'Carlsberg Danish Pilsner Current',
 'Carlsberg Smooth Draught Current',
 "Connor's Stout Current",
 'Corona Extra Current',
 'Edelweiss Current',
 'Guinness Stout Current',
 'Heineken Current',
 'Kronenbourg 1664 Blanc Current',
 'SKOL Current',
 'Somersby Current',
 'Tiger Beer Current',
 'Tiger Crystal Current',
 'Tsingtao Current',
 'Anchor Smooth Purchase Likelihood',
 'Apple Fox Purchase Likelihood',
 'Asahi Purc

In [12]:
# Include Response ID for count
columns_to_process = ['Response ID', 'Gender', 'Age_range', 'Ethnicity','State','Avg_monthly_income'] + columns_to_process

# Melt the DataFrame for processing
melted_df = df.melt(id_vars=['Gender', 'Age_range','Ethnicity','State','Avg_monthly_income'], value_vars=columns_to_process, 
                    var_name='Category', value_name='Value')

# Generate a pivot table without grand totals
pivot_table = pd.pivot_table(
    melted_df,
    values='Value',
    index=['Gender', 'Age_range','Ethnicity','State','Avg_monthly_income'],
    columns='Category',
    aggfunc='count',
    fill_value=0
)

# Reset index for cleaner output (optional)
pivot_table.reset_index(inplace=True)
print(pivot_table)

pivot_table.to_csv('checkcount_cope.csv')

Category  Gender Age_range Ethnicity     State Avg_monthly_income  Age  \
0         Female     18-25   Chinese     Johor    RM2500 – RM4999    2   
1         Female     18-25   Chinese     Johor    RM5000 – RM6999    4   
2         Female     18-25   Chinese     Johor   RM7000 – RM10999    3   
3         Female     18-25   Chinese    Penang    RM5000 – RM6999    4   
4         Female     18-25   Chinese    Penang   RM7000 – RM10999    2   
..           ...       ...       ...       ...                ...  ...   
64          Male     26-35    Indian    Penang    RM5000 – RM6999    2   
65          Male     26-35    Indian  Selangor    RM5000 – RM6999    1   
66          Male     26-35    Indian  Selangor   RM7000 – RM10999    1   
67          Male     26-35    Indian        WP    RM5000 – RM6999    1   
68          Male     26-35    Indian        WP   RM7000 – RM10999    1   

Category  Anchor Smooth Brand  Anchor Smooth Current  \
0                           1                      0   

In [13]:
total_rows = len(df)

# List of columns to process for current usage
awareness_columns = [col for col in df.columns if "Brand" in col]

# Loop through each column to calculate awareness and unawareness percentages
for column in awareness_columns:
    pivot_table[f'Unaware of {column}'] = pivot_table['Response ID'] - pivot_table[column]
    pivot_table[f'{column} Awareness (%)'] = (pivot_table[column] / total_rows) * 100
    pivot_table[f'{column} Unawareness (%)'] = (pivot_table[f'Unaware of {column}'] / total_rows) * 100

    # Round the percentages to 1 decimal place
    pivot_table[f'{column} Awareness (%)'] = pivot_table[f'{column} Awareness (%)']
    pivot_table[f'{column} Unawareness (%)'] = pivot_table[f'{column} Unawareness (%)']

# Drop the original columns after processing
pivot_table.drop(columns=awareness_columns, inplace=True)

# Drop columns starting with "Unaware of"
unaware_columns = [col for col in pivot_table.columns if col.startswith('Unaware of')]
pivot_table.drop(columns=unaware_columns, inplace=True)

# Define the columns for display, ensuring awareness and unawareness are side by side
display_columns1 = ['Gender', 'Age_range','Ethnicity','State','Avg_monthly_income']
for column in awareness_columns:
    display_columns1.extend([f'{column} Awareness (%)', f'{column} Unawareness (%)'])

print(pivot_table[display_columns1])

Category  Gender Age_range Ethnicity     State Avg_monthly_income  \
0         Female     18-25   Chinese     Johor    RM2500 – RM4999   
1         Female     18-25   Chinese     Johor    RM5000 – RM6999   
2         Female     18-25   Chinese     Johor   RM7000 – RM10999   
3         Female     18-25   Chinese    Penang    RM5000 – RM6999   
4         Female     18-25   Chinese    Penang   RM7000 – RM10999   
..           ...       ...       ...       ...                ...   
64          Male     26-35    Indian    Penang    RM5000 – RM6999   
65          Male     26-35    Indian  Selangor    RM5000 – RM6999   
66          Male     26-35    Indian  Selangor   RM7000 – RM10999   
67          Male     26-35    Indian        WP    RM5000 – RM6999   
68          Male     26-35    Indian        WP   RM7000 – RM10999   

Category  Anchor Smooth Brand Awareness (%)  \
0                                  0.325733   
1                                  0.000000   
2                             

In [14]:
total_rows = len(df)

# List of columns to process for current usage
usage_columns = [col for col in df.columns if "Current" in col]

# Loop through each column to calculate usage and non-usage percentages
for column in usage_columns:
    if column in pivot_table.columns:
        pivot_table[f'Not Using {column}'] = pivot_table['Response ID'] - pivot_table[column]
        pivot_table[f'{column} Usage (%)'] = (pivot_table[column] / total_rows) * 100
        pivot_table[f'{column} Non-usage (%)'] = (pivot_table[f'Not Using {column}'] / total_rows) * 100

        # Round the percentages to 1 decimal place
        pivot_table[f'{column} Usage (%)'] = pivot_table[f'{column} Usage (%)']
        pivot_table[f'{column} Non-usage (%)'] = pivot_table[f'{column} Non-usage (%)']
    else:
        print(f"Column '{column}' is missing in pivt_table.")

# Drop the original columns after processing
pivot_table.drop(columns=[col for col in usage_columns if col in pivot_table.columns], inplace=True)

# Drop columns starting with "Not Using"
not_using_columns = [col for col in pivot_table.columns if col.startswith('Not Using')]
pivot_table.drop(columns=not_using_columns, inplace=True)

# Define the columns for display, ensuring usage and non-usage are side by side
display_columns = ['Gender', 'Age_range', 'Ethnicity','State','Avg_monthly_income']
for column in usage_columns:
    if f'{column} Usage (%)' in pivot_table.columns and f'{column} Non-usage (%)' in pivot_table.columns:
        display_columns.extend([f'{column} Usage (%)', f'{column} Non-usage (%)'])

print(pivot_table[display_columns])

Category  Gender Age_range Ethnicity     State Avg_monthly_income  \
0         Female     18-25   Chinese     Johor    RM2500 – RM4999   
1         Female     18-25   Chinese     Johor    RM5000 – RM6999   
2         Female     18-25   Chinese     Johor   RM7000 – RM10999   
3         Female     18-25   Chinese    Penang    RM5000 – RM6999   
4         Female     18-25   Chinese    Penang   RM7000 – RM10999   
..           ...       ...       ...       ...                ...   
64          Male     26-35    Indian    Penang    RM5000 – RM6999   
65          Male     26-35    Indian  Selangor    RM5000 – RM6999   
66          Male     26-35    Indian  Selangor   RM7000 – RM10999   
67          Male     26-35    Indian        WP    RM5000 – RM6999   
68          Male     26-35    Indian        WP   RM7000 – RM10999   

Category  Anchor Smooth Current Usage (%)  \
0                                0.000000   
1                                0.000000   
2                                0.3

In [15]:
# total_rows = len(df)

# # List of columns to process for purchase likelihood
# purchase_columns = [col for col in df.columns if "Purchase" in col]

# # Loop through each column to calculate likely and unlikely percentages
# for column in purchase_columns:
#     if column in df.columns:
#         # Group and sum the purchase likelihood count per demographic group
#         purchase_counts = df.groupby(['Gender', 'Age_range', 'Ethnicity', 'State', 'Avg_monthly_income'])[column].sum().reset_index()

#         # Rename the column for clarity
#         purchase_counts.rename(columns={column: f'{column}_count'}, inplace=True)

#         # Merge into pivot_table
#         pivot_table = pivot_table.merge(purchase_counts, on=['Gender', 'Age_range', 'Ethnicity', 'State', 'Avg_monthly_income'], how='left')

#         # Fill NaNs with 0
#         pivot_table[f'{column}_count'] = pivot_table[f'{column}_count'].fillna(0)

#         # Calculate percentages
#         pivot_table[f'{column} (Likely)'] = (pivot_table[f'{column}_count'] / total_rows) * 100
#         pivot_table[f'{column} (Unlikely)'] = ((pivot_table['Response ID'] - pivot_table[f'{column}_count']) / total_rows) * 100

#         # Round the results
#         pivot_table[f'{column} (Likely)'] = pivot_table[f'{column} (Likely)']
#         pivot_table[f'{column} (Unlikely)'] = pivot_table[f'{column} (Unlikely)']
#     else:
#         print(f"Column '{column}' is missing in df.")

# # Drop intermediate count columns
# intermediate_columns = [f'{col}_count' for col in purchase_columns if f'{col}_count' in pivot_table.columns]
# pivot_table.drop(columns=intermediate_columns, inplace=True)

# # Define display columns
# display_columns = ['Gender', 'Age_range', 'Ethnicity', 'State', 'Avg_monthly_income']
# for column in purchase_columns:
#     if f'{column} (Likely)' in pivot_table.columns and f'{column} (Unlikely)' in pivot_table.columns:
#         display_columns.extend([f'{column} (Likely)', f'{column} (Unlikely)'])

# print(pivot_table[display_columns])

In [16]:
total_rows = len(df)

# List of columns related to Purchase Likelihood
likelihood_columns = [col for col in df.columns if "Purchase" in col]

# Loop through each column to calculate percentages
for column in likelihood_columns:
    pivot_table[f'{column} (Likely)'] = (pivot_table[column] / total_rows) * 100
    pivot_table[f'{column} (Unlikely)'] = ((pivot_table['Response ID'] - pivot_table[column]) / total_rows) * 100

    # Round to 1 decimal place
    pivot_table[f'{column} (Likely)'] = pivot_table[f'{column} (Likely)']
    pivot_table[f'{column} (Unlikely)'] = pivot_table[f'{column} (Unlikely)']

# Drop original columns
pivot_table.drop(columns=likelihood_columns, inplace=True)

# Define columns to display
display_columns = ['Gender', 'Age_range', 'Ethnicity', 'State', 'Avg_monthly_income']
for column in likelihood_columns:
    display_columns.extend([f'{column} (Likely)', f'{column} (Unlikely)'])

print(pivot_table[display_columns])

Category  Gender Age_range Ethnicity     State Avg_monthly_income  \
0         Female     18-25   Chinese     Johor    RM2500 – RM4999   
1         Female     18-25   Chinese     Johor    RM5000 – RM6999   
2         Female     18-25   Chinese     Johor   RM7000 – RM10999   
3         Female     18-25   Chinese    Penang    RM5000 – RM6999   
4         Female     18-25   Chinese    Penang   RM7000 – RM10999   
..           ...       ...       ...       ...                ...   
64          Male     26-35    Indian    Penang    RM5000 – RM6999   
65          Male     26-35    Indian  Selangor    RM5000 – RM6999   
66          Male     26-35    Indian  Selangor   RM7000 – RM10999   
67          Male     26-35    Indian        WP    RM5000 – RM6999   
68          Male     26-35    Indian        WP   RM7000 – RM10999   

Category  Anchor Smooth Purchase Likelihood (Likely)  \
0                                                0.0   
1                                                0.0   
2  

  pivot_table[f'{column} (Likely)'] = (pivot_table[column] / total_rows) * 100
  pivot_table[f'{column} (Unlikely)'] = ((pivot_table['Response ID'] - pivot_table[column]) / total_rows) * 100
  pivot_table[f'{column} (Likely)'] = (pivot_table[column] / total_rows) * 100
  pivot_table[f'{column} (Unlikely)'] = ((pivot_table['Response ID'] - pivot_table[column]) / total_rows) * 100
  pivot_table[f'{column} (Likely)'] = (pivot_table[column] / total_rows) * 100
  pivot_table[f'{column} (Unlikely)'] = ((pivot_table['Response ID'] - pivot_table[column]) / total_rows) * 100
  pivot_table[f'{column} (Likely)'] = (pivot_table[column] / total_rows) * 100
  pivot_table[f'{column} (Unlikely)'] = ((pivot_table['Response ID'] - pivot_table[column]) / total_rows) * 100


In [17]:
pivot_table.head()

Category,Gender,Age_range,Ethnicity,State,Avg_monthly_income,Age,Response ID,Anchor Smooth Brand Awareness (%),Anchor Smooth Brand Unawareness (%),Apple Fox Brand Awareness (%),...,SKOL Purchase Likelihood (Likely),SKOL Purchase Likelihood (Unlikely),Somersby Purchase Likelihood (Likely),Somersby Purchase Likelihood (Unlikely),Tiger Beer Purchase Likelihood (Likely),Tiger Beer Purchase Likelihood (Unlikely),Tiger Crystal Purchase Likelihood (Likely),Tiger Crystal Purchase Likelihood (Unlikely),Tsingtao Purchase Likelihood (Likely),Tsingtao Purchase Likelihood (Unlikely)
0,Female,18-25,Chinese,Johor,RM2500 – RM4999,2,2,0.325733,0.325733,0.325733,...,0.0,0.651466,0.0,0.651466,0.0,0.651466,0.0,0.651466,0.0,0.651466
1,Female,18-25,Chinese,Johor,RM5000 – RM6999,4,4,0.0,1.302932,0.0,...,0.0,1.302932,0.0,1.302932,0.0,1.302932,0.0,1.302932,0.0,1.302932
2,Female,18-25,Chinese,Johor,RM7000 – RM10999,3,3,0.651466,0.325733,0.651466,...,0.0,0.977199,0.0,0.977199,0.0,0.977199,0.0,0.977199,0.0,0.977199
3,Female,18-25,Chinese,Penang,RM5000 – RM6999,4,4,0.977199,0.325733,0.977199,...,0.0,1.302932,0.0,1.302932,0.0,1.302932,0.0,1.302932,0.0,1.302932
4,Female,18-25,Chinese,Penang,RM7000 – RM10999,2,2,0.325733,0.325733,0.325733,...,0.0,0.651466,0.0,0.651466,0.0,0.651466,0.0,0.651466,0.0,0.651466


In [18]:
pivot_table = pivot_table.drop(columns=['Response ID'])
pivot_table.columns

Index(['Gender', 'Age_range', 'Ethnicity', 'State', 'Avg_monthly_income',
       'Age', 'Anchor Smooth Brand Awareness (%)',
       'Anchor Smooth Brand Unawareness (%)', 'Apple Fox Brand Awareness (%)',
       'Apple Fox Brand Unawareness (%)',
       ...
       'SKOL Purchase Likelihood (Likely)',
       'SKOL Purchase Likelihood (Unlikely)',
       'Somersby Purchase Likelihood (Likely)',
       'Somersby Purchase Likelihood (Unlikely)',
       'Tiger Beer Purchase Likelihood (Likely)',
       'Tiger Beer Purchase Likelihood (Unlikely)',
       'Tiger Crystal Purchase Likelihood (Likely)',
       'Tiger Crystal Purchase Likelihood (Unlikely)',
       'Tsingtao Purchase Likelihood (Likely)',
       'Tsingtao Purchase Likelihood (Unlikely)'],
      dtype='object', name='Category', length=108)

In [19]:
merged_cope = pivot_table

merged_cope.to_csv('pivot-cope.csv',float_format='%.1f', index=False)
merged_cope.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Columns: 108 entries, Gender to Tsingtao Purchase Likelihood (Unlikely)
dtypes: float64(102), int64(1), object(5)
memory usage: 58.3+ KB


### Preparing each df for each sheets in xlsx

In [20]:
columns_to_process1 = merged_cope.columns[0:5].tolist() + [col for col in merged_cope.columns if "Current Usage (%)" in col]
columns_to_process1

['Gender',
 'Age_range',
 'Ethnicity',
 'State',
 'Avg_monthly_income',
 'Anchor Smooth Current Usage (%)',
 'Apple Fox Current Usage (%)',
 'Asahi Current Usage (%)',
 'Budweiser Current Usage (%)',
 'Carlsberg Danish Pilsner Current Usage (%)',
 'Carlsberg Smooth Draught Current Usage (%)',
 "Connor's Stout Current Usage (%)",
 'Corona Extra Current Usage (%)',
 'Edelweiss Current Usage (%)',
 'Guinness Stout Current Usage (%)',
 'Heineken Current Usage (%)',
 'Kronenbourg 1664 Blanc Current Usage (%)',
 'SKOL Current Usage (%)',
 'Somersby Current Usage (%)',
 'Tiger Beer Current Usage (%)',
 'Tiger Crystal Current Usage (%)',
 'Tsingtao Current Usage (%)']

In [21]:
columns_to_process2 = merged_cope.columns[0:5].tolist() + [col for col in merged_cope.columns if "Current Non-usage (%)" in col]
columns_to_process2

['Gender',
 'Age_range',
 'Ethnicity',
 'State',
 'Avg_monthly_income',
 'Anchor Smooth Current Non-usage (%)',
 'Apple Fox Current Non-usage (%)',
 'Asahi Current Non-usage (%)',
 'Budweiser Current Non-usage (%)',
 'Carlsberg Danish Pilsner Current Non-usage (%)',
 'Carlsberg Smooth Draught Current Non-usage (%)',
 "Connor's Stout Current Non-usage (%)",
 'Corona Extra Current Non-usage (%)',
 'Edelweiss Current Non-usage (%)',
 'Guinness Stout Current Non-usage (%)',
 'Heineken Current Non-usage (%)',
 'Kronenbourg 1664 Blanc Current Non-usage (%)',
 'SKOL Current Non-usage (%)',
 'Somersby Current Non-usage (%)',
 'Tiger Beer Current Non-usage (%)',
 'Tiger Crystal Current Non-usage (%)',
 'Tsingtao Current Non-usage (%)']

In [22]:
columns_to_process3 = merged_cope.columns[0:5].tolist() + [col for col in merged_cope.columns if "Purchase Likelihood (Likely)" in col]
columns_to_process3

['Gender',
 'Age_range',
 'Ethnicity',
 'State',
 'Avg_monthly_income',
 'Anchor Smooth Purchase Likelihood (Likely)',
 'Apple Fox Purchase Likelihood (Likely)',
 'Asahi Purchase Likelihood (Likely)',
 'Budweiser Purchase Likelihood (Likely)',
 'Carlsberg Danish Pilsner Purchase Likelihood (Likely)',
 'Carlsberg Smooth Draught Purchase Likelihood (Likely)',
 "Connor's Stout Purchase Likelihood (Likely)",
 'Corona Extra Purchase Likelihood (Likely)',
 'Edelweiss Purchase Likelihood (Likely)',
 'Guinness Stout Purchase Likelihood (Likely)',
 'Heineken Purchase Likelihood (Likely)',
 'Kronenbourg 1664 Blanc Purchase Likelihood (Likely)',
 'SKOL Purchase Likelihood (Likely)',
 'Somersby Purchase Likelihood (Likely)',
 'Tiger Beer Purchase Likelihood (Likely)',
 'Tiger Crystal Purchase Likelihood (Likely)',
 'Tsingtao Purchase Likelihood (Likely)']

In [23]:
columns_to_process4 = merged_cope.columns[0:5].tolist() + [col for col in merged_cope.columns if "Purchase Likelihood (Unlikely)" in col]
columns_to_process4

['Gender',
 'Age_range',
 'Ethnicity',
 'State',
 'Avg_monthly_income',
 'Anchor Smooth Purchase Likelihood (Unlikely)',
 'Apple Fox Purchase Likelihood (Unlikely)',
 'Asahi Purchase Likelihood (Unlikely)',
 'Budweiser Purchase Likelihood (Unlikely)',
 'Carlsberg Danish Pilsner Purchase Likelihood (Unlikely)',
 'Carlsberg Smooth Draught Purchase Likelihood (Unlikely)',
 "Connor's Stout Purchase Likelihood (Unlikely)",
 'Corona Extra Purchase Likelihood (Unlikely)',
 'Edelweiss Purchase Likelihood (Unlikely)',
 'Guinness Stout Purchase Likelihood (Unlikely)',
 'Heineken Purchase Likelihood (Unlikely)',
 'Kronenbourg 1664 Blanc Purchase Likelihood (Unlikely)',
 'SKOL Purchase Likelihood (Unlikely)',
 'Somersby Purchase Likelihood (Unlikely)',
 'Tiger Beer Purchase Likelihood (Unlikely)',
 'Tiger Crystal Purchase Likelihood (Unlikely)',
 'Tsingtao Purchase Likelihood (Unlikely)']

In [24]:
columns_to_process5 = merged_cope.columns[0:5].tolist() + [col for col in merged_cope.columns if "Awareness" in col]
columns_to_process5

['Gender',
 'Age_range',
 'Ethnicity',
 'State',
 'Avg_monthly_income',
 'Anchor Smooth Brand Awareness (%)',
 'Apple Fox Brand Awareness (%)',
 'Asahi Brand Awareness (%)',
 'Budweiser Brand Awareness (%)',
 'Carlsberg Danish Pilsner Brand Awareness (%)',
 'Carlsberg Smooth Draught Brand Awareness (%)',
 "Connor's Stout Brand Awareness (%)",
 'Corona Extra Brand Awareness (%)',
 'Edelweiss Brand Awareness (%)',
 'Guinness Stout Brand Awareness (%)',
 'Heineken Brand Awareness (%)',
 'Kronenbourg 1664 Blanc Brand Awareness (%)',
 'SKOL Brand Awareness (%)',
 'Somersby Brand Awareness (%)',
 'Tiger Beer Brand Awareness (%)',
 'Tiger Crystal Brand Awareness (%)',
 'Tsingtao Brand Awareness (%)']

In [25]:
columns_to_process6 = merged_cope.columns[0:5].tolist() + [col for col in merged_cope.columns if "Unawareness" in col]
columns_to_process6

['Gender',
 'Age_range',
 'Ethnicity',
 'State',
 'Avg_monthly_income',
 'Anchor Smooth Brand Unawareness (%)',
 'Apple Fox Brand Unawareness (%)',
 'Asahi Brand Unawareness (%)',
 'Budweiser Brand Unawareness (%)',
 'Carlsberg Danish Pilsner Brand Unawareness (%)',
 'Carlsberg Smooth Draught Brand Unawareness (%)',
 "Connor's Stout Brand Unawareness (%)",
 'Corona Extra Brand Unawareness (%)',
 'Edelweiss Brand Unawareness (%)',
 'Guinness Stout Brand Unawareness (%)',
 'Heineken Brand Unawareness (%)',
 'Kronenbourg 1664 Blanc Brand Unawareness (%)',
 'SKOL Brand Unawareness (%)',
 'Somersby Brand Unawareness (%)',
 'Tiger Beer Brand Unawareness (%)',
 'Tiger Crystal Brand Unawareness (%)',
 'Tsingtao Brand Unawareness (%)']

In [26]:
# Create DataFrames for each set of columns
df1 = merged_cope[columns_to_process1]
df2 = merged_cope[columns_to_process2]
df3 = merged_cope[columns_to_process3]
df4 = merged_cope[columns_to_process4]
df5 = merged_cope[columns_to_process5]
df6 = merged_cope[columns_to_process6]

In [27]:
# Define a list of tuples with the dataframe and the string to replace
dfs_info = [
    (df1, "Current Usage (%)"),
    (df2, "Current Non-usage (%)"),
    (df3, "Purchase Likelihood (Likely)"),
    (df4, "Purchase Likelihood (Unlikely)"),
    (df5, "Brand Awareness (%)"),
    (df6, "Brand Unawareness (%)")
]

# List to store the transformed DataFrames
melted_dfs = []

# Loop through each and apply the same operation
for df, replace_str in dfs_info:
    melted = df.melt(
        id_vars=['Gender', 'Age_range', 'Ethnicity', 'State', 'Avg_monthly_income'],
        var_name='Brand',
        value_name='Percentage'
    )
    melted['Brand'] = melted['Brand'].str.replace(replace_str, '', regex=False)
    melted_dfs.append(melted)

# Unpack the results back to original variables
df1, df2, df3, df4, df5, df6 = melted_dfs


In [28]:
# Dictionary to store DataFrames with their respective attributes
dataframes = {
    'Current Usage': df1,
    'Current Non-usage': df2,
    'Purchase Likelihood (Likely)': df3,
    'Purchase Likelihood (Unlikely)': df4,
    'Awareness': df5,
    'Unawareness': df6
}

def add_category_and_attribute(df, attribute_name):
    df['Category'] = 'Beverages'
    df['Country'] = 'Malaysia'
    df['Year'] = 2022
    df['Attributes'] = attribute_name
    return df

# Process all DataFrames and filter out empty ones
processed_dfs = [add_category_and_attribute(df, name) for name, df in dataframes.items() if df is not None and not df.empty]

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(processed_dfs, ignore_index=True) if processed_dfs else None

# Generate Composite Key after merging
if final_df is not None:
    # Ensure all columns in the composite key are strings
    final_df['Composite_Key'] = (
        final_df['Ethnicity'].astype(str) + '_' +
        # final_df['Brand'].astype(str) + '_' +
        final_df['Age_range'].astype(str) + '_' +
        final_df['Gender'].astype(str) + '_' +
        final_df['Avg_monthly_income'].astype(str) + '_' +
        final_df['State'].astype(str)
        # final_df['Category'].astype(str) + '_' +
        # final_df['Country'].astype(str) + '_' +
        # final_df['Year'].astype(str) + '_' +
        # final_df['Attributes'].astype(str)
    )



# # Define the output directory
# output_dir = Path(r"C:\Users\aisar\OneDrive\Documents\Mobile-Data-App\BLS\CLEANED")
# output_dir.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists

# # Define the file path
# output_file = output_dir / "maxis-final.xlsx"

# # Write to Excel
# final_df.to_excel(output_file, sheet_name='Maxis', float_format='%.1f', index=False)
# print(f"Excel file created successfully: {output_file}")

In [29]:
# Step 1: Replace 'Likely' with a temp placeholder
final_df['Attributes'] = final_df['Attributes'].astype(str).replace({
    'Purchase Likelihood (Likely)': 'TEMP_SWAP'
})

# Step 2: Replace 'Unlikely' with 'Likely'
final_df['Attributes'] = final_df['Attributes'].replace({
    'Purchase Likelihood (Unlikely)': 'Purchase Likelihood (Likely)'
})

# Step 3: Replace the temp with 'Unlikely'
final_df['Attributes'] = final_df['Attributes'].replace({
    'TEMP_SWAP': 'Purchase Likelihood (Unlikely)'
})


In [30]:
# Remove rows where Percentage is 0
final_df = final_df[final_df['Percentage'] != 0]

# Write to Excel
final_df.to_excel('Cope-final.xlsx', sheet_name='Cope', index=False)
print("Excel file created successfully!")

Excel file created successfully!


In [31]:
# final_df['Attributes'] = final_df['Attributes'].astype(str).replace({
#     'Purchase Likelihood (Likely)': 'Purchase Likelihood (Unlikely)1'
# })

In [32]:
# final_df['Attributes'] = final_df['Attributes'].astype(str).replace({
#     'Purchase Likelihood (Unlikely)': 'Purchase Likelihood (Likely)'
# })

In [33]:
# final_df['Attributes'] = final_df['Attributes'].astype(str).replace({
#     'Purchase Likelihood (Unlikely)1': 'Purchase Likelihood (Unlikely)'
# })

In [34]:
print(final_df['Attributes'].value_counts())
print('--------------')
print(final_df['Attributes'].isna().sum()) # No missing values

Attributes
Purchase Likelihood (Likely)      1171
Current Non-usage                 1057
Unawareness                        883
Awareness                          801
Current Usage                      471
Purchase Likelihood (Unlikely)      37
Name: count, dtype: int64
--------------
0


In [35]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4420 entries, 2 to 7037
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Gender              4420 non-null   object 
 1   Age_range           4420 non-null   object 
 2   Ethnicity           4420 non-null   object 
 3   State               4420 non-null   object 
 4   Avg_monthly_income  4420 non-null   object 
 5   Brand               4420 non-null   object 
 6   Percentage          4420 non-null   float64
 7   Category            4420 non-null   object 
 8   Country             4420 non-null   object 
 9   Year                4420 non-null   int64  
 10  Attributes          4420 non-null   object 
 11  Composite_Key       4420 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 448.9+ KB


In [36]:
final_df['Percentage'] = pd.to_numeric(final_df['Percentage'], errors='coerce')
attribute_percentages = final_df.groupby('Attributes')['Percentage'].sum().reset_index()
attribute_percentages['Percentage'] = attribute_percentages['Percentage'].round(1)
print(attribute_percentages)

                       Attributes  Percentage
0                       Awareness       687.0
1               Current Non-usage      1374.6
2                   Current Usage       325.4
3    Purchase Likelihood (Likely)      1687.3
4  Purchase Likelihood (Unlikely)        12.7
5                     Unawareness      1013.0


In [37]:
# Convert Percentage to numeric
final_df['Percentage'] = pd.to_numeric(final_df['Percentage'], errors='coerce')

# Create pivot table
pivot = final_df.pivot_table(
    index='Attributes',
    columns='Brand',
    values='Percentage',
    aggfunc='sum',
    margins=True,
    margins_name='Grand Total'
).round(1)

# Replace NaN with empty strings (optional)
pivot = pivot.fillna('')

# Rename columns/rows to match the example (if needed)
pivot = pivot.rename_axis('Row Labels', axis=0).rename_axis('Column Labels', axis=1)

print(pivot)

Column Labels                   Anchor Smooth   Apple Fox   Asahi   \
Row Labels                                                           
Awareness                                 28.0        33.9    25.7   
Current Non-usage                         93.2        89.6    92.8   
Current Usage                              6.8        10.4     7.2   
Purchase Likelihood (Likely)              99.0        99.7    99.0   
Purchase Likelihood (Unlikely)             1.0         0.3     1.0   
Unawareness                               72.0        66.1    74.3   
Grand Total                              300.0       300.0   300.0   

Column Labels                   Budweiser   Carlsberg Danish Pilsner   \
Row Labels                                                              
Awareness                             39.7                       39.1   
Current Non-usage                     87.3                       87.3   
Current Usage                         12.7                       12.7   
Purc

In [38]:
import pandas as pd

# Ensure Percentage is numeric and handle NaNs
final_df['Percentage'] = pd.to_numeric(final_df['Percentage'], errors='coerce')

# Create pivot table with 0 for missing values
pivot = final_df.pivot_table(
    index='Attributes',
    columns='Brand',
    values='Percentage',
    aggfunc='sum',
    fill_value=0,  # Critical for matching Excel's behavior
    margins=True,
    margins_name='Grand Total'
).round(1)

# Reset index/columns for clarity
pivot = pivot.rename_axis('Row Labels', axis=0).rename_axis('Column Labels', axis=1)
print(pivot)

Column Labels                   Anchor Smooth   Apple Fox   Asahi   \
Row Labels                                                           
Awareness                                 28.0        33.9    25.7   
Current Non-usage                         93.2        89.6    92.8   
Current Usage                              6.8        10.4     7.2   
Purchase Likelihood (Likely)              99.0        99.7    99.0   
Purchase Likelihood (Unlikely)             1.0         0.3     1.0   
Unawareness                               72.0        66.1    74.3   
Grand Total                              300.0       300.0   300.0   

Column Labels                   Budweiser   Carlsberg Danish Pilsner   \
Row Labels                                                              
Awareness                             39.7                       39.1   
Current Non-usage                     87.3                       87.3   
Current Usage                         12.7                       12.7   
Purc