# Analysing inconsistencies and data integrity after basic profiling

## Checking purchase behaviour dataset

In [None]:
import pandas as pd

# Load the customer purchase-behaviour table from the raw data folder.
df_csv = pd.read_csv('../raw_data/QVI_purchase_behaviour.csv')

# Count customers per PREMIUM_CUSTOMER segment; dropna=False keeps any
# missing values as their own row so gaps would be visible here.
(
    df_csv
    .loc[:, 'PREMIUM_CUSTOMER']
    .value_counts(dropna=False)
)


PREMIUM_CUSTOMER
Mainstream    29245
Budget        24470
Premium       18922
Name: count, dtype: int64

In [8]:
# Count customers per LIFESTAGE category, keeping missing values visible.
(
    df_csv
    .loc[:, 'LIFESTAGE']
    .value_counts(dropna=False)
)

LIFESTAGE
RETIREES                  14805
OLDER SINGLES/COUPLES     14609
YOUNG SINGLES/COUPLES     14441
OLDER FAMILIES             9780
YOUNG FAMILIES             9178
MIDAGE SINGLES/COUPLES     7275
NEW FAMILIES               2549
Name: count, dtype: int64

## Checking transaction data dataset

### Inspecting outlier in purchase quantity

In [18]:
# Load the raw transaction records from the Excel workbook.
df_xlsx = pd.read_excel('../raw_data/QVI_transaction_data.xlsx')

# Basic profiling reported a PROD_QTY maximum of 200, so list how often
# each purchase quantity occurs (missing values included).
(
    df_xlsx
    .loc[:, 'PROD_QTY']
    .value_counts(dropna=False)
)


PROD_QTY
2      236039
1       27518
5         450
3         430
4         397
200         2
Name: count, dtype: int64

In [19]:
# Show the transactions in which 200 packs were bought at once.
df_xlsx.loc[df_xlsx['PROD_QTY'].eq(200)]

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
69762,43331,226,226000,226201,4,Dorito Corn Chp Supreme 380g,200,650.0
69763,43605,226,226000,226210,4,Dorito Corn Chp Supreme 380g,200,650.0


In [20]:
# List every transaction made on the loyalty card behind the 200-unit
# purchases, to see whether that customer has any ordinary purchases.
df_xlsx.query('LYLTY_CARD_NBR == 226000')

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
69762,43331,226,226000,226201,4,Dorito Corn Chp Supreme 380g,200,650.0
69763,43605,226,226000,226210,4,Dorito Corn Chp Supreme 380g,200,650.0


> Note: A customer with Loyalty Card Number '226000' has been identified as a probable commercial client buying in bulk. They are excluded from the following consumer analysis to prevent skewing of the data.

In [26]:

# Loyalty card identified above as a probable commercial bulk buyer
# (its only transactions are two 200-unit purchases).
BULK_BUYER_CARD_NBR = 226000

# Drop that customer's rows. .copy() makes an independent frame so the DATE
# assignment below does not raise SettingWithCopyWarning against df_xlsx.
cleaned_df_xlsx = df_xlsx[df_xlsx['LYLTY_CARD_NBR'] != BULK_BUYER_CARD_NBR].copy()
cleaned_df_xlsx.info()  # number of non-null entries decreases by 2 compared to initial profiling

# Convert Excel serial day numbers to datetime (day units counted from 1899-12-30).
cleaned_df_xlsx['DATE'] = pd.to_datetime(cleaned_df_xlsx['DATE'], unit='D', origin='1899-12-30')

# Bare last expression renders the full-width rich table; print() produced
# wrapped plain text that cut the columns off.
cleaned_df_xlsx.head()


<class 'pandas.core.frame.DataFrame'>
Index: 264834 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DATE            264834 non-null  int64  
 1   STORE_NBR       264834 non-null  int64  
 2   LYLTY_CARD_NBR  264834 non-null  int64  
 3   TXN_ID          264834 non-null  int64  
 4   PROD_NBR        264834 non-null  int64  
 5   PROD_NAME       264834 non-null  object 
 6   PROD_QTY        264834 non-null  int64  
 7   TOT_SALES       264834 non-null  float64
dtypes: float64(1), int64(6), object(1)
memory usage: 18.2+ MB
        DATE  STORE_NBR  LYLTY_CARD_NBR  TXN_ID  PROD_NBR  \
0 2018-10-17          1            1000       1         5   
1 2019-05-14          1            1307     348        66   
2 2019-05-20          1            1343     383        61   
3 2018-08-17          2            2373     974        69   
4 2018-08-18          2            2426    1038       108   

     

### Inspecting inconsistencies in product name (PROD_NAME)

In [None]:
# Ten product names with the most transaction rows
# (value_counts sorts in descending order by default).
(
    cleaned_df_xlsx
    .loc[:, 'PROD_NAME']
    .value_counts()
    .iloc[:10]
)

PROD_NAME
Kettle Mozzarella   Basil & Pesto 175g      3304
Kettle Tortilla ChpsHny&Jlpno Chili 150g    3296
Cobs Popd Swt/Chlli &Sr/Cream Chips 110g    3269
Tyrrells Crisps     Ched & Chives 165g      3268
Cobs Popd Sea Salt  Chips 110g              3265
Kettle 135g Swt Pot Sea Salt                3257
Tostitos Splash Of  Lime 175g               3252
Infuzions Thai SweetChili PotatoMix 110g    3242
Smiths Crnkle Chip  Orgnl Big Bag 380g      3233
Thins Potato Chips  Hot & Spicy 175g        3229
Name: count, dtype: int64

In [None]:
# Ten product names with the fewest transaction rows
# (ascending=True puts the rarest names first).
(
    cleaned_df_xlsx
    .loc[:, 'PROD_NAME']
    .value_counts(ascending=True)
    .iloc[:10]
)

PROD_NAME
WW Crinkle Cut      Original 175g           1410
French Fries Potato Chips 175g              1418
NCC Sour Cream &    Garden Chives 175g      1419
Woolworths Medium   Salsa 300g              1430
RRD Pc Sea Salt     165g                    1431
Sunbites Whlegrn    Crisps Frch/Onin 90g    1432
Red Rock Deli Chikn&Garlic Aioli 150g       1434
Smiths Crinkle Cut  French OnionDip 150g    1438
Smiths Chip Thinly  CutSalt/Vinegr175g      1440
Thins Chips         Originl saltd 175g      1441
Name: count, dtype: int64

In [43]:
# Look for any single PROD_NAME that mentions both 'salsa' and 'chips'
# (case-insensitive), i.e. a bundled product sold under one name.
cleaned_df_xlsx.loc[
    lambda frame: frame['PROD_NAME'].str.contains('salsa', case=False)
    & frame['PROD_NAME'].str.contains('chips', case=False)
]

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES


> The transaction table records the sale of each product as a separate entity and doesn't bundle products together under a single product name.

In [51]:
# Keep only chip-related records: drop every row whose PROD_NAME contains
# 'dip' or 'salsa' (case-insensitive substring match).
# NOTE(review): the substring 'dip' also matches names such as
# 'Smiths Crinkle Cut  French OnionDip 150g' - confirm that product is a dip
# and not a dip-flavoured chip before relying on this filter.

# .copy() makes chips_only_df an independent frame (same approach as
# cleaned_df_xlsx), so columns can be added to it later without
# SettingWithCopyWarning.
chips_only_df = cleaned_df_xlsx[
    ~cleaned_df_xlsx['PROD_NAME'].str.contains(r'dip|salsa', case=False, regex=True)
].copy()
chips_only_df.head(10)


Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
0,2018-10-17,1,1000,1,5,Natural Chip Compny SeaSalt175g,2,6.0
1,2019-05-14,1,1307,348,66,CCs Nacho Cheese 175g,3,6.3
2,2019-05-20,1,1343,383,61,Smiths Crinkle Cut Chips Chicken 170g,2,2.9
3,2018-08-17,2,2373,974,69,Smiths Chip Thinly S/Cream&Onion 175g,5,15.0
4,2018-08-18,2,2426,1038,108,Kettle Tortilla ChpsHny&Jlpno Chili 150g,3,13.8
6,2019-05-16,4,4149,3333,16,Smiths Crinkle Chips Salt & Vinegar 330g,1,5.7
7,2019-05-16,4,4196,3539,24,Grain Waves Sweet Chilli 210g,1,3.6
8,2018-08-20,5,5026,4525,42,Doritos Corn Chip Mexican Jalapeno 150g,1,3.9
9,2018-08-18,7,7150,6900,52,Grain Waves Sour Cream&Chives 210G,2,7.2
10,2019-05-17,7,7215,7176,16,Smiths Crinkle Chips Salt & Vinegar 330g,1,5.7


In [None]:
# Last pass over the chip-only records: list product names that hint at
# nuts, crackers or trail mix (case-insensitive substring match).
# Note: 'nut' also matches 'Coconut', so flavour names appear in the result.
chips_only_df.loc[
    chips_only_df['PROD_NAME'].str.contains(r'nut|cracker|trail', case=False, regex=True)
]

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
55,2019-05-16,72,72335,72083,87,Infuzions BBQ Rib Prawn Crackers 110g,1,3.8
92,2019-05-18,120,120226,123743,67,RRD Chilli& Coconut 150g,1,2.7
118,2019-05-19,164,164183,164919,67,RRD Chilli& Coconut 150g,1,2.7
261,2019-02-24,4,4072,2970,87,Infuzions BBQ Rib Prawn Crackers 110g,2,7.6
273,2019-04-07,4,4149,3332,87,Infuzions BBQ Rib Prawn Crackers 110g,2,7.6
...,...,...,...,...,...,...,...,...
264550,2018-07-01,259,259226,258901,67,RRD Chilli& Coconut 150g,2,5.4
264755,2018-10-22,268,268463,264916,87,Infuzions BBQ Rib Prawn Crackers 110g,1,3.8
264771,2019-04-21,269,269203,266266,87,Infuzions BBQ Rib Prawn Crackers 110g,2,7.6
264778,2019-06-02,269,269204,266273,67,RRD Chilli& Coconut 150g,2,5.4


### Feature extraction