In [110]:
"""
project_grp16_v2.ipynb
This notebook performs data analysis on the SnackChain dataset, demonstrating
cleaning, exploratory data analysis, and modeling steps.
"""

'\nproject_grp16_v2.ipynb\nThis notebook performs data analysis on the SnackChain dataset, demonstrating\ncleaning, exploratory data analysis, and modeling steps.\n'

In [111]:
# Currently in use
import re                               # Extraction of text
import pandas as pd                     # Data manipulation
import numpy as np                      # Numeric computations

### Clean 'stores.xlsx'

In [112]:
# Load the store master data from the Excel workbook
df_stores = pd.read_excel(io='stores.xlsx')

In [113]:
# Normalise the header row: every column name becomes lower case
df_stores.columns = df_stores.columns.str.lower()
# Lower-case each string cell; non-string values are passed through unchanged
df_stores = df_stores.map(
    lambda cell: cell if not isinstance(cell, str) else cell.lower()
)

### Clean 'products.xlsx'

In [114]:
# Load the product master data from the Excel workbook
df_products = pd.read_excel(io='products.xlsx')

In [115]:
# Normalise the header row: every column name becomes lower case
df_products.columns = df_products.columns.str.lower()
# Lower-case each string cell; non-string values are passed through unchanged
df_products = df_products.map(
    lambda cell: cell if not isinstance(cell, str) else cell.lower()
)

### Clean 'transactions.xlsx'

In [116]:
# Load the transaction records from the Excel workbook
df_transactions = pd.read_excel(io='transactions.xlsx')

In [117]:
# Normalise the header row: every column name becomes lower case
df_transactions.columns = df_transactions.columns.str.lower()
# Lower-case each string cell; non-string values are passed through unchanged
df_transactions = df_transactions.map(
    lambda cell: cell if not isinstance(cell, str) else cell.lower()
)

### Remove Oral Hygiene Products

In [118]:
# Keep every product whose category is not 'oral hygiene products'.
# The explicit .copy() makes df_prod_cln an independent frame, so columns can
# be added to it later without chained-assignment warnings.
df_prod_cln = df_products.loc[
    df_products['category'] != 'oral hygiene products', :
].copy()

In [119]:
# Remove 'oral hygiene products' transactions based on the UPCs in df_prod_cln
# (the products frame that no longer contains oral hygiene products).
keep_upcs = df_prod_cln['upc'].unique()

# Only keep transactions whose UPC is in df_prod_cln.
# .copy() makes df_trans_cln an independent frame, matching how df_prod_cln is
# built, so later column assignments do not raise SettingWithCopyWarning.
df_trans_cln = df_transactions[df_transactions['upc'].isin(keep_upcs)].copy()


### Standardize Product Size

In [120]:
# Function to extract numeric value and unit using regex
def extract_size_details(size):
    """
    Extract the leading numeric value and its unit from a product_size string.

    The number may be an integer or a decimal ("15", "1.5", ".5") and may be
    separated from the alphabetic unit by whitespace. Leading whitespace in
    the input is ignored; any text after the unit is ignored as well.

    Args:
        size (str): A string representing the product size
            (e.g., "15 OZ", "1.5 LT"). Non-string input (NaN, None, numbers)
            is treated as missing.

    Returns:
        pd.Series: Two elements, the value as a float and the unit lower-cased.
            Both elements are None when the input is not a string or does not
            start with a number followed by a unit.
    """
    if isinstance(size, str):
        # re.match anchors at the start of the string, so the leading \s*
        # lets padded cells such as " 15 OZ" parse instead of being dropped.
        match = re.match(r"\s*(\d*\.?\d+)\s*([a-zA-Z]+)", size)
        if match:
            return pd.Series([float(match.group(1)), match.group(2).lower()])
    return pd.Series([None, None])  # Handle missing or incorrect formats
# Split product_size into a numeric value and a lower-cased unit
# (rows that cannot be parsed get None in both columns)
df_prod_cln[['size_value', 'size_unit']] = df_prod_cln['product_size'].apply(
    extract_size_details)
# Conversion factors from each known unit to millilitres
# NOTE(review): 29.5735 is the US fluid-ounce factor, but the sample rows are
# pretzels and cereal, where "oz" is presumably a weight - confirm that
# treating these sizes as volumes is intended.
conversion_rates = {"ml": 1, "oz": 29.5735, "lt": 1000}

# Vectorised conversion: look up each unit's factor and multiply by the value.
# Units missing from conversion_rates (or missing values) produce NaN.
df_prod_cln['size_in_ml'] = (
    pd.to_numeric(df_prod_cln['size_value'], errors='coerce')
    * df_prod_cln['size_unit'].map(conversion_rates)
)
# Drop the original product_size column since it's no longer needed
df_prod_cln = df_prod_cln.drop(columns=['product_size'])
# Debug: Check conversion results
print(df_prod_cln[['size_value', 'size_unit', 'size_in_ml']].head())
# Verify if any unknown units exist (parsed units with no conversion factor)
unknown_units = df_prod_cln[~df_prod_cln['size_unit'].isin(conversion_rates.keys()) &
                            df_prod_cln['size_unit'].notna()]
if not unknown_units.empty:
    print("\n Unknown Units Found:\n", unknown_units['size_unit'].unique())

   size_value size_unit  size_in_ml
0       15.00        oz  443.602500
1       15.00        oz  443.602500
2       15.00        oz  443.602500
6       12.25        oz  362.275375
7       20.00        oz  591.470000


In [121]:
# Number of distinct non-null entries in each derived size column
unique_size_value, unique_size_unit, unique_size_in_ml = (
    df_prod_cln[column].nunique(dropna=True)
    for column in ('size_value', 'size_unit', 'size_in_ml')
)

# Report the three counts
print(f"Unique size_value count: {unique_size_value}")
print(f"Unique size_unit count: {unique_size_unit}")
print(f"Unique size_in_ml count: {unique_size_in_ml}")

Unique size_value count: 25
Unique size_unit count: 1
Unique size_in_ml count: 25


In [122]:
# Debug: preview the first five rows to confirm product_size is gone
print(df_prod_cln.head(n=5))

          upc               description   manufacturer     category  \
0  1111009477    pl mini twist pretzels  private label   bag snacks   
1  1111009497         pl pretzel sticks  private label   bag snacks   
2  1111009507         pl twist pretzels  private label   bag snacks   
6  1111085319  pl honey nut toastd oats  private label  cold cereal   
7  1111085345            pl raisin bran  private label  cold cereal   

        sub_category  size_value size_unit  size_in_ml  
0           pretzels       15.00        oz  443.602500  
1           pretzels       15.00        oz  443.602500  
2           pretzels       15.00        oz  443.602500  
6  all family cereal       12.25        oz  362.275375  
7       adult cereal       20.00        oz  591.470000  


In [123]:
# Print the per-column count of missing values for each cleaned dataframe,
# with a dashed separator between consecutive reports
null_reports = [
    ("Missing values in df_stores:", df_stores),
    ("\nMissing values in df_prod_cln:", df_prod_cln),
    ("\nMissing values in df_trans_cln:", df_trans_cln),
]
for position, (heading, frame) in enumerate(null_reports):
    if position:
        print('------------------------')
    print(heading)
    print(frame.isnull().sum())

Missing values in df_stores:
store_id               0
store_name             0
city                   0
state                  0
msa                    0
segment                0
parking               52
size                   0
avg_weekly_baskets     0
unnamed: 9            79
unnamed: 10           76
dtype: int64
------------------------

Missing values in df_prod_cln:
upc             0
description     0
manufacturer    0
category        0
sub_category    0
size_value      0
size_unit       0
size_in_ml      0
dtype: int64
------------------------

Missing values in df_trans_cln:
week_end_date      0
store_num          0
upc                0
units              0
visits             0
hhs                0
spend              0
price             10
base_price       173
feature            0
display            0
tpr_only           0
dtype: int64


In [124]:
# Drop 'parking' from df_stores: it is missing for most stores
# (only about 35% of rows have a parking value).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# safe to re-run once the column has already been removed.
df_stores = df_stores.drop(columns=['parking'], errors='ignore')