# Step 1: Data Collection & Setup

In [7]:
# Import necessary libraries
import pandas as pd
import openpyxl

## A. Grocery Market Data 

This dataset was taken from the U.S. Department of Agriculture Agricultural Marketing Service's (AMS) [Market News Report](https://marketnews.usda.gov/mnp/dataDownload"), which combines data from various AMS Market News reporting categories, including Dairy & milk, Fruits, vegetables, & specialty crops, Livestock, meats, poultry, eggs, grain, & hay, Organic, and Local & regional foods into a single dataset.

In [None]:
# Create a dataframe from the USDA AMS Market News Report
usda_ams_retail_df = pd.read_csv('MNRetailDatasetCSV.CSV')

  usda_ams_retail_df = pd.read_csv('MNRetailDatasetCSV.CSV')


In [8]:
# Get an overview of the df with a .head()
usda_ams_retail_df.head(5)

Unnamed: 0,LEVEL_OF_TRADE,FREQUENCY,ISSUING_OFFICE,REPORT_DATE,PROGRAM,LEVEL_1,LEVEL_2,LEVEL_3,LEVEL_4,LEVEL_5,...,STORES_WITH_ADS,STORE_OUTLETS,FEATURE_RATE,SPECIAL_RATE,ACTIVITY_INDEX,LOCALLY_GROWN_PERCENTAGE,WEIGHTED_AVERAGE_PRICE,PRICE_LOW,PRICE_HIGH,PRODUCT_QUALITY
0,RETAIL,WEEKLY,"DES MOINES, IA",30-AUG-24,POULTRY,EGGS,,LIQUID,,,...,,5500.0,,,65.0,,,,,
1,RETAIL,WEEKLY,"DES MOINES, IA",30-AUG-24,POULTRY,EGGS,,LIQUID,,,...,,5500.0,1.2,,,,,,,
2,RETAIL,WEEKLY,"DES MOINES, IA",30-AUG-24,POULTRY,EGGS,,LIQUID,,,...,,29200.0,,,65.0,,,,,
3,RETAIL,WEEKLY,"DES MOINES, IA",30-AUG-24,POULTRY,EGGS,,LIQUID,,,...,,29200.0,0.2,,,,,,,
4,RETAIL,WEEKLY,"DES MOINES, IA",30-AUG-24,POULTRY,EGGS,,SHELL,ALL SHELL,,...,,100.0,,11.0,,,,,,


In [7]:
# Get more detail on what columns are in the df
usda_ams_retail_df.columns

Index(['LEVEL_OF_TRADE', 'FREQUENCY', 'ISSUING_OFFICE', 'REPORT_DATE',
       'PROGRAM', 'LEVEL_1', 'LEVEL_2', 'LEVEL_3', 'LEVEL_4', 'LEVEL_5',
       'ORGANIC', 'SPECIALTY', 'UNIT', 'REGION', 'STORES_WITH_ADS',
       'STORE_OUTLETS', 'FEATURE_RATE', 'SPECIAL_RATE', 'ACTIVITY_INDEX',
       'LOCALLY_GROWN_PERCENTAGE', 'WEIGHTED_AVERAGE_PRICE', 'PRICE_LOW',
       'PRICE_HIGH', 'PRODUCT_QUALITY'],
      dtype='object')

In [12]:
# I'm seeing a lot of NaNs.  What % of each column is NaN?
nan_percentage = usda_ams_retail_df.isna().sum() / len(usda_ams_retail_df) * 100
print(nan_percentage)

LEVEL_OF_TRADE               0.000000
FREQUENCY                    0.000000
ISSUING_OFFICE               0.000000
REPORT_DATE                  0.000000
PROGRAM                      0.000000
LEVEL_1                      0.097223
LEVEL_2                      8.058644
LEVEL_3                     66.054698
LEVEL_4                     66.778009
LEVEL_5                     87.909288
ORGANIC                     56.398690
SPECIALTY                   58.290505
UNIT                        11.750140
REGION                       0.000000
STORES_WITH_ADS             11.854112
STORE_OUTLETS               88.249860
FEATURE_RATE                93.781300
SPECIAL_RATE                97.072425
ACTIVITY_INDEX              93.617837
LOCALLY_GROWN_PERCENTAGE    99.917057
WEIGHTED_AVERAGE_PRICE      11.750159
PRICE_LOW                   33.093714
PRICE_HIGH                  33.048509
PRODUCT_QUALITY             44.232302
dtype: float64


### Takeaways:
* This dataset appears to have the requisite raw data. 
* This dataset is large, but it will benefit from dropping unncessary columns and filtering out rows with empty values.

## B. Inflation Trend Data

This dataset was taken from the Bureau of Labor Statistics Consumer Price Index (CPI) for [All Urban Consumers](https://www.bls.gov/cpi/tables/supplemental-files/). 

In [9]:
# Create a dataframe from the BLS's CPI for All Urban Consumers
bls_cpi_df = pd.read_excel('cpi-u-202412.xlsx')

In [10]:
# Get an overview of the dataframe with a .head()
bls_cpi_df.head()

Unnamed: 0.1,Unnamed: 0,"Consumer Price Index for All Urban Consumers (CPI-U): U.S. city average, by expenditure category, December 2024",Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43
0,,"[1982-84=100, unless otherwise noted]",,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,Indent Level,Expenditure category,Relative\nimportance\nNov.\n2024,Unadjusted indexes,Unadjusted indexes,Unadjusted indexes,Unadjusted indexes,Unadjusted indexes,Unadjusted indexes,Unadjusted indexes,...,Seasonally adjusted percent change,Seasonally adjusted percent change,One Month,One Month,One Month,One Month,Twelve Month,Twelve Month,Twelve Month,Twelve Month
3,,,,Dec.\n2023,Jan.\n2024,Feb.\n2024,Mar.\n2024,Apr.\n2024,May\n2024,Jun.\n2024,...,Oct.\n2024-\nNov.\n2024,Nov.\n2024-\nDec.\n2024,Seasonally adjusted effect on All Items\nNov. ...,"Standard error, median price change(2)",Largest (L) or Smallest (S) seasonally adjuste...,Largest (L) or Smallest (S) seasonally adjuste...,Unadjusted effect on All Items\nDec. 2023-\nDe...,"Standard error, median price change(2)",Largest (L) or Smallest (S) unadjusted change ...,Largest (L) or Smallest (S) unadjusted change ...
4,,,,,,,,,,,...,,,,,Date,Percent change,,,Date,Percent change
