# Step 2a: Data Cleaning, Pt. 2

In [1]:
# Import necessary libraries
import pandas as pd

## A. Grocery Market Data

In [2]:
# Read the cleaned USDA CSV into a DataFrame.
usda_df = pd.read_csv('cleaned_usda_data.csv')

# Preview the first five rows.
usda_df.head()

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,organic,specialty,unit,region,weighted_average_price,price_low,price_high
0,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NORTHEAST U.S.,,,
1,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NORTHEAST U.S.,,,
2,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NATIONAL,,,
3,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NATIONAL,,,
4,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,ALASKA,,,


In [None]:
# How much of level_2 is missing, as a percentage of all rows?

level_2_missing = usda_df['level_2'].isnull()
nan_percentage = level_2_missing.sum() / level_2_missing.size * 100
print(nan_percentage)


8.058644049437694


### i. A deep dive on the organic column

In [None]:
# Same question for organic: what percentage of rows is NaN?

organic_missing = usda_df['organic'].isnull()
nan_percentage = organic_missing.sum() / organic_missing.size * 100
print(nan_percentage)


56.39868954924173


In [5]:
# Which distinct values (including NaN) appear in the organic column?

usda_df['organic'].unique()

array(['NO', 'YES', nan], dtype=object)

In [None]:
# Percentage of all rows explicitly flagged organic == 'YES'.
# The denominator is every row, so NaN rows count as "not YES".

is_organic = usda_df['organic'].eq('YES')
organic_pct = is_organic.sum() / len(usda_df) * 100
print(organic_pct)

3.4316235495295078


In [10]:
# Pull out the organic == 'YES' rows to see whether other columns label them
# any differently, and what types of products they are.

organic_df = usda_df.loc[usda_df['organic'].eq('YES')]

organic_df.head(5)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,organic,specialty,unit,region,weighted_average_price,price_low,price_high
58,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,YES,NO,,ALASKA,,,
64,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,YES,NO,,SOUTHWEST U.S.,,,
65,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,YES,NO,,SOUTHWEST U.S.,,,
66,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,YES,NO,,MIDWEST U.S.,,,
67,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,YES,NO,,MIDWEST U.S.,,,


In [12]:
# Load the other two cleaned datasets to check whether either one has data on
# organic products specifically.

bls_df, costco_df = (
    pd.read_csv(csv_name)
    for csv_name in ('cleaned_bls_cpi_data.csv', 'cleaned_costco_data.csv')
)

In [13]:
# Preview the first five rows of the BLS CPI table.
bls_df.head()

Unnamed: 0,product,unadjusted_percent_change_2024,seasonally_adjusted_effect_2024,unadjusted_effect_2024
0,All items,2.9,,
1,Food,2.5,0.042,0.34
2,Food at home,1.8,0.026,0.145
3,Cereals and bakery products,0.8,0.012,0.008
4,Cereals and cereal products,1.7,0.004,0.005


In [None]:
# Case-insensitive search of the BLS product names for 'organic'
# (a NaN product name counts as "no match").

bls_mentions_organic = bls_df['product'].str.contains('organic', case=False, na=False)
bls_organic_filtered_df = bls_df.loc[bls_mentions_organic]

bls_organic_filtered_df.head(5)

# Result: no rows match.

Unnamed: 0,product,unadjusted_percent_change_2024,seasonally_adjusted_effect_2024,unadjusted_effect_2024


In [15]:
# Preview the first five rows of the Costco product table.
costco_df.head()

Unnamed: 0,sub_category,price,title
0,Bakery & Desserts,56.99,"David’s Cookies Mile High Peanut Butter Cake, ..."
1,Bakery & Desserts,159.99,"The Cake Bake Shop 8"" Round Carrot Cake (16-22..."
2,Bakery & Desserts,44.99,"St Michel Madeleine, Classic French Sponge Cak..."
3,Bakery & Desserts,39.99,"David's Cookies Butter Pecan Meltaways 32 oz, ..."
4,Bakery & Desserts,59.99,"David’s Cookies Premier Chocolate Cake, 7.2 lb..."


In [None]:
# Flag Costco rows whose sub_category OR title mentions 'organic'
# (case-insensitive; NaN counts as "no match").

organic_in_sub_category = costco_df['sub_category'].str.contains('organic', case=False, na=False)
organic_in_title = costco_df['title'].str.contains('organic', case=False, na=False)
costco_organic_filtered_df = costco_df.loc[organic_in_sub_category | organic_in_title]

costco_organic_filtered_df.head(5)

# So costco_df DOES contain organic products.

Unnamed: 0,sub_category,price,title
35,Beverages & Water,9.99,"Kirkland Signature, Organic Almond Beverage, V..."
37,Beverages & Water,21.99,"Kirkland Signature, Organic Reduced Fat Chocol..."
46,Beverages & Water,12.99,"SO Delicious, Organic Coconut Milk, 32 oz, 6-C..."
52,Beverages & Water,21.99,"Kirkland Signature, Organic Coconut Water, 33...."
57,Beverages & Water,21.99,"Horizon, Organic Whole Milk, 8 oz, 18-Count"


In [18]:
# Percentage of costco_df rows whose title mentions 'organic' (case-insensitive).
# NOTE(review): this checks `title` only, whereas the filter that builds
# costco_organic_filtered_df also checks `sub_category`.
# NOTE(review): this reuses the name `organic_pct`, overwriting the USDA figure
# computed earlier in the notebook.

title_has_organic = costco_df['title'].str.contains('organic', case=False, na=False)
organic_pct = title_has_organic.sum() / len(costco_df) * 100

print(organic_pct)

8.487084870848708


### ii. A deep dive on the specialty column

In [None]:
# Which distinct values (including NaN) appear in the specialty column?
usda_df['specialty'].unique()

array(['NO ', 'YES', nan], dtype=object)

In [22]:
# What percentage of the specialty column is NaN?

specialty_missing = usda_df['specialty'].isnull()
nan_percentage = specialty_missing.sum() / specialty_missing.size * 100
print(nan_percentage)

58.29050461583623


In [26]:
# Percentage of all rows explicitly flagged specialty == 'YES'.
# The denominator is every row, so NaN rows count as "not YES".

is_specialty = usda_df['specialty'].eq('YES')
specialty_pct = is_specialty.sum() / len(usda_df) * 100
print(specialty_pct)

7.078459441625943


In [25]:
# Pull out the specialty == 'YES' rows to see what kinds of products they are.

specialty_df = usda_df.loc[usda_df['specialty'].eq('YES')]

specialty_df.head(5)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,organic,specialty,unit,region,weighted_average_price,price_low,price_high
52,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,NO,YES,,HAWAII,,,
53,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,NO,YES,,HAWAII,,,
59,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,NO,YES,,ALASKA,,,
74,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,NO,YES,,NORTHWEST U.S.,,,
75,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,NO,YES,,NORTHWEST U.S.,,,


### Takeaways: 
* I think I should filter everything marked as 'organic', which is 3.43% of the rows, from the usda_df. (The 8.49% figure above is the organic share of the costco_df, not the usda_df.)
* I think I should filter everything marked as 'specialty', which is 7.08%, from the usda_df.
* I think I should then drop the organic and specialty columns. 

In [None]:
# Keep only the rows that are neither organic nor specialty.
# Rows where a flag is NaN are kept, because NaN is not equal to 'YES'.

flagged_organic = usda_df['organic'].eq('YES')
flagged_specialty = usda_df['specialty'].eq('YES')
usda_df = usda_df[~(flagged_organic | flagged_specialty)]

usda_df.head(5)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,organic,specialty,unit,region,weighted_average_price,price_low,price_high
0,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NORTHEAST U.S.,,,
1,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NORTHEAST U.S.,,,
2,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NATIONAL,,,
3,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,NATIONAL,,,
4,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,NO,NO,,ALASKA,,,


In [29]:
# Drop the organic & specialty columns: the 'YES' rows were filtered out in the
# previous step, so the columns no longer carry useful information.
# The result is reassigned instead of using inplace=True, which avoids mutating
# a frame that was itself produced by a boolean filter. `columns=` already
# selects the column axis, so the redundant axis=1 is dropped.

usda_df = usda_df.drop(columns=['organic', 'specialty'])

usda_df.head()

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
0,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NORTHEAST U.S.,,,
1,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NORTHEAST U.S.,,,
2,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NATIONAL,,,
3,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NATIONAL,,,
4,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,ALASKA,,,


### iii. The level_2 column

In [31]:
# What percentage of level_2 is NaN now that organic/specialty rows are gone?

level_2_missing = usda_df['level_2'].isnull()
nan_percentage = level_2_missing.sum() / level_2_missing.size * 100
print(nan_percentage)

6.819216736542158


In [None]:
# Should I filter out everything that's NaN here?
# List the distinct level_2 values (including NaN) to help decide.

usda_df['level_2'].unique()

# I actually think this information is necessary enough that it might be a good idea to filter out NaNs

array([nan, 'PREPARED', 'FRESH/FROZEN/PREPARED', 'FRESH', 'FROZEN',
       'FRESH/FROZEN', 'FILET MIGNON', 'TENDERLOIN', 'FLANK STEAK',
       'PORTERHOUSE STEAK', 'BNLS RIBEYE STEAK',
       'BNLS NEW YORK STRIP STEAK', 'BNLS RIBEYE ROAST',
       'BONE-IN STRIP STEAK', 'BONE-IN RIBEYE STEAK',
       'BNLS TOP SIRLOIN STEAK', 'SKIRT STEAK', 'BNLS SIRLOIN STEAK',
       'BRISKET, FLAT', 'T-BONE STEAK', 'LOIN, BNLS', 'BEEF SHORT RIBS',
       'TOP ROUND STEAK', 'BNLS TOP SIRLOIN ROAST', 'SIRLOIN STEAK',
       'MINUTE/CUBE STEAKS', 'SIRLOIN TIP STEAK', 'EYE OF ROUND STEAK',
       'STEW MEAT', 'TRI-TIP', 'TOP ROUND ROAST', 'BEEF PATTIES',
       'BOTTOM ROUND ROAST', 'SIRLOIN TIP ROAST', 'LONDON BROIL',
       'EYE OF ROUND ROAST', 'GROUND SIRLOIN', 'GROUND BEEF 90% OR MORE',
       'CHUCK/SHLDR/ARM ROAST', 'BOTTOM ROUND STEAK',
       'CHUCK/SHLDR/ARM STEAK', 'RUMP ROAST', 'GROUND ROUND', 'BRISKET',
       'BEEF BACKRIBS', 'GROUND BEEF 80-89%', 'GROUND CHUCK',
       'GROUND BEEF 70-79

In [34]:
# Example check: which egg rows have no level_2 information?
# level_1 is lower-cased before comparing, so the match is case-insensitive.

is_eggs = usda_df['level_1'].str.lower().eq('eggs')
lacks_level_2 = usda_df['level_2'].isnull()
eggs_check_df = usda_df.loc[is_eggs & lacks_level_2]

eggs_check_df.head(5)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
0,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NORTHEAST U.S.,,,
1,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NORTHEAST U.S.,,,
2,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NATIONAL,,,
3,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,NATIONAL,,,
4,"DES MOINES, IA",2024-08-30,POULTRY,EGGS,,,ALASKA,,,


In [36]:
# Decision: keep only the rows that have a level_2 value.

has_level_2 = usda_df['level_2'].notna()
usda_df = usda_df[has_level_2]

usda_df.head(5)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
28,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,,HAWAII,,,
29,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,,HAWAII,,,
30,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,FRESH/FROZEN/PREPARED,,HAWAII,,,
31,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,FRESH/FROZEN/PREPARED,,HAWAII,,,
32,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,,ALASKA,,,


### iv. The unit column

In [37]:
# What percentage of the unit column is NaN?
unit_missing = usda_df['unit'].isnull()
nan_percentage = unit_missing.sum() / unit_missing.size * 100
print(nan_percentage)

6.678188798372739


In [38]:
# Keep only products that say something about how they're packaged / sold,
# so they can be compared correctly against the other datasets.

usda_df = usda_df.dropna(subset=['unit'])

usda_df.head(5)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
167,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,HAWAII,7.12,7.12,7.12
168,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,HAWAII,4.99,4.99,4.99
170,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,ALASKA,6.96,4.99,9.49
171,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,ALASKA,3.48,3.48,3.48
172,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,NORTHWEST U.S.,7.11,6.99,7.12


### v. The region column

In [None]:
# Is region populated for every remaining row?

region_missing = usda_df['region'].isnull()
nan_percentage = region_missing.sum() / region_missing.size * 100
print(nan_percentage)

# 0% missing, so yes: every row has a region.

0.0


### vi. The weighted_average_price column

In [40]:
# Is weighted_average_price populated for every remaining row?

price_missing = usda_df['weighted_average_price'].isnull()
nan_percentage = price_missing.sum() / price_missing.size * 100
print(nan_percentage)

1.6472469972746298e-05


In [41]:
# Drop anything without a weighted average price, since that is the value the
# analysis will probably rely on most heavily.

has_weighted_price = usda_df['weighted_average_price'].notna()
usda_df = usda_df[has_weighted_price]

usda_df.head(5)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
167,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,HAWAII,7.12,7.12,7.12
168,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,HAWAII,4.99,4.99,4.99
170,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,ALASKA,6.96,4.99,9.49
171,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,ALASKA,3.48,3.48,3.48
172,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,NORTHWEST U.S.,7.11,6.99,7.12


In [None]:
# Do the price_low or price_high columns contain any NaNs?

price_low_missing = usda_df['price_low'].isnull()
price_high_missing = usda_df['price_high'].isnull()

nan_low_pct = price_low_missing.sum() / len(usda_df) * 100
nan_high_pct = price_high_missing.sum() / len(usda_df) * 100

print(nan_low_pct)
print(nan_high_pct)

# Seems odd that the two percentages are exactly the same...

24.299911674601457
24.299911674601457


In [47]:
# Inspect the rows that have no price_low value.
price_check_df = usda_df.loc[usda_df['price_low'].isnull()]

price_check_df.head(n=100)

# It appears the NaN %s for price_low and price_high are exactly the same because
# a product that lacks data for one also lacks it for the other.

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
260,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,NATIONAL,6.94,,
261,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,NATIONAL,4.96,,
262,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,NATIONAL,4.36,,
263,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,,NATIONAL,6.99,,
264,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,,NATIONAL,4.29,,
...,...,...,...,...,...,...,...,...,...,...
474,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,TRAY REGULAR,MIDWEST U.S.,9.28,,
475,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,TRAY VALUE,MIDWEST U.S.,3.56,,
476,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,TRAY REGULAR,MIDWEST U.S.,4.17,,
477,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,TRAY,MIDWEST U.S.,3.99,,


In [50]:
# Off-topic: some unit cells look empty even though NaNs were already removed.
# Treat whitespace-only / empty strings as missing and drop those rows.

unit_has_text = usda_df['unit'].str.strip().ne('')
usda_df = usda_df[unit_has_text]

usda_df.head(5)

# That appears to fix it.

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
167,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,HAWAII,7.12,7.12,7.12
168,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,HAWAII,4.99,4.99,4.99
170,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,ALASKA,6.96,4.99,9.49
171,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,ALASKA,3.48,3.48,3.48
172,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,NORTHWEST U.S.,7.11,6.99,7.12


In [51]:
# Back to price_low / price_high: re-inspect the rows with no price_low value
# now that blank-unit rows have been removed.

price_check_df = usda_df.loc[usda_df['price_low'].isnull()]

price_check_df.head(n=100)

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price,price_low,price_high
260,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,NATIONAL,6.94,,
261,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,NATIONAL,4.96,,
262,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,NATIONAL,4.36,,
275,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,FRESH,TRAY,NATIONAL,5.99,,
276,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,FRESH,TRAY,NATIONAL,4.82,,
...,...,...,...,...,...,...,...,...,...,...
499,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,TRAY VALUE,SOUTHEAST U.S.,2.19,,
500,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,TRAY REGULAR,SOUTHEAST U.S.,3.36,,
501,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FRESH,TRAY REGULAR,SOUTHEAST U.S.,2.80,,
504,"DES MOINES, IA",2024-08-30,POULTRY,CHICKEN,FROZEN,BAGGED,NORTHEAST U.S.,2.49,,


In [53]:
# A weighted average is sufficient for this analysis, so drop the other two
# price columns (price_low and price_high).
# The result is reassigned instead of using inplace=True, which avoids mutating
# a frame that was itself produced by boolean filtering. `columns=` already
# selects the column axis, so the redundant axis=1 is dropped.

usda_df = usda_df.drop(columns=['price_low', 'price_high'])

usda_df.head()

Unnamed: 0,issuing_office,report_date,program,level_1,level_2,unit,region,weighted_average_price
167,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,HAWAII,7.12
168,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,HAWAII,4.99
170,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,ALASKA,6.96
171,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,7-10 OUNCES,ALASKA,3.48
172,"DES MOINES, IA",2024-08-30,POULTRY,TURKEY,PREPARED,16 OUNCES,NORTHWEST U.S.,7.11


### vii. The region column, revisited

In [None]:
# How many different regions are listed?
# (nunique() counts distinct values, excluding NaN by default.)

usda_df['region'].nunique()

# 9?  That's actually really manageable.  *thumbs up*

9

### viii. The report_date column, revisited

In [56]:
# What are the earliest and latest dates in this dataset, again?

# Parse report_date into datetimes. .assign() builds a new DataFrame instead of
# writing a column into usda_df in place; usda_df was produced by boolean
# filtering earlier, and in-place column assignment on such a frame can trigger
# pandas' SettingWithCopyWarning.
usda_df = usda_df.assign(report_date=pd.to_datetime(usda_df['report_date']))

# NOTE(review): `earleist_date` is a misspelling of "earliest"; the name is kept
# unchanged in case later cells reference it.
earleist_date = usda_df['report_date'].min()
latest_date = usda_df['report_date'].max()

print(earleist_date)
print(latest_date)


2020-01-03 00:00:00
2024-08-30 00:00:00


In [57]:
# How many rows does each report date have (i.e. are we getting consistent
# amounts of data over time)?
# Note: value_counts() orders the result by count (largest first), not by date.

report_dates = usda_df['report_date']
date_counts = report_dates.value_counts(sort=True, ascending=False)

print(date_counts)

report_date
2021-01-22    59629
2021-03-12    58017
2020-02-14    56889
2020-02-07    56514
2020-02-28    56372
              ...  
2024-08-16     5731
2024-08-23     4905
2021-01-20     4900
2021-01-11     4459
2024-08-30     3920
Name: count, Length: 251, dtype: int64
