In [22]:
import pandas as pd 
from pathlib import Path

In [30]:
# Paths (relative to notebooks)
RAW_DATA_DIR = Path("D:\\local_food_wastage\\data")
CLEAN_DATA_DIR = Path("D:\\local_food_wastage\\data")

In [31]:
#load raw datasets
providers = pd.read_csv(RAW_DATA_DIR / "D:\\local_food_wastage\\data\\providers_data.csv")
receivers = pd.read_csv(RAW_DATA_DIR / "D:\\local_food_wastage\\data\\receivers_data.csv" )
food_listings = pd.read_csv(RAW_DATA_DIR / "D:\\local_food_wastage\\data\\food_listings_data.csv")
claims = pd.read_csv(RAW_DATA_DIR / "D:\\local_food_wastage\\data\\claims_data.csv")

print("Original Data Shapes")
print("Providers:", providers.shape)
print("Receivers:", receivers.shape)
print("Food Listings:", food_listings.shape)
print("Claims:", claims.shape)

Original Data Shapes
Providers: (1000, 6)
Receivers: (1000, 5)
Food Listings: (1000, 9)
Claims: (1000, 5)


In [32]:
#clean Providers
# Remove duplicates by Provider_ID (if column exists)
if "Provider_ID" in providers.columns:
    providers = providers.drop_duplicates(subset="Provider_ID")
    
#Fill missing Contact
if"Contact" in providers.columns:
    providers["Contact"] = providers["Contact"].fillna("Not Provided")

# Strip extra spaces from string columns
providers = providers.applymap(lambda x: x.strip() if isinstance(x, str) else x)

providers.head(), providers.dtypes

  providers = providers.applymap(lambda x: x.strip() if isinstance(x, str) else x)


(   Provider_ID                         Name           Type  \
 0            1             Gonzales-Cochran    Supermarket   
 1            2  Nielsen, Johnson and Fuller  Grocery Store   
 2            3                 Miller-Black    Supermarket   
 3            4   Clark, Prince and Williams  Grocery Store   
 4            5               Coleman-Farley  Grocery Store   
 
                                              Address            City  \
 0  74347 Christopher Extensions\nAndreamouth, OK ...     New Jessica   
 1           91228 Hanson Stream\nWelchtown, OR 27136     East Sheena   
 2  561 Martinez Point Suite 507\nGuzmanchester, W...  Lake Jesusview   
 3     467 Bell Trail Suite 409\nPort Jesus, IA 61188     Mendezmouth   
 4  078 Matthew Creek Apt. 319\nSaraborough, MA 53978   Valentineside   
 
                 Contact  
 0       +1-600-220-0480  
 1  +1-925-283-8901x6297  
 2      001-517-295-2206  
 3      556.944.8935x401  
 4          193.714.6577  ,
 Provider_ID     

In [26]:
#clean Receivers
if "Receiver_ID" in receivers.columns:
    receivers = receivers.drop_duplicates(subset="Receiver_ID")

if "Contact" in receivers.columns:
    receivers["Contact"] = receivers["Contact"].fillna("Not Provided")

receivers = receivers.applymap(lambda x: x.strip() if isinstance(x, str) else x)

receivers.head(), receivers.dtypes

  receivers = receivers.applymap(lambda x: x.strip() if isinstance(x, str) else x)


(   Receiver_ID          Name        Type               City           Contact
 0            1  Donald Gomez     Shelter     Port Carlburgh     (955)922-5295
 1            2  Laurie Ramos  Individual         Lewisburgh      761.042.1570
 2            3  Ashley Mckee         NGO  South Randalltown  691-023-0094x856
 3            4    Erika Rose         NGO   South Shaneville        8296491111
 4            5   John Romero  Individual          Bakerport      067.491.0154,
 Receiver_ID     int64
 Name           object
 Type           object
 City           object
 Contact        object
 dtype: object)

In [33]:
#clean Food Listings
# Parse dates
if "Expiry_Date" in food_listings.columns:
    food_listings["Expiry_Date"] = pd.to_datetime(food_listings["Expiry_Date"], errors="coerce")

# Quantities to int
if "Quantity" in food_listings.columns:
    food_listings["Quantity"] = (
        pd.to_numeric(food_listings["Quantity"], errors="coerce")
        .fillna(0)
        .astype(int)
    )

# De-duplicate by Food_ID
if "Food_ID" in food_listings.columns:
    food_listings = food_listings.drop_duplicates(subset="Food_ID")

food_listings = food_listings.applymap(lambda x: x.strip() if isinstance(x, str) else x)

food_listings.head(), food_listings.dtypes


  food_listings = food_listings.applymap(lambda x: x.strip() if isinstance(x, str) else x)


(   Food_ID Food_Name  Quantity Expiry_Date  Provider_ID     Provider_Type  \
 0        1     Bread        43  2025-03-17          110     Grocery Store   
 1        2      Soup        22  2025-03-24          791     Grocery Store   
 2        3    Fruits        46  2025-03-28          478  Catering Service   
 3        4    Fruits        15  2025-03-16          930        Restaurant   
 4        5      Soup        14  2025-03-19          279        Restaurant   
 
            Location       Food_Type  Meal_Type  
 0  South Kellyville  Non-Vegetarian  Breakfast  
 1        West James  Non-Vegetarian     Dinner  
 2       Lake Regina           Vegan  Breakfast  
 3         Kellytown           Vegan      Lunch  
 4        Garciaport           Vegan     Dinner  ,
 Food_ID                   int64
 Food_Name                object
 Quantity                  int64
 Expiry_Date      datetime64[ns]
 Provider_ID               int64
 Provider_Type            object
 Location                 objec

In [34]:
#clean Claims
if "Timestamp" in claims.columns:
    claims["Timestamp"] = pd.to_datetime(claims["Timestamp"], errors="coerce")

if "Claim_ID" in claims.columns:
    claims = claims.drop_duplicates(subset="Claim_ID")

claims = claims.applymap(lambda x: x.strip() if isinstance(x, str) else x)

claims.head(), claims.dtypes


  claims = claims.applymap(lambda x: x.strip() if isinstance(x, str) else x)


(   Claim_ID  Food_ID  Receiver_ID     Status           Timestamp
 0         1      164          908    Pending 2025-03-05 05:26:00
 1         2      353          391  Cancelled 2025-03-11 10:24:00
 2         3      626          492  Completed 2025-03-21 00:59:00
 3         4       61          933  Cancelled 2025-03-04 09:08:00
 4         5      345          229    Pending 2025-03-14 15:17:00,
 Claim_ID                int64
 Food_ID                 int64
 Receiver_ID             int64
 Status                 object
 Timestamp      datetime64[ns]
 dtype: object)

In [35]:
#save cleaned CSVs
providers.to_csv(CLEAN_DATA_DIR / "providers_clean.csv", index=False)
receivers.to_csv(CLEAN_DATA_DIR / "receivers_clean.csv", index=False)
food_listings.to_csv(CLEAN_DATA_DIR / "food_listings_clean.csv", index=False)
claims.to_csv(CLEAN_DATA_DIR / "claims_clean.csv", index=False)

print("\n Cleaning complete! Clean files saved in /data:")
for f in ["providers_clean.csv","receivers_clean.csv","food_listings_clean.csv","claims_clean.csv"]:
    print("-", f)


 Cleaning complete! Clean files saved in /data:
- providers_clean.csv
- receivers_clean.csv
- food_listings_clean.csv
- claims_clean.csv
