In [7]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler 
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Read the four raw tables from CSV files in the working directory.
# The unpacking order matches the order of the file names below.
providers, receivers, listings, claims = (
    pd.read_csv(csv_path)
    for csv_path in (
        "providers_data.csv",
        "receivers_data.csv",
        "food_listings_data.csv",
        "claims_data.csv",
    )
)

In [29]:
# Quick schema check: for every raw table print its column count, the
# column names and the first three rows.
raw_tables = {
    'claims': claims,
    'listings': listings,
    'providers': providers,
    'receivers': receivers,
}
for label, frame in raw_tables.items():
    n_columns = len(frame.columns)
    print(f"\n=== {label} columns ({n_columns}) ===")
    print(frame.columns.tolist())
    print(f"sample:\n{frame.head(3)}")


=== claims columns (5) ===
['Claim_ID', 'Food_ID', 'Receiver_ID', 'Status', 'Timestamp']
sample:
   Claim_ID  Food_ID  Receiver_ID     Status        Timestamp
0         1      164          908    Pending    3/5/2025 5:26
1         2      353          391  Cancelled  3/11/2025 10:24
2         3      626          492  Completed   3/21/2025 0:59

=== listings columns (9) ===
['Food_ID', 'Food_Name', 'Quantity', 'Expiry_Date', 'Provider_ID', 'Provider_Type', 'Location', 'Food_Type', 'Meal_Type']
sample:
   Food_ID Food_Name  Quantity Expiry_Date  Provider_ID     Provider_Type  \
0        1     Bread        43   3/17/2025          110     Grocery Store   
1        2      Soup        22   3/24/2025          791     Grocery Store   
2        3    Fruits        46   3/28/2025          478  Catering Service   

           Location       Food_Type  Meal_Type  
0  South Kellyville  Non-Vegetarian  Breakfast  
1        West James  Non-Vegetarian     Dinner  
2       Lake Regina           Vegan  B

In [31]:
def normalize_cols(df):
    """Return a copy of ``df`` whose column labels are normalised.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and every run of whitespace and/or hyphens is collapsed
    into a single underscore. The input frame is left untouched.
    """
    labels = df.columns.astype(str)
    labels = labels.str.strip().str.lower()
    labels = labels.str.replace(r'[\s\-]+', '_', regex=True)

    normalized = df.copy()
    normalized.columns = labels
    return normalized

# Rebind all four tables to copies with normalised column labels.
claims, listings, providers, receivers = (
    normalize_cols(table) for table in (claims, listings, providers, receivers)
)

# check again
print("Normalized columns:")
for label, table in (("claims:", claims), ("listings:", listings)):
    print(label, table.columns.tolist())


Normalized columns:
claims: ['claim_id', 'food_id', 'receiver_id', 'status', 'timestamp']
listings: ['food_id', 'food_name', 'quantity', 'expiry_date', 'provider_id', 'provider_type', 'location', 'food_type', 'meal_type']


In [33]:
def find_candidates(df, keyword):
    """Return the column labels of ``df`` that contain ``keyword``.

    Matching is a case-insensitive substring test performed on the string
    form of each label, so non-string labels (e.g. integer column names)
    no longer raise ``TypeError`` and un-normalised headers such as
    ``Food_ID`` are still found when searching for ``'food'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are searched.
    keyword : str
        Substring to look for.

    Returns
    -------
    list
        Matching labels, unmodified and in their original column order.
    """
    needle = str(keyword).lower()
    return [c for c in df.columns if needle in str(c).lower()]

# List, per table, the columns that could serve as join keys.
print("Possible food-id columns:")
for label, table, keyword in (
    ("claims:", claims, 'food'),
    ("listings:", listings, 'food'),
    ("providers (id-like):", providers, 'id'),
    ("receivers (id-like):", receivers, 'id'),
):
    print(label, find_candidates(table, keyword))


Possible food-id columns:
claims: ['food_id']
listings: ['food_id', 'food_name', 'food_type']
providers (id-like): ['provider_id']
receivers (id-like): ['receiver_id']


In [35]:
# Requires the column labels to be normalised already (lower-case,
# underscore-separated) so the join keys below exist under these names.

# Step 1: attach listing details to every claim. indicator=True adds a
# `_merge` column that reports whether each claim found a matching listing.
claims_listings = claims.merge(listings, on="food_id", how="left", indicator=True)
print("claims+listings merge counts:\n", claims_listings['_merge'].value_counts())

# Step 2: discard the indicator, then attach provider and receiver details.
# NOTE(review): if providers and receivers share non-key column names, pandas
# appends `_x` / `_y` suffixes to them - confirm the names used downstream.
df = (
    claims_listings
    .drop(columns=['_merge'])
    .merge(providers, on="provider_id", how="left")
    .merge(receivers, on="receiver_id", how="left")
)
print("Final merged df shape:", df.shape)


claims+listings merge counts:
 _merge
both          1000
left_only        0
right_only       0
Name: count, dtype: int64
Final merged df shape: (1000, 22)


In [37]:
# Fill gaps in every numeric column of the merged frame with that column's
# median. The selection is purely by dtype.
# NOTE(review): this presumably includes the *_id key columns as well -
# confirm that imputing identifiers is intended.
imputer_num = SimpleImputer(strategy="median")
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = imputer_num.fit(df[num_cols]).transform(df[num_cols])

In [39]:
# Fill gaps in every object-dtype column with that column's most frequent
# value. The selection is purely by dtype.
# NOTE(review): date columns that are still raw strings at this point would
# be mode-filled too - confirm that is acceptable.
imputer_cat = SimpleImputer(strategy="most_frequent")
cat_cols = df.select_dtypes(include=["object"]).columns
df[cat_cols] = imputer_cat.fit(df[cat_cols]).transform(df[cat_cols])


In [41]:
# Derive `days_to_expiry`: whole days between the moment a claim was made
# and the listing's expiry date.
#
# The reference point is the claim's own `timestamp` rather than the wall
# clock, so the feature is identical on every run of the notebook. The
# current time is used only as a fallback when no `timestamp` column exists.
if "expiry_date" in df.columns:
    # Unparseable dates become NaT instead of raising.
    df["expiry_date"] = pd.to_datetime(df["expiry_date"], errors="coerce")

    if "timestamp" in df.columns:
        # Parsed into a separate series; df["timestamp"] itself is not modified.
        reference_time = pd.to_datetime(df["timestamp"], errors="coerce")
    else:
        reference_time = pd.Timestamp.today()

    # Rows where either date is missing/unparseable yield NaN and are set to 0.
    df["days_to_expiry"] = (df["expiry_date"] - reference_time).dt.days.fillna(0)

In [53]:
# Names of the categorical columns handled by the encoding step.
categorical_features = [
    "status",
    "food_type",
    "meal_type",
    "provider_type",
    "city",
]


In [55]:
from sklearn.preprocessing import LabelEncoder

# Columns to replace with integer codes; names missing from df are skipped.
categorical_features = [
    "status",
    "food_type",
    "meal_type",
    "provider_type",
    "city",
]

# One fitted encoder per encoded column, kept so the integer codes can be
# mapped back to the original labels later.
label_encoders = {}

for col in categorical_features:
    if col not in df.columns:
        continue
    le = LabelEncoder()
    label_encoders[col] = le
    # Values are cast to str before fitting. After this line the column holds
    # integer codes, not the original text labels - this includes `status`.
    df[col] = le.fit_transform(df[col].astype(str))


In [63]:
# Replace `food_type` with one indicator column per distinct value.
# The new columns are named "food_<value>" and are appended at the right
# edge of the frame; the source column is removed.
if "food_type" in df.columns:
    onehot = pd.get_dummies(df["food_type"], prefix="food")
    df = pd.concat([df.drop(columns=["food_type"]), onehot], axis=1)


In [67]:
# Standardise the continuous features.
#
# `days_to_expiry` is only created when an `expiry_date` column exists, so
# the list is restricted to columns actually present instead of raising a
# KeyError. `scale_cols` therefore always names exactly the columns the
# scaler was fitted on.
scale_cols = [col for col in ("quantity", "days_to_expiry") if col in df.columns]
scaler = StandardScaler()
if scale_cols:
    # NOTE(review): the scaler is fitted on the full frame, before the
    # train/test split further down - confirm this is acceptable.
    df[scale_cols] = scaler.fit_transform(df[scale_cols])

In [71]:
# Build the binary target (1 = claim completed, 0 = anything else) and the
# feature matrix X.
#
# `status` is integer-encoded by the label-encoding cell, so comparing it
# directly with the string "Completed" would be False for every row. When a
# fitted encoder for `status` is available and the column is numeric, the
# codes are mapped back to their text labels before the comparison.
if "status" in df.columns:
    status_encoder = globals().get("label_encoders", {}).get("status")
    if status_encoder is not None and pd.api.types.is_numeric_dtype(df["status"]):
        status_labels = pd.Series(
            status_encoder.inverse_transform(df["status"].astype(int)),
            index=df.index,
        )
    else:
        # Column still holds the raw text labels.
        status_labels = df["status"]
    df["target"] = (status_labels == "Completed").astype(int)
    y = df["target"]
else:
    # No label source available: downstream cells test `y is not None`.
    y = None

# Drop the target, its source column and the raw date columns from the
# features. errors="ignore" keeps this working when some of them are absent.
X = df.drop(columns=["target", "status", "expiry_date", "timestamp"], errors="ignore")
print("Features shape:", X.shape)

Features shape: (1000, 22)


In [75]:
# 80/20 hold-out split with a fixed seed. Without a target only the
# features are split; with a target, features and labels are split together
# and the resulting shapes are printed.
if y is None:
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (800, 22) Test shape: (200, 22)


In [81]:
import os

# Create the output folder if it doesn't exist
os.makedirs("data", exist_ok=True)

# Save the feature matrices (file names and contents unchanged).
X_train.to_csv("data/cleaned_train.csv", index=False)
X_test.to_csv("data/cleaned_test.csv", index=False)

# Also save the matching labels so the saved splits can be used for
# supervised training. They go to separate files so the feature files keep
# their existing layout. y_train / y_test only exist when a target was
# built, so each is written only if it is defined.
for labels, labels_path in (
    (globals().get("y_train"), "data/cleaned_train_labels.csv"),
    (globals().get("y_test"), "data/cleaned_test_labels.csv"),
):
    if labels is not None:
        labels.to_csv(labels_path, index=False)

print("✅ Preprocessing complete. Cleaned datasets saved.")


✅ Preprocessing complete. Cleaned datasets saved.
