In [16]:
import pandas as pd
import os

# ==== STEP 1: Check working directory ====
print("📂 Current working directory:", os.getcwd())
print("📂 Files in current directory:", os.listdir())

# ==== STEP 2: Define file paths ====
raw_csv_path = "data_raw/treasury_data_raw.csv"   # raw data (from your scraper)
clean_csv_path = "data_clean/treasury_data_clean.csv"  # cleaned output

# ==== STEP 3: Confirm file exists ====
if not os.path.exists(raw_csv_path):
    raise FileNotFoundError(f"❌ Raw CSV not found at: {raw_csv_path}")

# ==== STEP 4: Load the data ====
df = pd.read_csv(raw_csv_path)
print("✅ File loaded successfully!")
print("📊 Shape before cleaning:", df.shape)

# ==== STEP 5: Standardize column names ====
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# ==== STEP 6: Handle missing values ====
missing_summary = df.isnull().sum()
print("\n🔍 Missing values per column:\n", missing_summary)

for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna("Unknown", inplace=True)

# ==== STEP 7: Remove duplicates ====
before = df.shape[0]
df.drop_duplicates(inplace=True)
after = df.shape[0]
print(f"\n🗑️ Removed {before - after} duplicate rows")

# ==== STEP 8: Save cleaned data ====
df.to_csv(clean_csv_path, index=False, encoding="utf-8")
print(f"\n💾 Cleaned file saved to: {clean_csv_path}")
print("📊 Shape after cleaning:", df.shape)

# ==== STEP 9: Preview ====
print("\n👀 Preview of cleaned data:")
print(df.head())


📂 Current working directory: c:\Users\DELL\OneDrive - TFS Chartered Accountants\Documents\GitHub\group5-ministry-of-finance-scraping\data_clean
📂 Files in current directory: ['Analysis', 'data_cleaning.ipynb']


FileNotFoundError: ❌ Raw CSV not found at: data_raw/treasury_data_raw.csv

In [17]:
import pandas as pd
import os

# ==== SETTINGS ====
# Path to your raw CSV file (inside data_raw folder)
raw_csv_path = "./data_raw/treasury_data_raw.csv"  

# Save cleaned file into data_clean folder
clean_csv_path = "./data_clean/treasury_data_clean.csv"

# ==== STEP 1: Confirm file exists ====
if not os.path.exists(raw_csv_path):
    raise FileNotFoundError(f"❌ Raw CSV not found at: {raw_csv_path}")
else:
    print(f"✅ Found raw CSV at: {raw_csv_path}")

# ==== STEP 2: Load the data ====
df = pd.read_csv(raw_csv_path)
print("✅ File loaded successfully!")
print("📊 Shape before cleaning:", df.shape)

# ==== STEP 3: Standardize column names ====
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

# ==== STEP 4: Handle missing values ====
missing_summary = df.isnull().sum()
print("\n🔍 Missing values per column:\n", missing_summary)

for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna("Unknown", inplace=True)

# ==== STEP 5: Remove duplicates ====
before = df.shape[0]
df.drop_duplicates(inplace=True)
after = df.shape[0]
print(f"\n🗑️ Removed {before - after} duplicate rows")

# ==== STEP 6: Save cleaned data ====
df.to_csv(clean_csv_path, index=False, encoding="utf-8")
print(f"\n💾 Cleaned file saved to: {clean_csv_path}")
print("📊 Shape after cleaning:", df.shape)

# ==== STEP 7: Preview ====
print("\n👀 Preview of cleaned data:")
print(df.head())


FileNotFoundError: ❌ Raw CSV not found at: ./data_raw/treasury_data_raw.csv

In [18]:
import pandas as pd
import os

# ==== SETTINGS ====
# Go up one folder, then into data_raw
raw_csv_path = "../data_raw/treasury_data_raw.csv"
clean_csv_path = "../data_clean/treasury_data_clean.csv"

# ==== STEP 1: Confirm file exists ====
if not os.path.exists(raw_csv_path):
    raise FileNotFoundError(f"❌ Raw CSV not found at: {raw_csv_path}")
else:
    print(f"✅ Found raw CSV at: {raw_csv_path}")

# ==== STEP 2: Load the data ====
df = pd.read_csv(raw_csv_path)
print("✅ File loaded successfully!")
print("📊 Shape before cleaning:", df.shape)

# ==== STEP 3: Standardize column names ====
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# ==== STEP 4: Handle missing values ====
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna("Unknown", inplace=True)

# ==== STEP 5: Remove duplicates ====
before = df.shape[0]
df.drop_duplicates(inplace=True)
after = df.shape[0]
print(f"🗑️ Removed {before - after} duplicate rows")

# ==== STEP 6: Save cleaned data ====
df.to_csv(clean_csv_path, index=False, encoding="utf-8")
print(f"\n💾 Cleaned file saved to: {clean_csv_path}")
print("📊 Shape after cleaning:", df.shape)

# ==== STEP 7: Preview ====
print("\n🔍 Preview of cleaned data:")
print(df.head())


FileNotFoundError: ❌ Raw CSV not found at: ../data_raw/treasury_data_raw.csv

In [19]:
import os
import pandas as pd

# Build absolute path safely
base_dir = os.path.dirname(os.path.abspath("__file__"))  # repo root
raw_csv_path = os.path.join(base_dir, "data_raw", "treasury_data_raw.csv")
clean_csv_path = os.path.join(base_dir, "data_clean", "treasury_data_clean.csv")

print("📂 Looking for:", raw_csv_path)

df = pd.read_csv(raw_csv_path)
print("✅ Loaded CSV with shape:", df.shape)


📂 Looking for: c:\Users\DELL\OneDrive - TFS Chartered Accountants\Documents\GitHub\group5-ministry-of-finance-scraping\data_clean\data_raw\treasury_data_raw.csv


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\DELL\\OneDrive - TFS Chartered Accountants\\Documents\\GitHub\\group5-ministry-of-finance-scraping\\data_clean\\data_raw\\treasury_data_raw.csv'