In [2]:
# %pip (not !pip) installs into the environment of the running kernel;
# the version is pinned so a re-run reproduces the same pandas release.
%pip install -q pandas==2.2.3

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m31m27.1 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.2 tzdata-2025.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip 

In [13]:
import pandas as pd
import json
import os

# Directory that holds the CSV files to clean.
dataset_dir = '../dataset' # Relative path: a 'dataset' folder one level ABOVE the current working directory (normally the notebook's folder)

# Directory that holds the JSON description files: the 'descriptions' sub-folder of dataset_dir.
descriptions_dir = os.path.join(dataset_dir, 'descriptions')

def remove_undescribed_columns_and_log(csv_filepath, description_filepath):
    """
    Drop every CSV column that the JSON description does not list, logging each step.

    The description is expected to carry its column names under
    ``description["schema"]["fields"][i]["name"]``. Columns of the CSV whose
    name is not among them are removed and the CSV is overwritten in place.
    When nothing has to be removed the file is not rewritten.

    Safety guards (the file is left untouched in these cases):
      * the description yields no column names at all;
      * none of the CSV columns is described (dropping would empty the file).

    All problems are reported via ``print``; no exception is propagated.

    Args:
        csv_filepath (str): The full path to the CSV file (modified in place).
        description_filepath (str): The full path to the JSON description file.

    Returns:
        None
    """
    print(f"--- Processing: {os.path.basename(csv_filepath)} ---")
    print(f"Description file: {os.path.basename(description_filepath)}")

    try:
        # Read with the default (C) parser first; fall back to the Python
        # parser only for failures that a different parser could plausibly fix.
        try:
            df = pd.read_csv(csv_filepath)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # A missing or empty file cannot be rescued by another engine;
            # let the dedicated handlers below report it.
            raise
        except Exception as e:
            print(f"Initial read_csv with default settings failed: {e}. Trying with 'engine=python'...")
            df = pd.read_csv(csv_filepath, engine='python')
            print("Successfully read with engine='python'.")

        original_columns = df.columns.tolist()
        num_columns_from_df = len(original_columns)
        print(f"Original columns in CSV ({num_columns_from_df}): {original_columns}")

        # Load the JSON description file.
        with open(description_filepath, 'r', encoding='utf-8') as f:
            description_data = json.load(f)

        schema_fields = []
        if "schema" in description_data and "fields" in description_data["schema"]:
            schema_fields = description_data["schema"]["fields"]
        described_columns = [field["name"] for field in schema_fields if "name" in field]

        num_described_columns = len(described_columns)
        print(f"Described columns from JSON ({num_described_columns}): {described_columns}")

        # Guard: without any described column every CSV column would count as
        # "undescribed" and the file would be overwritten with no data.
        if not described_columns:
            print(f"Error: No described columns found in {description_filepath}. "
                  f"Leaving '{csv_filepath}' unchanged.\n")
            return

        described_lookup = set(described_columns)
        original_lookup = set(original_columns)
        columns_to_drop = [col for col in original_columns if col not in described_lookup]
        missing_described_columns_in_csv = [col for col in described_columns if col not in original_lookup]

        # Diagnostic: the counts differ AND the difference is not explained by
        # extra undescribed columns alone, i.e. some described columns are
        # absent from the header that pandas parsed.
        if num_columns_from_df != num_described_columns and missing_described_columns_in_csv:
            print(f"\n!!! CRITICAL COLUMN MISMATCH DETECTED IN '{os.path.basename(csv_filepath)}' !!!")
            print(f"   Pandas read {num_columns_from_df} columns, but JSON describes {num_described_columns}.")
            print("   This often indicates an incorrect delimiter, quoting issue, or malformed header in the CSV.")
            print("   Please inspect the CSV header manually.")

        # Guard: if not a single CSV column is described, dropping would wipe
        # the whole file; this is far more likely a parsing/naming problem.
        if columns_to_drop and len(columns_to_drop) == num_columns_from_df:
            print(f"Error: None of the CSV columns are described in {description_filepath}. "
                  f"Leaving '{csv_filepath}' unchanged.\n")
            return

        if columns_to_drop:
            print(f"Columns removed from CSV: {columns_to_drop}")
            df_cleaned = df.drop(columns=columns_to_drop)

            retained_columns = df_cleaned.columns.tolist()
            print(f"Columns retained in CSV ({len(retained_columns)}): {retained_columns}")

            # Double check: compare retained columns count with described columns count.
            if len(retained_columns) == num_described_columns:
                print(f"Verification: Number of retained columns matches described columns count ({len(retained_columns)}).")
            else:
                print(f"Verification WARNING: Retained columns count ({len(retained_columns)}) does NOT match described columns count ({num_described_columns}).")
                if missing_described_columns_in_csv:
                    print(f"  (Reason: Some described columns were missing from the original CSV to begin with: {missing_described_columns_in_csv})")

            # Save the modified DataFrame back to the CSV file.
            df_cleaned.to_csv(csv_filepath, index=False)
            print(f"Successfully cleaned and saved: {os.path.basename(csv_filepath)}\n")
        else:
            # Nothing was dropped, so all original columns are retained.
            retained_columns = original_columns
            print("No undescribed columns found. CSV file remains unchanged.")
            print(f"Columns retained in CSV ({len(retained_columns)}): {retained_columns}")

            if len(retained_columns) == num_described_columns:
                print(f"Verification: Number of retained columns matches described columns count ({len(retained_columns)}).\n")
            else:
                print(f"Verification WARNING: Retained columns count ({len(retained_columns)}) does NOT match described columns count ({num_described_columns}).\n")
                if missing_described_columns_in_csv:
                    print(f"  (Reason: Some described columns were missing from the original CSV: {missing_described_columns_in_csv})\n")

    except FileNotFoundError:
        print(f"Error: One of the files not found. CSV: {csv_filepath}, JSON: {description_filepath}\n")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {description_filepath}. Please check file format.\n")
    except pd.errors.EmptyDataError:
        print(f"Error: CSV file '{csv_filepath}' is empty or contains only a header.\n")
    except Exception as e:
        print(f"An unexpected error occurred while processing {csv_filepath}: {e}\n")

# Pair every description JSON with its CSV and clean that CSV in place.
# A description named '<table>_table_details.json' maps to '<table>.csv';
# any other '<name>.json' maps to '<name>.csv'.
DESCRIPTION_SUFFIX = '_table_details'

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# processing order (and therefore the log) reproducible between runs.
for description_file in sorted(os.listdir(descriptions_dir)):
    if not description_file.endswith('.json'):
        continue

    description_name = os.path.splitext(description_file)[0] # e.g., Browse_mobile_behaviour_table_details

    # Strip the marker only when it is the trailing suffix, so a table whose
    # own name happens to contain '_table_details' elsewhere is not mangled.
    if description_name.endswith(DESCRIPTION_SUFFIX):
        table_name = description_name[:-len(DESCRIPTION_SUFFIX)]
    else:
        table_name = description_name
    csv_name = table_name + '.csv'

    csv_filepath = os.path.join(dataset_dir, csv_name)
    description_filepath = os.path.join(descriptions_dir, description_file)

    # Only process descriptions whose CSV actually exists.
    if os.path.exists(csv_filepath):
        remove_undescribed_columns_and_log(csv_filepath, description_filepath)
    else:
        print(f"--- Skipping: {csv_name} (corresponding CSV not found at {csv_filepath}) ---\n")

print("--- All CSV files processed. ---")

--- Processing: streaming_mobile_behaviour.csv ---
Description file: streaming_mobile_behaviour_table_details.json
Original columns in CSV (25): ['DATE_ID', 'IMEI', 'RAT', 'RAN_NE_USER_IP', 'LAST_SAI_CGI_ECGI', 'APP_NAME', 'CELL_NAME', 'PROTOCOL', 'HOST', 'TOTAL_EVENT_COUNT', 'TOTAL_EVENT_DURATION_S', 'PEAK_DL_THROUGHPUT_MBPS', 'TOTAL_DL_THROUGHPUT_BYTES', 'TOTAL_DL_THROUGHPUT_DURATION_NS', 'TOTAL_DL_TRAFFIC_BYTES', 'TOTAL_DL_RTT_COUNT', 'TOTAL_DL_RTT_DURATION_MS', 'PEAK_UL_THROUGHPUT_MBPS', 'TOTAL_UL_THROUGHPUT_BYTES', 'TOTAL_UL_THROUGHPUT_DURATION_NS', 'TOTAL_UL_TRAFFIC_BYTES', 'TOTAL_UL_RTT_COUNT', 'TOTAL_UL_RTT_DURATION_MS', 'MSISDN_MASKED', 'IMSI_MASKED']
Described columns from JSON (25): ['DATE_ID', 'IMEI', 'RAT', 'RAN_NE_USER_IP', 'LAST_SAI_CGI_ECGI', 'APP_NAME', 'CELL_NAME', 'PROTOCOL', 'HOST', 'TOTAL_EVENT_COUNT', 'TOTAL_EVENT_DURATION_S', 'PEAK_DL_THROUGHPUT_MBPS', 'TOTAL_DL_THROUGHPUT_BYTES', 'TOTAL_DL_THROUGHPUT_DURATION_NS', 'TOTAL_DL_TRAFFIC_BYTES', 'TOTAL_DL_RTT_COUNT', 

  df = pd.read_csv(csv_filepath)


Original columns in CSV (61): ['TIMESTAMP', 'DOMAIN', 'SUBSCRIBER_DOMAIN', 'FTTH_BASEPLAN', 'CONVERGE_LINK_ID', 'CUSTOMER_ACCOUNT_ID', 'SUBSCRIBER_SEGMENT', 'FTTH_STATUS', 'CUSTOMER_GROUP', 'FTTH_SEGMENT', 'FTTH_INSTALLATION_POSTCODE', 'FTTH_INSTALLATION_CITY', 'FTTH_INSTALLATION_REGION', 'FTTH_INSTALLATION_STATE', 'FTTH_PROPERTY_TYPE', 'FTTH_NETWORK_TYPE', 'FTTH_INFRA_TYPE', 'ACTIVATION_CATEGORY', 'PRIME_SEGMENT', 'PRINCIPAL_BASEPLAN', 'BACKUP_BASEPLAN', 'WITH_MISM', 'NUMBER_OF_MISM', 'WITH_SUPPLEMENTARY', 'NUMBER_OF_SUPPLEMENTARY', 'WITH_OTHER_PRINCIPAL_PLAN', 'NUMBER_OF_OTHER_PRINCIPAL_PLAN', 'QOS_PROFILE_NAME', 'WAN_DYNAMIC_TYPE', 'TECH', 'VENDOR', 'DEVICEID', 'SERIAL_NO', 'APPLICATION_NAME', 'APPLICATION_CATEGORY', 'TOTAL_PACKETS', 'UPPACKETS', 'DOWNPACKETS', 'TOTAL_VOLUME', 'UP_BYTES', 'DOWN_BYTES', 'NEW_FLOWS', 'CUT_FLOWS', 'SEQ_FLOWS', 'ACCESS_TIME', 'TOTAL_DISCARD_PACKETS', 'DISCARD_UP_PACKETS', 'DISCARD_DOWN_PACKETS', 'TOTAL_DISCARD_BYTES', 'DISCARD_UP_BYTES', 'DISCARD_DOWN_B