In [156]:
import pandas as pd
import os
import logging

In [157]:
# Loading env variables and settting log level
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
RAW_DATA_PATH = os.getenv("RAW_DATA_PATH", "../data/raw/aggregated_data.csv")
CLEANED_DATA_PATH = os.getenv("CLEANED_DATA_PATH", "../data/processed/cleaned_aggregated_data.csv")
numeric_columns = ['male', 'female', 'other', 'total']
discrepency_list = ['state_name', 'district_name', 'disability', 'age_group', 'male', 'female', 'other', 'total', 'calculated_total']
text_columns_to_standardize = ['state_name', 'district_name', 'disability']

In [158]:
# Read file function
def read_csv_file(file_path):
    try:
        df = pd.read_csv(file_path)
        logging.info(f"Loaded data from {file_path} with shape {df.shape}")
        return df
    except Exception as e:
        logging.error(f"Failed to load raw data: {e}")
        return pd.DataFrame()

In [161]:
# Reading file from raw path
df = read_csv_file(RAW_DATA_PATH)
if df.empty:
    raise SystemExit("File not found")


2025-06-14 19:00:12,362 - INFO - Loaded data from ../data/raw/aggregated_data.csv with shape (3309, 8)


In [162]:
# Visualizing the data
logging.info(f"Missing Values in file per header:\n{df.isnull().sum()}")

2025-06-14 19:00:21,993 - INFO - Missing Values in file per header:
State Name          0
District Name       0
Disability          0
Age Group           0
Male              458
Female            748
Other            3190
Total             296
dtype: int64


In [163]:
# Standardizing column names: lowercase, replace spaces with underscores, remove special characters
def standardize_column_names(df):
    lowered_column_names = df.columns.str.lower()
    underscored_columns_names = lowered_column_names.str.replace(" " , "_")
    standarized_columns = underscored_columns_names.str.replace('[^a-z0-9_]', '', regex=True)
    logging.info(f"columns after standardization:\n{standarized_columns}")
    df.columns = standarized_columns
    return df

In [164]:
# calling function to standardize column
df = standardize_column_names(df)

2025-06-14 19:01:55,096 - INFO - columns after standardization:
Index(['state_name', 'district_name', 'disability', 'age_group', 'male',
       'female', 'other', 'total'],
      dtype='object')


In [165]:
# Cleaning 'age_group' column: remove quotes and standardize format
def clean_age_group_column(df):
    df['age_group'] = df['age_group'].str.strip("'")
    logging.info(f"clean age-group column:\n{df['age_group']}")
    return df

In [166]:
# Calling function to clean 
df = clean_age_group_column(df)


2025-06-14 19:02:21,184 - INFO - clean age-group column:
0        6-15
1       15-35
2       35-60
3       60-80
4         80+
        ...  
3304    60-80
3305      0-6
3306     6-15
3307    15-35
3308    35-60
Name: age_group, Length: 3309, dtype: object


In [167]:
# Converting numeric columns to integers, ensuring float to integer conversion and default as 0
def standarize_numeric_columns(numeric_columns, df):
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
        logging.info(f"{col} column standardized")
    return df

In [168]:
# Calling function to standardize numeric columns
df = standarize_numeric_columns(numeric_columns, df)

2025-06-14 19:04:42,917 - INFO - male column standardized
2025-06-14 19:04:42,919 - INFO - female column standardized
2025-06-14 19:04:42,921 - INFO - other column standardized
2025-06-14 19:04:42,922 - INFO - total column standardized


In [172]:
# Validating 'total' column: check if total = male + female + other
def find_and_fix_discrepencies(total_clm, calculated_total_clm, df, discrepency_list):
    discrepancies = df[df[total_clm] != df[calculated_total_clm]]
    if not discrepancies.empty:
        logging.info(f"Rows with Total discrepancies:\n{discrepancies[discrepency_list]}")
        df.loc[df['total'] != df['calculated_total'], 'total'] = df['calculated_total']
        df = df.drop(columns=['calculated_total'])
    logging.info("Fixed discrepencies")
    return df

In [169]:
# Calling function to fix discrepencies
df['calculated_total'] = df['male'] + df['female'] + df['other']
df = find_and_fix_discrepencies('total', 'calculated_total', df, discrepency_list)


2025-06-14 19:06:52,822 - INFO - Fixed discrepencies


In [173]:
# Remove duplicates
def remove_duplicates(df):
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        logging.info(f"Found {duplicates} duplicate rows. Removing them.")
        df = df.drop_duplicates()
    return df

In [174]:
# Calling function to remove duplicates
df = remove_duplicates(df)

In [147]:
# Fix invalid rows in dataframe
def fix_invalid_rows(invalid_rows, df):
    if not invalid_rows.empty:
        logging.info("Rows with negative values: {invalid_rows}")
        df = df[(df['male'] >= 0) & (df['female'] >= 0) & (df['other'] >= 0) & (df['total'] >= 0)]
    return df

In [148]:
# Validating for negative or invalid values in numeric columns
invalid_rows = df[(df['male'] < 0) | (df['female'] < 0) | (df['other'] < 0) | (df['total'] < 0)]
df = fix_invalid_rows(invalid_rows, df)

In [175]:
# standardize on text columns
def standardize_text_columns(text_columns, df):
    for col in text_columns:
        df[col] = df[col].str.strip().str.title()
    return df

In [176]:
# Convert column to categorical variable
def convert_to_categorical(column_name, df):
    df[column_name] = df[column_name].astype('category')
    return df

In [177]:
# Standardizing text columns: trim whitespace , convert to title case for consistency and fixing data types for columns
df = standardize_text_columns(text_columns_to_standardize,df)
df = convert_to_categorical('age_group', df)
logging.info(f"Data types after cleanup: {df.dtypes}")


2025-06-14 19:14:12,795 - INFO - Data types after cleanup: state_name            object
district_name         object
disability            object
age_group           category
male                   int64
female                 int64
other                  int64
total                  int64
calculated_total       int64
dtype: object


In [178]:
# Visualizing the data
logging.info(f"Missing Values in file per header:\n{df.isnull().sum()}")
logging.info(df.info())

2025-06-14 19:14:24,736 - INFO - Missing Values in file per header:
state_name          0
district_name       0
disability          0
age_group           0
male                0
female              0
other               0
total               0
calculated_total    0
dtype: int64
2025-06-14 19:14:24,749 - INFO - None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3309 entries, 0 to 3308
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   state_name        3309 non-null   object  
 1   district_name     3309 non-null   object  
 2   disability        3309 non-null   object  
 3   age_group         3309 non-null   category
 4   male              3309 non-null   int64   
 5   female            3309 non-null   int64   
 6   other             3309 non-null   int64   
 7   total             3309 non-null   int64   
 8   calculated_total  3309 non-null   int64   
dtypes: category(1), int64(5), object(3)
memory usage: 210.4+ KB


In [179]:
# save dataset
df.to_csv(CLEANED_DATA_PATH, index=False)
logging.info(f"dataset saved in path: {CLEANED_DATA_PATH}")

2025-06-14 19:14:27,953 - INFO - dataset saved in path: ../data/processed/cleaned_aggregated_data.csv
