### 1. Importing All the necessary libraries

In [171]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

### 2. Loading the dataset into pandas

In [172]:
df= pd.read_csv('spring_data.csv')

### 3. Dropping Unnecessary Columns

In [173]:
df.drop(columns={'If not who are restricted?','Remarks (source specific comment)','SN'},axis=1,inplace=True)

### 4. Formatting the data 

#### 4.1. Formatting Booleans

In [174]:
def convert_to_boolean(df, columns, true_value='yes', false_value='no', default=False):
    mapping = {
        str(true_value).strip().lower(): True,
        str(false_value).strip().lower(): False,
        '0': default
    }

    for col in columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.strip()
            .str.lower()
            .map(mapping)
            .fillna(default)
            .astype(bool)
        )

    return df

In [175]:
bool_cols = [
    'Is water source significant for cultural/religious purpose?',
    'Is this source accessible to everyone in community? (in term of use access)',
    'Any spring revival / pond protection measures taken?',
    'Are there water user groups for management?',
    'Is this source tapped?',
    'Is this source in use?',
    'After this source dried up , was there any activity for revival done in this source?'
]

In [176]:
df = convert_to_boolean(df, bool_cols)

  .fillna(default)
  .fillna(default)
  .fillna(default)
  .fillna(default)
  .fillna(default)
  .fillna(default)
  .fillna(default)


#### 4.2. Formatting Categorical Data using Label Encoding

In [177]:
df['Supplied to same Municipality or different?'].unique()

array([nan, 'same', '0', 'different', 'both'], dtype=object)

In [180]:
df['Supplied to same ward or different wards?'].unique()

array([nan, 'same', 'different', '0', 'both'], dtype=object)

In [181]:
def label_encode_columns(df, columns, mapping, default_label='unknown'):
    for column in columns:
        df[column] = (
            df[column]
            .astype(str)
            .str.strip()
            .str.lower()
            .replace('nan', default_label)  
        )
        
        df[column] = df[column].apply(lambda x: x if x in mapping else default_label)
        df[column] = df[column].map(mapping).astype(int)
    
    return df

In [182]:
columns_to_encode = [
    'Supplied to same Municipality or different?',
    'Supplied to same ward or different wards?'
]

shared_mapping = {
    'same': 1,
    'different': 0,
    '0': 0,
    'both': 2,
    'unknown': 3
}

df = label_encode_columns(df, columns_to_encode, shared_mapping)


In [183]:
df['Perception on source water quality change'].unique()

array(['no_change', nan, 'improved', 'Same', 'Degraded', 'degraded'],
      dtype=object)

In [184]:
df['Any impact due to 2015 earthquake?'].unique()

array([nan, 'Degraded', 'no_change', 'Improved', 'decreased', 'increased',
       'dried', '0', 'BCTS'], dtype=object)