In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.regularizers import l2

In [4]:
# Load the data

df = pd.read_csv('Literacy Classifier Final Data.csv')

In [5]:
print(df.columns)

Index(['Social Group', 'Rural/Urban', 'State', 'Gender', 'Age',
       'Internet Access', 'Computer Access', 'Marital Status', 'Literacy'],
      dtype='object')


In [6]:
# Feature Engineering 

df['Digital Access'] = df['Internet Access'] + df['Computer Access']

# Create Age Brackets
bins = [5, 18, 35, 60, 118]
labels = ['<18', '18-35', '35-60', '>60']
df['Age Bracket'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

state_to_region = {
    1: 'North India',      # Jammu & Kashmir
    2: 'North India',      # Himachal Pradesh
    3: 'North India',      # Punjab
    4: 'North India',      # Chandigarh
    5: 'North India',      # Uttarakhand (Uttaranchal)
    6: 'North India',      # Haryana
    7: 'North India',      # Delhi
    8: 'North India',      # Rajasthan
    9: 'North India',      # Uttar Pradesh
    10: 'East India',      # Bihar
    11: 'Northeast India', # Sikkim
    12: 'Northeast India', # Arunachal Pradesh
    13: 'Northeast India', # Nagaland
    14: 'Northeast India', # Manipur
    15: 'Northeast India', # Mizoram
    16: 'Northeast India', # Tripura
    17: 'Northeast India', # Meghalaya
    18: 'Northeast India', # Assam
    19: 'East India',      # West Bengal
    20: 'East India',      # Jharkhand
    21: 'East India',      # Odisha
    22: 'Central India',   # Chhattisgarh
    23: 'Central India',   # Madhya Pradesh
    24: 'West India',      # Gujarat
    25: 'Union Territories', # Daman & Diu
    26: 'Union Territories', # Dadra and Nagar Haveli
    27: 'West India',      # Maharashtra
    28: 'South India',     # Andhra Pradesh
    29: 'South India',     # Karnataka
    30: 'West India',      # Goa
    31: 'Union Territories', # Lakshadweep
    32: 'South India',     # Kerala
    33: 'South India',     # Tamil Nadu
    34: 'South India',     # Pondicherry
    35: 'Union Territories', # Andaman and Nicobar Islands
    36: 'South India'      # Telangana
}
df['Region'] = df['State'].map(state_to_region)
df[['State', 'Region']].drop_duplicates()

# Drop redundant features after feature engineering
df.drop(['Internet Access', 'Computer Access', 'Age', 'State'], axis=1, inplace=True)

In [7]:
print(df.head())     # Verifying the first few rows of the data

   Social Group  Rural/Urban  Gender  Marital Status  Literacy  \
0             9            1       1               2         1   
1             9            1       2               2         1   
2             9            1       1               2         1   
3             9            1       2               2         1   
4             9            1       2               1         1   

   Digital Access Age Bracket       Region  
0               0         >60  North India  
1               0         >60  North India  
2               0       35-60  North India  
3               0       35-60  North India  
4               0         <18  North India  


In [8]:
print(df.info()) # Looking at info of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480938 entries, 0 to 480937
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   Social Group    480938 non-null  int64   
 1   Rural/Urban     480938 non-null  int64   
 2   Gender          480938 non-null  int64   
 3   Marital Status  480938 non-null  int64   
 4   Literacy        480938 non-null  int64   
 5   Digital Access  480938 non-null  int64   
 6   Age Bracket     480938 non-null  category
 7   Region          480938 non-null  object  
dtypes: category(1), int64(6), object(1)
memory usage: 26.1+ MB
None


In [12]:
X = df[['Social Group', 'Rural/Urban', 'Gender', 'Marital Status', 'Digital Access', 'Age Bracket', 'Region']]
y = df['Literacy']                
# Splitting the data into the feature and targe sets

In [13]:
# We first split off the training set (60% of the data)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.40, random_state=1)

# Then we split the remaining data into the cross validation and test set
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1)

del X_temp, y_temp

# So, X_cv is 20% of the original data, X_train is 60%, X_test is 20%, and vice versa for y

In [14]:
categorical_cols = [['Social Group', 'Rural/Urban', 'Gender', 'Marital Status', 'Digital Access', 'Age Bracket', 'Region']]
numerical_columns = []

# Our only columns are categorical, so we don't need to scale any numerical columns

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# List all categorical columns
categorical_columns = X_train.columns.tolist()  # Automatically fetch all column names

# Create the ColumnTransformer
ct = ColumnTransformer([
    ('onehot', OneHotEncoder(), categorical_columns)
])

In [13]:
# First we fit on the training data
X_train_enc = ct.fit_transform(X_train)
 
#Then we also transform the cross validation and test sets
X_cv_enc = ct.transform(X_cv)
X_test_enc = ct.transform(X_test)

encoded_column_names = ct.get_feature_names_out()

# Print the column names
print("Encoded Columns:")
print(encoded_column_names)


Encoded Columns:
['onehot__Social Group_1' 'onehot__Social Group_2'
 'onehot__Social Group_3' 'onehot__Social Group_9' 'onehot__Rural/Urban_1'
 'onehot__Rural/Urban_2' 'onehot__Gender_1' 'onehot__Gender_2'
 'onehot__Marital Status_1' 'onehot__Marital Status_2'
 'onehot__Marital Status_3' 'onehot__Digital Access_0'
 'onehot__Digital Access_1' 'onehot__Digital Access_2'
 'onehot__Age Bracket_18-35' 'onehot__Age Bracket_35-60'
 'onehot__Age Bracket_<18' 'onehot__Age Bracket_>60'
 'onehot__Region_Central India' 'onehot__Region_East India'
 'onehot__Region_North India' 'onehot__Region_Northeast India'
 'onehot__Region_South India' 'onehot__Region_Union Territories'
 'onehot__Region_West India']


In [14]:
smote = SMOTE(random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train_enc, y_train)
# Create synthentic samples for the minority class (Illiterate)

In [42]:
y_train = np.array(y_train)
# Convert to numpy array for compatibility with class weight calculation

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))
# Incentivize not misclassifying the minority class

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_enc.shape[1],), kernel_regularizer=l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dense(1, activation='sigmoid')
])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),               
    loss='binary_crossentropy',
    metrics=['accuracy']            
)

early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=5,  # Stop if val_loss doesn't improve for 5 epochs
    restore_best_weights=True
)

history = model.fit(
    X_resampled, y_resampled, 
    epochs=15,
    batch_size=32,
    class_weight = class_weights_dict,
    validation_data=(X_cv_enc, y_cv),
    callbacks=[early_stopping]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m14912/14912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 384us/step - accuracy: 0.6603 - loss: 0.9056 - val_accuracy: 0.4889 - val_loss: 0.9558
Epoch 2/15
[1m14912/14912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 374us/step - accuracy: 0.6883 - loss: 0.5480 - val_accuracy: 0.4983 - val_loss: 0.9321
Epoch 3/15
[1m14912/14912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 382us/step - accuracy: 0.6886 - loss: 0.5401 - val_accuracy: 0.5043 - val_loss: 0.9395
Epoch 4/15
[1m14912/14912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 381us/step - accuracy: 0.6887 - loss: 0.5356 - val_accuracy: 0.4920 - val_loss: 0.9420
Epoch 5/15
[1m14912/14912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 371us/step - accuracy: 0.6879 - loss: 0.5327 - val_accuracy: 0.4836 - val_loss: 0.9752
Epoch 6/15
[1m14912/14912[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 370us/step - accuracy: 0.6896 - loss: 0.5310 - val_accuracy: 0.5028 - val_loss:

In [47]:
y_test_probs = model.predict(X_test_enc)
y_test_pred = (y_test_probs > 0.30).astype(int) # Threshold of 0.30 maximizes F1 score
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Report:\n", classification_report(y_test, y_test_pred))

[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312us/step
Confusion Matrix:
 [[ 7538   791]
 [15411 24354]]
Report:
               precision    recall  f1-score   support

           0       0.33      0.91      0.48      8329
           1       0.97      0.61      0.75     39765

    accuracy                           0.66     48094
   macro avg       0.65      0.76      0.62     48094
weighted avg       0.86      0.66      0.70     48094



In [69]:
# Testing out the model while also engineeringg the input

model_columns = [
    'onehot__Social Group_1', 'onehot__Social Group_2',
    'onehot__Social Group_3', 'onehot__Social Group_9', 'onehot__Rural/Urban_1',
    'onehot__Rural/Urban_2', 'onehot__Gender_1', 'onehot__Gender_2',
    'onehot__Marital Status_1', 'onehot__Marital Status_2',
    'onehot__Marital Status_3', 'onehot__Digital Access_0',
    'onehot__Digital Access_1', 'onehot__Digital Access_2',
    'onehot__Age Bracket_18-35', 'onehot__Age Bracket_35-60',
    'onehot__Age Bracket_<18', 'onehot__Age Bracket_>60',
    'onehot__Region_Central India', 'onehot__Region_East India',
    'onehot__Region_North India', 'onehot__Region_Northeast India',
    'onehot__Region_South India', 'onehot__Region_Union Territories',
    'onehot__Region_West India'
]

sample_input = {
  "social_group": "Scheduled Tribes",
  "rural_urban": "Rural",
  "state": "Uttar Pradesh",
  "gender": "Female",
  "age": 60,
  "internet_access": "No",
  "computer_access": "No",
  "marital_status": "Widowed"
}

def preprocess_input(data: dict):

    # Mapping for state to region
    state_to_region = {
        "Jammu & Kashmir": "North India",
        "Himachal Pradesh": "North India",
        "Punjab": "North India",
        "Chandigarh": "North India",
        "Uttarakhand": "North India",
        "Haryana": "North India",
        "Delhi": "North India",
        "Rajasthan": "North India",
        "Uttar Pradesh": "North India",
        "Bihar": "East India",
        "Sikkim": "Northeast India",
        "Arunachal Pradesh": "Northeast India",
        "Nagaland": "Northeast India",
        "Manipur": "Northeast India",
        "Mizoram": "Northeast India",
        "Tripura": "Northeast India",
        "Meghalaya": "Northeast India",
        "Assam": "Northeast India",
        "West Bengal": "East India",
        "Jharkhand": "East India",
        "Odisha": "East India",
        "Chhattisgarh": "Central India",
        "Madhya Pradesh": "Central India",
        "Gujarat": "West India",
        "Daman & Diu": "Union Territories",
        "Dadara and Nagar Haveli": "Union Territories",
        "Maharashtra": "West India",
        "Andhra Pradesh": "South India",
        "Karnataka": "South India",
        "Goa": "West India",
        "Lakshadweep": "Union Territories",
        "Kerala": "South India",
        "Tamil Nadu": "South India",
        "Pondicherry": "South India",
        "Andaman and Nicobar Islands": "Union Territories",
        "Telangana": "South India",
    }

    social_group_mapping = {
        "Scheduled Tribes": 1,
        "Scheduled Castes": 2,
        "Other Backward Classes": 3,
        "Others": 9
    }
    marital_status_mapping = {
        "Single": 1,
        "Married": 2,
        "Widowed": 3
    }
    
    rural_urban_mapping = {
        "Rural": 1,
        "Urban": 2
    }

    gender_mapping = {
        "Male": 1,
        "Female": 2
    }
    
    def bin_age(age):
        if age < 18:
            return '<18'
        elif 18 <= age < 35:
            return '18-35'
        elif 35 <= age < 60:
            return '35-60'
        else:
            return '>60'

    digital_access = 0
    if data.get("internet_access") == "Yes":
        digital_access += 1
    if data.get("computer_access") == "Yes":
        digital_access += 1

    region = state_to_region.get(data.get("state", ""), "Unknown")
    age_bracket = bin_age(data.get("age", 0))
    social_group = social_group_mapping.get(data.get("social_group", ""))
    marital_status = marital_status_mapping.get(data.get("marital_status", ""))
    rural_urban = rural_urban_mapping.get(data.get("rural_urban", ""), "Unknown")
    gender = gender_mapping.get(data.get("gender", ""), "Unknown")

    categorical_features = {
        "social_group": social_group,
        "rural_urban": rural_urban,
        "gender": gender,
        "marital_status": marital_status,
        "digital_access": digital_access, 
        "age_bracket": age_bracket,
        "region": region
    }
    
    # Create a DataFrame for one-hot encoding
    df = pd.DataFrame([categorical_features])

    # Apply one-hot encoding with consistent prefixes
    df_encoded = pd.get_dummies(
        df,
        columns=["social_group", "rural_urban", "gender", "marital_status", "digital_access", "age_bracket", "region"],
        prefix=["onehot__Social Group", "onehot__Rural/Urban", "onehot__Gender", "onehot__Marital Status", "onehot__Digital Access", "onehot__Age Bracket", "onehot__Region"]
    )

    # Align columns with the model's expected input columns
    df_encoded = df_encoded.reindex(columns=model_columns, fill_value=0)
    
    return df_encoded.values.astype(np.float32)
    

# Step 1: Preprocess the input
try:
    processed_sample_input = preprocess_input(sample_input)
    print(f"Processed input: {processed_sample_input}")
    print(f"Processed input shape: {processed_sample_input.shape}")
except Exception as e:
    print(f"Error during preprocessing: {e}")

# Step 2: Test the model prediction
try:
    # Ensure the model is loaded
    if 'model' not in globals():
        model = tf.keras.models.load_model('literacy_classifier.keras')

    # Get the raw probability
    prediction = model.predict(processed_sample_input)
    print(f"Raw prediction: {prediction}")

    # Extract the probability if necessary
    raw_prediction = float(prediction[0]) if prediction.ndim == 1 else float(prediction[0][0])

    # Step 3: Apply the threshold
    threshold = 0.30  # Using the same threshold as we did in the evlaution
    binary_prediction = 1 if raw_prediction > threshold else 0
    literacy_status = "Literate" if binary_prediction == 1 else "Illiterate"

    # Step 4: Display the results
    print(f"Probability: {raw_prediction}")
    print(f"Binary Prediction (Class): {binary_prediction}")
    print(f"Literacy Status: {literacy_status}")

except Exception as e:
    print(f"Error during prediction: {e}")

Processed input: [[1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.
  0.]]
Processed input shape: (1, 25)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Raw prediction: [[0.00549285]]
Probability: 0.00549284927546978
Binary Prediction (Class): 0
Literacy Status: Illiterate


In [72]:
model.save('literacy_classifier.keras')

# Saved the model to a .keras file