In [1]:
!pip install category_encoders



In [2]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Load dataset (using some data with both numerical and categorical features)
data = fetch_openml("adult", version=2, as_frame=True)
X = data['data']
y = (data['target'] == '>50K').astype(int)

display(X)
display(y)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


Unnamed: 0,class
0,0
1,0
2,1
3,1
4,0
...,...
48837,0
48838,1
48839,0
48840,0


In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  int64   
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capital-gain    48842 non-null  int64   
 11  capital-loss    48842 non-null  int64   
 12  hours-per-week  48842 non-null  int64   
 13  native-country  47985 non-null  category
dtypes: category(8), int64(6)
memory usage: 2.6 MB


In [4]:
# Separate numerical and categorical features
num_features = X.select_dtypes(include=['float64',
                                        'int64']).columns.tolist()
cat_features = X.select_dtypes(include=['category',
                                        'object']).columns.tolist()

display(num_features)
print("---")
display(cat_features)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

---


['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [5]:
num_features

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [6]:
X[num_features]

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,25,226802,7,0,0,40
1,38,89814,9,0,0,50
2,28,336951,12,0,0,40
3,44,160323,10,7688,0,40
4,18,103497,10,0,0,30
...,...,...,...,...,...,...
48837,27,257302,12,0,0,38
48838,40,154374,9,0,0,40
48839,58,151910,9,0,0,40
48840,22,201490,9,0,0,20


In [7]:
X[cat_features]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States
1,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States
2,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States
3,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States
4,,Some-college,Never-married,,Own-child,White,Female,United-States
...,...,...,...,...,...,...,...,...
48837,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
48838,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
48839,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
48840,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [8]:
for column in X[cat_features].columns:
  print(X[column].unique())

['Private', 'Local-gov', NaN, 'Self-emp-not-inc', 'Federal-gov', 'State-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked']
Categories (8, object): ['Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc',
                         'Self-emp-not-inc', 'State-gov', 'Without-pay']
['11th', 'HS-grad', 'Assoc-acdm', 'Some-college', '10th', ..., 'Assoc-voc', '9th', '12th', '1st-4th', 'Preschool']
Length: 16
Categories (16, object): ['10th', '11th', '12th', '1st-4th', ..., 'Masters', 'Preschool',
                          'Prof-school', 'Some-college']
['Never-married', 'Married-civ-spouse', 'Widowed', 'Divorced', 'Separated', 'Married-spouse-absent', 'Married-AF-spouse']
Categories (7, object): ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
                         'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed']
['Machine-op-inspct', 'Farming-fishing', 'Protective-serv', NaN, 'Other-service', ..., 'Sales', 'Priv-house-serv', 'Transport-moving', 'Hand

In [9]:
# Further split categorical features into binary and non-binary
binary_cat_features = [col for col in cat_features if X[col].nunique() == 2]
non_binary_cat_features = [col for col in cat_features if X[col].nunique() > 2]

In [10]:
binary_cat_features

['sex']

In [11]:
non_binary_cat_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'native-country']

#numerical

In [12]:
# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X[num_features],
                                                    y,
                                                    test_size=0.2, random_state=42)


In [13]:
GNB = GaussianNB()

GNB.fit(X_train, y_train)

y_predict=GNB.predict(X_test)

In [14]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score as f1

print('ACC: %.4f' % accuracy(y_predict,y_test))
print('F1 : %.4f' %  f1(y_predict,y_test, average = 'macro'))

print (classification_report(y_test, y_predict))

ACC: 0.7992
F1 : 0.6491
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      7479
           1       0.65      0.31      0.42      2290

    accuracy                           0.80      9769
   macro avg       0.73      0.63      0.65      9769
weighted avg       0.78      0.80      0.77      9769



#categorical

In [21]:
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder

# 2. Binary categorical features (OneHotEncoding)
binary_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('onehot', OneHotEncoder())   # One-hot encoding
])

# 3. Non-binary categorical features (TargetEncoding)
non_binary_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('target_enc', TargetEncoder())                        # Target encoding
])

# Combine all transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('binary_cat', binary_cat_transformer, binary_cat_features),
        ('non_binary_cat', non_binary_cat_transformer, non_binary_cat_features)
    ])

# Create final pipeline with Logistic Regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', MinMaxScaler()),
    ('classifier',CategoricalNB())
])

In [22]:
# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X[cat_features],
                                                    y,
                                                    test_size=0.2, random_state=42)
# Fit the pipeline
pipeline.fit(X_train, y_train)


In [23]:
y_pred = pipeline.predict(X_test)
print('ACC: %.4f' % accuracy(y_pred,y_test))
print('F1 : %.4f' %  f1(y_pred,y_test, average = 'macro'))

print (classification_report(y_test, y_pred))

ACC: 0.7228
F1 : 0.6798
              precision    recall  f1-score   support

           0       0.91      0.71      0.80      7479
           1       0.45      0.76      0.56      2290

    accuracy                           0.72      9769
   macro avg       0.68      0.74      0.68      9769
weighted avg       0.80      0.72      0.74      9769



#mixed - numerical and categorical

In [24]:
!pip install mixed-naive-bayes



In [49]:
from mixed_naive_bayes import MixedNB
# Get the column indices for categorical features
cat_feature_indices = [X.columns.get_loc(col) for col in cat_features]
display(cat_feature_indices)



[]

In [38]:
import pandas as pd
binary_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('onehot', OneHotEncoder(drop='first') )  # One-hot encoding
])
preprocessor = ColumnTransformer(
    transformers=[
        ('binary_cat', binary_cat_transformer, binary_cat_features),
        ('non_binary_cat', non_binary_cat_transformer, non_binary_cat_features)
    ],
    remainder="passthrough"  # Keeps numerical features as they are
)

# Create pipeline and apply MinMaxScaler after transformations
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
])

# Fit and transform the data
X_transformed = pipeline.fit_transform(X, y)

# Retrieve transformed column names for final DataFrame
binary_cat_names = pipeline.named_steps['preprocessor'].transformers_[0][1].named_steps['onehot'].get_feature_names_out(binary_cat_features)
non_binary_cat_names = non_binary_cat_features
num_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
columns = list(binary_cat_names) + list(non_binary_cat_names) + num_features

# Return DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=columns)
display(X_transformed_df.head())

Unnamed: 0,sex_Male,workclass,education,marital-status,occupation,relationship,race,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,1.0,0.208473,0.050773,0.04548,0.123097,0.014642,0.120811,0.244221,25.0,226802.0,7.0,0.0,0.0,40.0
1,1.0,0.208473,0.158578,0.446133,0.116107,0.448671,0.253987,0.244221,38.0,89814.0,9.0,0.0,0.0,50.0
2,1.0,0.295599,0.257964,0.446133,0.313327,0.448671,0.253987,0.244221,28.0,336951.0,12.0,0.0,0.0,40.0
3,1.0,0.208473,0.189649,0.446133,0.123097,0.448671,0.120811,0.244221,44.0,160323.0,10.0,7688.0,0.0,40.0
4,0.0,0.208473,0.189649,0.04548,0.339494,0.014642,0.253987,0.244221,18.0,103497.0,10.0,0.0,0.0,30.0


In [53]:
X_transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex_Male        48842 non-null  float64
 1   workclass       48842 non-null  float64
 2   education       48842 non-null  float64
 3   marital-status  48842 non-null  float64
 4   occupation      48842 non-null  float64
 5   relationship    48842 non-null  float64
 6   race            48842 non-null  float64
 7   native-country  48842 non-null  float64
 8   age             48842 non-null  float64
 9   fnlwgt          48842 non-null  float64
 10  education-num   48842 non-null  float64
 11  capital-gain    48842 non-null  float64
 12  capital-loss    48842 non-null  float64
 13  hours-per-week  48842 non-null  float64
dtypes: float64(14)
memory usage: 5.2 MB


In [55]:
X_transformed_encoded = X_transformed_df.copy()
# Select the first 8 column names from X_transformed_df as cat_features
cat_features = X_transformed_df.columns[:8].tolist()
display(cat_features)

['sex_Male',
 'workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'native-country']

In [56]:
from sklearn.preprocessing import LabelEncoder
from mixed_naive_bayes import MixedNB
import numpy as np

# Ensure all categorical features are encoded as integers
label_encoders = {}

# Encode categorical columns (convert each categorical feature to integers)
for col in cat_features:
    le = LabelEncoder()
    X_transformed_encoded[col] = le.fit_transform(X_transformed_encoded[col])
    label_encoders[col] = le  # Save encoders for possible inverse transforms

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed_encoded, y, test_size=0.2, random_state=42
)

# Define categorical feature indices
cat_feature_indices = [X_transformed_encoded.columns.get_loc(col) for col in cat_features]

# Initialize and fit the model
model = MixedNB(categorical_features=cat_feature_indices)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Print performance metrics
print('ACC: %.4f' % accuracy(y_test, y_pred))
print('F1 : %.4f' % f1(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred))

ACC: 0.8279
F1 : 0.7245
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      7479
           1       0.70      0.46      0.56      2290

    accuracy                           0.83      9769
   macro avg       0.78      0.70      0.72      9769
weighted avg       0.82      0.83      0.81      9769

