https://github.com/YouAITube

In [1]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Fetch the OpenML "adult" dataset (version 2) as pandas objects; it mixes
# numerical and categorical columns.
data = fetch_openml("adult", version=2, as_frame=True)
X = data.data
# Binary target: 1 when the income label is '>50K', 0 otherwise.
y = data.target.eq('>50K').astype(int)

display(X, y)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


Unnamed: 0,class
0,0
1,0
2,1
3,1
4,0
...,...
48837,0
48838,1
48839,0
48840,0


In [2]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  int64   
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capital-gain    48842 non-null  int64   
 11  capital-loss    48842 non-null  int64   
 12  hours-per-week  48842 non-null  int64   
 13  native-country  47985 non-null  category
dtypes: category(8), int64(6)
memory usage: 2.6 MB


In [3]:
# Partition the column names by dtype: float64/int64 columns are treated as
# numerical, category/object columns as categorical.
num_features = list(X.select_dtypes(include=['float64', 'int64']).columns)
cat_features = list(X.select_dtypes(include=['category', 'object']).columns)

display(num_features)


['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [4]:
# Show only the numerical columns selected above.
X.loc[:, num_features]

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,25,226802,7,0,0,40
1,38,89814,9,0,0,50
2,28,336951,12,0,0,40
3,44,160323,10,7688,0,40
4,18,103497,10,0,0,30
...,...,...,...,...,...,...
48837,27,257302,12,0,0,38
48838,40,154374,9,0,0,40
48839,58,151910,9,0,0,40
48840,22,201490,9,0,0,20


In [5]:
# Hold out 20% of the rows for testing; only the numerical columns are used as
# features, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X[num_features],
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Fit a Gaussian Naive Bayes classifier on the training split (fit returns the
# estimator itself), then predict labels for the held-out rows.
GNB = GaussianNB().fit(X_train, y_train)
y_predict = GNB.predict(X_test)

In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score as f1

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and
# macro-averaged F1 give the same number either way, but keeping the
# documented order avoids wrong results if the metric or the averaging mode is
# later changed to an asymmetric one (e.g. precision, recall, weighted F1).
print('ACC: %.4f' % accuracy(y_test, y_predict))
print('F1 : %.4f' % f1(y_test, y_predict, average='macro'))

# Per-class precision / recall / F1 plus the support of each class.
print(classification_report(y_test, y_predict))

ACC: 0.7992
F1 : 0.6491
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      7479
           1       0.65      0.31      0.42      2290

    accuracy                           0.80      9769
   macro avg       0.73      0.63      0.65      9769
weighted avg       0.78      0.80      0.77      9769



What is the class balance (the a priori probability of each class)?

In [8]:
# Prior probability of each class as stored on the fitted GaussianNB model.
GNB.class_prior_

array([0.75950145, 0.24049855])

Predicted class probabilities for the test set

In [9]:
import pandas as pd
import seaborn as sns

# Diverging "vlag" colormap used to shade each probability cell.
vlag_cmap = sns.color_palette("vlag", as_cmap=True)

# One row per test sample, one column per class.
test_proba = pd.DataFrame(GNB.predict_proba(X_test))

# Colour scale is pinned to the [0, 1] probability range; values are shown
# with four decimals.
test_proba.style.background_gradient(cmap=vlag_cmap, vmin=0, vmax=1).format(precision=4)

Unnamed: 0,0,1
0,0.9859,0.0141
1,0.9966,0.0034
2,0.0,1.0
3,0.9937,0.0063
4,0.9911,0.0089
5,0.9982,0.0018
6,0.9813,0.0187
7,0.0004,0.9996
8,0.992,0.008
9,0.9925,0.0075


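Per-class feature statistics learned by the model (comparable to grouping the training data by class and aggregating with count / mean / std)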

In [12]:
# Per-class mean of each numerical feature learned by the model
# (one row per class, one column per feature).
pd.DataFrame(data=GNB.theta_, columns=num_features)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,36.823561,189943.038347,9.593982,146.90986,54.770454,38.838051
1,44.302756,188675.924018,11.601256,3999.263488,198.124508,45.449824


In [13]:
# Per-class variance of each numerical feature learned by the model
# (one row per class, one column per feature).
pd.DataFrame(data=GNB.var_, columns=num_features)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,210.262758,11303500000.0,17.06444,862359.5,98757.610587,165.656123
1,122.807428,10497300000.0,16.744136,215586500.0,358538.770892,135.151573
