In [1]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB

import re
import seaborn as sns


In [2]:
df = pd.read_csv('../data/processed/combined_profiles.csv')

In [3]:
y = df['scam']
X = df.drop(columns=['scam', 'description', 'age', 'location'])

In [4]:
X

Unnamed: 0,ethnicity,occupation,status,age_group,country
0,white,engineering,divorced,41-50,United states
1,mixed,self-employed,single,21-30,United Kingdom
2,white,military,divorced,41-50,United States
3,white,finance,widowed,41-50,United States
4,white,other,widowed,41-50,United States
...,...,...,...,...,...
5970,white,military,single,61-70,United States
5971,mixed,construction,divorced,51-60,United States
5972,mixed,contractor,widowed,51-60,United Kingdom
5973,black,security,single,31-40,South Africa


In [5]:
X.dtypes

ethnicity     object
occupation    object
status        object
age_group     object
country       object
dtype: object

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(X_train)

In [8]:
X_train_encoded = ohe.transform(X_train).toarray()
X_test_encoded = ohe.transform(X_test).toarray()

In [9]:
mnb = MultinomialNB()
mnb.fit(X_train_encoded, y_train)
y_pred = mnb.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       594
           1       0.97      0.95      0.96       601

    accuracy                           0.96      1195
   macro avg       0.96      0.96      0.96      1195
weighted avg       0.96      0.96      0.96      1195



In [10]:
mnb.classes_

array([0, 1])

In [11]:
mnb.class_count_

array([2403., 2377.])

In [12]:
mnb.class_log_prior_
# log probability of each class (not scam, scam) 
# expected to be roughly equal as the counts are balanced

array([-0.68772259, -0.69860136])

In [13]:
log_prob_not_scam = np.transpose(mnb.feature_log_prob_)[:,0]
log_prob_scam = np.transpose(mnb.feature_log_prob_)[:,1]

In [14]:
log_probs = pd.DataFrame(zip(ohe.get_feature_names_out(), log_prob_not_scam, log_prob_scam), columns=['features', 'log_prob_not_scam', 'log_prob_scam']) 
log_probs
# feature log prob: [P(x_i | 0), P(x_i | y)]

Unnamed: 0,features,log_prob_not_scam,log_prob_scam
0,ethnicity_Asian,-9.416704,-8.307459
1,ethnicity_Hispanic,-9.416704,-8.712924
2,ethnicity_Middle Eastern,-9.416704,-7.460161
3,ethnicity_Mixed,-9.416704,-8.307459
4,ethnicity_Native American,-9.416704,-5.363020
...,...,...,...
272,country_or Syria,-9.416704,-8.712924
273,country_or United Kingdom,-9.416704,-8.712924
274,country_south Africa,-9.416704,-8.712924
275,country_united state of american,-9.416704,-8.712924


In [15]:
log_probs.sort_values(by=['log_prob_scam', 'log_prob_not_scam'], ascending=False)

Unnamed: 0,features,log_prob_not_scam,log_prob_scam
14,ethnicity_white,-2.695278,-2.001184
237,country_United States,-9.416704,-2.035211
58,status_single,-2.155479,-2.276774
59,status_widowed,-5.059995,-2.798071
64,age_group_51-60,-3.141942,-2.814398
...,...,...,...
213,country_Tajikistan,-8.723557,-9.406072
214,country_Tanzania,-8.723557,-9.406072
218,country_The Bahamas,-8.723557,-9.406072
221,country_Trinidad y Tobago,-8.723557,-9.406072


## Output probabilities instead

In [16]:
y_pred_proba = mnb.predict_proba(X_test_encoded)
y_pred_proba

array([[9.99894950e-01, 1.05049997e-04],
       [9.99957138e-01, 4.28618328e-05],
       [9.99996502e-01, 3.49761844e-06],
       ...,
       [2.95757118e-04, 9.99704243e-01],
       [9.99832172e-01, 1.67827963e-04],
       [9.93228020e-01, 6.77197974e-03]])

In [17]:
# calculate softmax for y_pred_proba
y_pred_proba_softmax = np.exp(y_pred_proba) / np.sum(np.exp(y_pred_proba), axis=1)[:, None]
y_pred_proba_softmax

array([[0.73101727, 0.26898273],
       [0.73104172, 0.26895828],
       [0.7310572 , 0.2689428 ],
       ...,
       [0.26905774, 0.73094226],
       [0.73099258, 0.26900742],
       [0.72838736, 0.27161264]])

In [18]:
# extract just the probability for class = 1
y_pred_proba_class_1 = y_pred_proba_softmax[:, 1]
y_pred_proba_class_1

array([0.26898273, 0.26895828, 0.2689428 , ..., 0.73094226, 0.26900742,
       0.27161264])