In [56]:
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression


In [57]:
df = pd.read_csv("../data/processed/combined_profiles.csv")
df.head()

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam,age_group,country
0,45,"New York, United states",white,engineering,divorced,"I’m balanced: secure enough to be vulnerable, ...",1,41-50,United states
1,22,"Debry, United Kingdom",mixed,self-employed,single,easygoing young girl looking for a nice partner,1,21-30,United Kingdom
2,49,"Providence, Utah, United States",white,military,divorced,I enjoy a variety of things. I am a 49 years o...,1,41-50,United States
3,48,"Castrop-Rauxel, Germany, or New York, or Los A...",white,finance,widowed,I am an optimistic person who has different in...,1,41-50,United States
4,48,"Miami, Florida, United States",white,other,widowed,"am a gentle woman, i love going to church, wil...",1,41-50,United States


In [58]:
y = df['scam']
X = df.drop(columns=['scam', 'description', 'age', 'location'])

In [59]:
X

Unnamed: 0,ethnicity,occupation,status,age_group,country
0,white,engineering,divorced,41-50,United states
1,mixed,self-employed,single,21-30,United Kingdom
2,white,military,divorced,41-50,United States
3,white,finance,widowed,41-50,United States
4,white,other,widowed,41-50,United States
...,...,...,...,...,...
5970,white,military,single,61-70,United States
5971,mixed,construction,divorced,51-60,United States
5972,mixed,contractor,widowed,51-60,United Kingdom
5973,black,security,single,31-40,South Africa


In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(X_train)

In [62]:
X_train_encoded = ohe.transform(X_train).toarray()
X_test_encoded = ohe.transform(X_test).toarray()

In [63]:
# fit a logistic regression model on X_train_encoded
model = LogisticRegression(max_iter=1000)
model.fit(X_train_encoded, y_train)

In [64]:
y_pred = model.predict(X_test_encoded)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       594
           1       0.97      0.96      0.96       601

    accuracy                           0.96      1195
   macro avg       0.96      0.96      0.96      1195
weighted avg       0.97      0.96      0.96      1195



In [65]:
# get the weights of the model
weights = model.coef_
weights = weights[0]


In [66]:
feature_importance = pd.DataFrame(zip(ohe.get_feature_names_out(), np.transpose(model.coef_)), columns=['features', 'coef']) 
feature_importance.sort_values(by='coef', ascending=False)[:20]

Unnamed: 0,features,coef
237,country_United States,[5.778727608398155]
235,country_United Kingdom,[3.989595047449851]
59,status_widowed,[2.9931182902675366]
245,country_United states,[2.6070010142396853]
40,occupation_military,[2.2964452596690013]
132,country_Ghana,[2.1031910191248984]
61,age_group_21-30,[2.0215458753088713]
211,country_Switzerland,[1.8339080922201767]
193,country_Russia,[1.7938985781671397]
79,country_Afghanistan,[1.5320189993031847]


In [71]:
y_pred_proba = model.predict_proba(X_test_encoded)

In [72]:
y_pred_proba

array([[9.98816183e-01, 1.18381732e-03],
       [9.99683142e-01, 3.16858381e-04],
       [9.99725742e-01, 2.74257935e-04],
       ...,
       [1.37251976e-02, 9.86274802e-01],
       [9.98586094e-01, 1.41390560e-03],
       [9.35496996e-01, 6.45030040e-02]])

In [73]:
# calculate softmax for y_pred_proba
y_pred_proba_softmax = np.exp(y_pred_proba) / np.sum(np.exp(y_pred_proba), axis=1)[:, None]
y_pred_proba_softmax

array([[0.73059282, 0.26940718],
       [0.73093396, 0.26906604],
       [0.73095072, 0.26904928],
       ...,
       [0.2743726 , 0.7256274 ],
       [0.73050223, 0.26949777],
       [0.70495248, 0.29504752]])

In [74]:
# extract just the probability for class = 1
y_pred_proba_class_1 = y_pred_proba_softmax[:, 1]
y_pred_proba_class_1

array([0.26940718, 0.26906604, 0.26904928, ..., 0.7256274 , 0.26949777,
       0.29504752])