In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import seaborn as sns

# Download the list of stopwords if not already downloaded
# nltk.download('stopwords')

In [2]:
# Load the pre-processed training profiles. The path is relative, so it is
# resolved against the kernel's current working directory.
TRAIN_PROFILES_CSV = '../data/processed/train_profiles.csv'
df = pd.read_csv(TRAIN_PROFILES_CSV)
df

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam,age_group,country
0,62,"14440 Villanueva de C??rdoba, C??rdoba, Espa??a",white,technology,separated,"I'm going by right,639 I am a very good friend...",0,61-70,Spain
1,33,"Frankfurt, Germany",white,carer,single,"Am loving,caring,honest,passionate,faithful an...",1,31-40,Germany
2,51,"Piscataway, New Jersey, or New York, United St...",other,engineering,divorced,I?€?m ready to explore the next chapter in my ...,1,51-60,United States
3,41,"Tijuana, B.C., M??xico",hispanic,other,single,"40 good vibes, very attentive, respectful, a l...",0,41-50,Mexico
4,42,"Guayaquil, Ecuador",hispanic,technology,single,"I am a normal, happy, fun person. I don't drin...",0,41-50,Ecuador
...,...,...,...,...,...,...,...,...,...
4770,56,"Kansas city, Kansas, United States",white,other,divorced,well im a cool person who travels a lot i love...,1,51-60,United States
4771,26,"Bronx, NY, USA",hispanic,other,single,"I like them older, I want to have a good time.",0,21-30,United States
4772,43,"Colorado Springs, CO, USA",white,other,divorced,I am a good humored single father hard working...,0,41-50,United States
4773,57,"New York, NY, USA",white,sales,divorced,"Hello, I am dual citizen living in New York an...",0,51-60,United States


In [3]:
# Columns that are not used as model inputs: the target itself plus the raw
# `age` and `location` fields (the frame also carries `age_group` and `country`).
non_feature_cols = ['scam', 'age', 'location']

# Feature frame and target vector.
X = df.drop(non_feature_cols, axis=1)
y = df['scam']
X

Unnamed: 0,ethnicity,occupation,status,description,age_group,country
0,white,technology,separated,"I'm going by right,639 I am a very good friend...",61-70,Spain
1,white,carer,single,"Am loving,caring,honest,passionate,faithful an...",31-40,Germany
2,other,engineering,divorced,I?€?m ready to explore the next chapter in my ...,51-60,United States
3,hispanic,other,single,"40 good vibes, very attentive, respectful, a l...",41-50,Mexico
4,hispanic,technology,single,"I am a normal, happy, fun person. I don't drin...",41-50,Ecuador
...,...,...,...,...,...,...
4770,white,other,divorced,well im a cool person who travels a lot i love...,51-60,United States
4771,hispanic,other,single,"I like them older, I want to have a good time.",21-30,United States
4772,white,other,divorced,I am a good humored single father hard working...,41-50,United States
4773,white,sales,divorced,"Hello, I am dual citizen living in New York an...",51-60,United States


# Predict using categorical features only

In [4]:
# Shared 10-fold splitter. Shuffling with a fixed seed keeps the fold
# assignment identical every time the notebook is re-run.
N_SPLITS, SEED = 10, 42
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

In [5]:
# 10-fold cross-validation of a Multinomial Naive Bayes classifier trained on
# the one-hot encoded categorical columns only.

# Select the categorical columns explicitly rather than "everything except
# description": the text-preprocessing cell further down adds a derived `stem`
# column to X, and re-running this cell afterwards would otherwise one-hot
# encode that free text as well.
text_cols = ("description", "stem")
categorical_cols = [col for col in X.columns if col not in text_cols]

fold_scores = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The encoder is fitted on the training fold only. With
    # handle_unknown="ignore", a category first seen in the validation fold
    # gets all-zero one-hot columns for that feature instead of raising.
    encoder = OneHotEncoder(handle_unknown="ignore")
    X_train_ohe = encoder.fit_transform(X_train[categorical_cols])
    X_test_ohe = encoder.transform(X_val[categorical_cols])

    # MultinomialNB accepts scipy sparse matrices directly, so the encoder
    # output is used as-is instead of being converted to a dense array.
    mnb = MultinomialNB()
    mnb.fit(X_train_ohe, y_train)
    y_pred = mnb.predict(X_test_ohe)

    # One record per fold; the report frame is assembled once after the loop.
    fold_scores.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred),
        'recall': recall_score(y_val, y_pred),
        'f1': f1_score(y_val, y_pred),
    })

report = pd.DataFrame(fold_scores, columns=['accuracy', 'precision', 'recall', 'f1'])

# `mnb` and `encoder` from the last fold stay in scope after the loop.
report.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,10.0,10.0,10.0,10.0
mean,0.889843,0.853837,0.939604,0.894507
std,0.013184,0.021411,0.010982,0.012549
min,0.861635,0.8157,0.926087,0.878676
25%,0.883308,0.842518,0.929573,0.88364
50%,0.890167,0.8569,0.938776,0.891858
75%,0.899895,0.868951,0.950342,0.905314
max,0.907757,0.885057,0.953975,0.913043


## Get feature importance (features with the highest log probability under the scam class)

In [6]:
# Per-class feature log probabilities of the Naive Bayes model that is still in
# scope from the last cross-validation fold.
feature_log_probs = mnb.feature_log_prob_

# Output column names of the one-hot encoder fitted in that same fold.
feature_names = encoder.get_feature_names_out()

# Feature name -> log probability under class index 1 (the positive class).
# NOTE(review): this ranks features by how probable they are *within* class 1,
# which favours features that are simply frequent; comparing against
# feature_log_probs[0] would be needed to see which ones separate the classes
# — confirm this is the intended notion of "importance".
feature_log_probs_dict = {
    name: log_p for name, log_p in zip(feature_names, feature_log_probs[1])
}

# Highest log probability first.
sorted_features = sorted(
    feature_log_probs_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)

# Show the ten most probable features.
top_n = 10
for feature, log_prob in sorted_features[:top_n]:
    print(f"Feature: {feature}, Log Probability: {log_prob}")

Feature: ethnicity_white, Log Probability: -1.9935001903200522
Feature: country_United States, Log Probability: -2.003645694996319
Feature: status_single, Log Probability: -2.259437452150575
Feature: age_group_51-60, Log Probability: -2.790065703212745
Feature: status_widowed, Log Probability: -2.811182626653668
Feature: age_group_31-40, Log Probability: -3.0320422028600467
Feature: age_group_41-50, Log Probability: -3.0807373145128922
Feature: occupation_other, Log Probability: -3.1970244635776925
Feature: occupation_military, Log Probability: -3.3784495613246106
Feature: age_group_21-30, Log Probability: -3.4280465024639826


# Predict using description only

In [7]:
stemmer = PorterStemmer()


def stem_words(text):
    """Return `text` with each whitespace-separated token replaced by its stem.

    Tokens are produced by ``str.split()`` (no punctuation handling) and are
    re-joined with single spaces, so runs of whitespace in the input collapse.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())

def to_lower(value):
    """Return the lower-cased form of `value`, obtained from its ``.lower()`` method."""
    lowered = value.lower()
    return lowered

# Stop-word sets keyed by language. Filled lazily so the NLTK word list is read
# once per language instead of once per call (this function is applied row by
# row to the whole description column).
_STOP_WORDS_BY_LANGUAGE = {}


def _load_stop_words(language):
    """Return the cached frozenset of NLTK stop words for `language`."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stopwords(text, language='english'):
    """Return `text` without its stop words.

    Parameters
    ----------
    text : str
        Input text; it is tokenised on whitespace with ``str.split()``.
    language : str, default 'english'
        Name of the NLTK stop-word list to use.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Tokens are compared in
        lower case against the stop-word list but returned in their original
        casing.

    Notes
    -----
    The stop-word list is loaded on first use, so a missing NLTK corpus still
    surfaces as an error on the first call rather than at definition time.
    """
    stop_words = _load_stop_words(language)
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


In [8]:
# Normalise the free-text description in place: lower-case it, then strip
# English stop words. The original text in X['description'] is overwritten.
X['description'] = X['description'].apply(to_lower).apply(remove_stopwords)

# Keep a stemmed version of the cleaned text in its own column.
# NOTE(review): the uni-gram cross-validation cell below vectorises
# `description`, not `stem` — confirm whether `stem` was meant to be used there.
X['stem'] = X['description'].apply(stem_words)

## Uni-gram


In [9]:
# 10-fold cross-validation of Multinomial Naive Bayes on uni-gram counts of the
# cleaned `description` text.
fold_scores = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The vocabulary is learned from the training fold only and capped at
    # 1000 terms; the validation fold is projected onto that vocabulary.
    vectorizer = CountVectorizer(max_features=1000)
    X_train_bow = vectorizer.fit_transform(X_train["description"])
    X_test_bow = vectorizer.transform(X_val["description"])

    # fit() returns the estimator, so `mnb_bow` names the fitted `mnb`.
    mnb = MultinomialNB()
    mnb_bow = mnb.fit(X_train_bow, y_train)
    y_pred = mnb_bow.predict(X_test_bow)

    # One record per fold; the report frame is assembled once after the loop
    # instead of being enlarged row by row.
    fold_scores.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred),
        'recall': recall_score(y_val, y_pred),
        'f1': f1_score(y_val, y_pred),
    })

report = pd.DataFrame(fold_scores, columns=['accuracy', 'precision', 'recall', 'f1'])

# `mnb_bow` and `vectorizer` from the last fold stay in scope after the loop.
report.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,10.0,10.0,10.0,10.0
mean,0.816132,0.798264,0.846432,0.820689
std,0.02188,0.043976,0.024933,0.021633
min,0.782427,0.716912,0.813043,0.78903
25%,0.803983,0.774244,0.832721,0.809484
50%,0.814658,0.794876,0.843002,0.820933
75%,0.829231,0.826613,0.847993,0.834167
max,0.855346,0.878151,0.897778,0.858316


In [10]:
# Per-class term log probabilities of the bag-of-words Naive Bayes model that
# is still in scope from the last cross-validation fold.
feature_log_probs = mnb_bow.feature_log_prob_

# Vocabulary terms of the vectorizer fitted in that same fold.
feature_names = vectorizer.get_feature_names_out()

# Term -> log probability under class index 1 (the positive class in this
# binary problem).
# NOTE(review): terms that are frequent in every profile also rank high here;
# comparing against feature_log_probs[0] would be needed to see which terms
# separate the classes — confirm this is the intended reading.
feature_log_probs_dict = {
    term: log_p for term, log_p in zip(feature_names, feature_log_probs[1])
}

# Highest log probability first.
sorted_features = sorted(
    feature_log_probs_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)

# Show the ten most probable terms.
top_n = 10
for feature, log_prob in sorted_features[:top_n]:
    print(f"Feature: {feature}, Log Probability: {log_prob}")


Feature: love, Log Probability: -3.5650699665393812
Feature: life, Log Probability: -3.9465070085286804
Feature: like, Log Probability: -3.984247336511527
Feature: man, Log Probability: -4.088622144844594
Feature: good, Log Probability: -4.270943701638549
Feature: woman, Log Probability: -4.310125006701345
Feature: person, Log Probability: -4.354111154885594
Feature: looking, Log Probability: -4.428548616333895
Feature: someone, Log Probability: -4.4733674379791415
Feature: honest, Log Probability: -4.5253593446970335


# Train on whole train dataset