In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import seaborn as sns

# Download the list of stopwords if not already downloaded
# nltk.download('stopwords')

In [2]:
df = pd.read_csv('../data/processed/train_profiles.csv')
df

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam,age_group,country
0,62,"14440 Villanueva de C??rdoba, C??rdoba, Espa??a",white,technology,separated,"I'm going by right,639 I am a very good friend...",0,61-70,Spain
1,33,"Frankfurt, Germany",white,carer,single,"Am loving,caring,honest,passionate,faithful an...",1,31-40,Germany
2,51,"Piscataway, New Jersey, or New York, United St...",other,engineering,divorced,I?€?m ready to explore the next chapter in my ...,1,51-60,United States
3,41,"Tijuana, B.C., M??xico",hispanic,other,single,"40 good vibes, very attentive, respectful, a l...",0,41-50,Mexico
4,42,"Guayaquil, Ecuador",hispanic,technology,single,"I am a normal, happy, fun person. I don't drin...",0,41-50,Ecuador
...,...,...,...,...,...,...,...,...,...
4770,56,"Kansas city, Kansas, United States",white,other,divorced,well im a cool person who travels a lot i love...,1,51-60,United States
4771,26,"Bronx, NY, USA",hispanic,other,single,"I like them older, I want to have a good time.",0,21-30,United States
4772,43,"Colorado Springs, CO, USA",white,other,divorced,I am a good humored single father hard working...,0,41-50,United States
4773,57,"New York, NY, USA",white,sales,divorced,"Hello, I am dual citizen living in New York an...",0,51-60,United States


In [3]:
y = df['scam']
X = df.drop(columns=['scam','age','location'])
X

Unnamed: 0,ethnicity,occupation,status,description,age_group,country
0,white,technology,separated,"I'm going by right,639 I am a very good friend...",61-70,Spain
1,white,carer,single,"Am loving,caring,honest,passionate,faithful an...",31-40,Germany
2,other,engineering,divorced,I?€?m ready to explore the next chapter in my ...,51-60,United States
3,hispanic,other,single,"40 good vibes, very attentive, respectful, a l...",41-50,Mexico
4,hispanic,technology,single,"I am a normal, happy, fun person. I don't drin...",41-50,Ecuador
...,...,...,...,...,...,...
4770,white,other,divorced,well im a cool person who travels a lot i love...,51-60,United States
4771,hispanic,other,single,"I like them older, I want to have a good time.",21-30,United States
4772,white,other,divorced,I am a good humored single father hard working...,41-50,United States
4773,white,sales,divorced,"Hello, I am dual citizen living in New York an...",51-60,United States


# Predict using categorical features only

In [4]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [5]:
report = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1'])
for i, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # encode remaining variables with one-hot encoding
    encoder = OneHotEncoder(handle_unknown="ignore")
    X_train_ohe = encoder.fit_transform(X_train.drop(["description"], axis=1))
    X_val_ohe = encoder.transform(X_val.drop(["description"], axis=1))
    
    # instantiate the model
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train_ohe, y_train)

    # fit the model
    SVM.fit(np.asarray(X_train_ohe.todense()), y_train)

    # make a prediction
    y_pred = SVM.predict(np.asarray(X_val_ohe.todense()))

    # add the results to the report
    report.loc[i] = [accuracy_score(y_val, y_pred), precision_score(y_val, y_pred), recall_score(y_val, y_pred), f1_score(y_val, y_pred)]

report.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,10.0,10.0,10.0,10.0
mean,0.893606,0.864085,0.933595,0.897289
std,0.013659,0.021899,0.014133,0.012069
min,0.872117,0.838028,0.915254,0.876268
25%,0.886852,0.853506,0.923943,0.889451
50%,0.892148,0.85655,0.929751,0.897363
75%,0.904812,0.87435,0.944829,0.904413
max,0.912134,0.902041,0.959459,0.913828


## Get feature importance

In [6]:
# Get the coefficients assigned to each feature
coefficients = SVM.coef_

# Get the feature names from the encoder
feature_names = encoder.get_feature_names_out()

# Create a dictionary to map feature names to coefficients
feature_coefficients = dict(zip(feature_names, coefficients[0]))

# Sort the feature coefficients by their absolute values
sorted_features = sorted(feature_coefficients.items(), key=lambda x: abs(x[1]), reverse=True)

# Print or analyze the top features
for feature, coefficient in sorted_features[:10]:
    print(f"Feature: {feature}, Coefficient: {coefficient}")


Feature: age_group_71-80, Coefficient: -2.94161057967667
Feature: country_Ghana, Coefficient: 2.765322423922937
Feature: status_widowed, Coefficient: 2.5789251252714394
Feature: occupation_military, Coefficient: 2.363227997207985
Feature: ethnicity_hispanic, Coefficient: -2.0812770845003854
Feature: country_Colombia, Coefficient: -2.0
Feature: occupation_manufacturing, Coefficient: -1.765226526288683
Feature: occupation_repair, Coefficient: -1.5705319577490169
Feature: occupation_unemployed, Coefficient: -1.5701178007335892
Feature: country_Russia, Coefficient: 1.5104621139620638


# Predict using description only

In [7]:
stemmer = PorterStemmer()
 
def stem_words(text):
    # Split the input sentence into words
    word_tokens = text.split()
    # Stem each word individually
    stems = [stemmer.stem(word) for word in word_tokens]
    # Join the stemmed words back into a sentence
    stemmed_sentence = ' '.join(stems)
    return stemmed_sentence

def to_lower(value):
    # Process the value and return the result
    # For example, convert the value to lowercase
    return value.lower()

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    return ' '.join([word for word in words if word.lower() not in stop_words])


In [8]:
X['description']=X['description'].apply(to_lower)
X['description'] = X['description'].apply(remove_stopwords)
X['stem']=X['description'].apply(stem_words)

## Uni-gram


In [9]:
report = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1'])
for i, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    vectorizer = CountVectorizer(max_features=1000)

    X_train_bow = vectorizer.fit_transform(X_train["description"])
    X_val_bow = vectorizer.transform(X_val["description"])

    # instantiate the model
    SVM_bow = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM_bow.fit(X_train_bow, y_train)

    # fit the model
    SVM_bow.fit(np.asarray(X_train_bow.todense()), y_train)

    # make a prediction
    y_pred = SVM_bow.predict(np.asarray(X_val_bow.todense()))

    # add the results to the report
    report.loc[i] = [accuracy_score(y_val, y_pred), precision_score(y_val, y_pred), recall_score(y_val, y_pred), f1_score(y_val, y_pred)]

report.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,10.0,10.0,10.0,10.0
mean,0.846704,0.868511,0.816326,0.840895
std,0.01253,0.027408,0.032194,0.01557
min,0.82636,0.822511,0.769874,0.815965
25%,0.839352,0.852291,0.799359,0.830285
50%,0.848168,0.86578,0.810173,0.84184
75%,0.856695,0.89114,0.848167,0.851064
max,0.863732,0.90625,0.856522,0.865424


In [10]:
# Get the coefficients assigned to each feature
coefficients = SVM_bow.coef_

# Get the feature names from the encoder
feature_names = encoder.get_feature_names_out()

# Create a dictionary to map feature names to coefficients
feature_coefficients = dict(zip(feature_names, coefficients[0]))

# Sort the feature coefficients by their absolute values
sorted_features = sorted(feature_coefficients.items(), key=lambda x: abs(x[1]), reverse=True)

# Print or analyze the top features
for feature, coefficient in sorted_features[:10]:
    print(f"Feature: {feature}, Coefficient: {coefficient}")

Feature: country_Mozambique, Coefficient: 2.3812593429023217
Feature: country_Sri Lanka, Coefficient: 1.5673686575579797
Feature: occupation_fashion, Coefficient: -1.5009315599403394
Feature: age_group_31-40, Coefficient: 1.466938135229456
Feature: country_Algeria, Coefficient: 1.4387557164529259
Feature: country_Marruecos, Coefficient: -1.416366361884649
Feature: country_Honduras, Coefficient: 1.4112832123243706
Feature: country_Israel, Coefficient: -1.246406804953082
Feature: ethnicity_other, Coefficient: 1.2404747354111807
Feature: occupation_manufacturing, Coefficient: -1.2385716617379918
