In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import seaborn as sns

# Download the list of stopwords if not already downloaded
# nltk.download('stopwords')

In [2]:
# Load the processed training profiles; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../data/processed/train_profiles.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam,age_group,country
0,62,"14440 Villanueva de C??rdoba, C??rdoba, Espa??a",white,technology,separated,"I'm going by right,639 I am a very good friend...",0,61-70,Spain
1,33,"Frankfurt, Germany",white,carer,single,"Am loving,caring,honest,passionate,faithful an...",1,31-40,Germany
2,51,"Piscataway, New Jersey, or New York, United St...",other,engineering,divorced,I?€?m ready to explore the next chapter in my ...,1,51-60,United States
3,41,"Tijuana, B.C., M??xico",hispanic,other,single,"40 good vibes, very attentive, respectful, a l...",0,41-50,Mexico
4,42,"Guayaquil, Ecuador",hispanic,technology,single,"I am a normal, happy, fun person. I don't drin...",0,41-50,Ecuador
...,...,...,...,...,...,...,...,...,...
4770,56,"Kansas city, Kansas, United States",white,other,divorced,well im a cool person who travels a lot i love...,1,51-60,United States
4771,26,"Bronx, NY, USA",hispanic,other,single,"I like them older, I want to have a good time.",0,21-30,United States
4772,43,"Colorado Springs, CO, USA",white,other,divorced,I am a good humored single father hard working...,0,41-50,United States
4773,57,"New York, NY, USA",white,sales,divorced,"Hello, I am dual citizen living in New York an...",0,51-60,United States


In [3]:
# Feature frame: every column except the label and the raw `age` / `location`
# fields (the `age_group` and `country` columns are kept).
X = df.drop(columns=['scam', 'age', 'location'])
# Target vector: the `scam` label column.
y = df['scam']
X

Unnamed: 0,ethnicity,occupation,status,description,age_group,country
0,white,technology,separated,"I'm going by right,639 I am a very good friend...",61-70,Spain
1,white,carer,single,"Am loving,caring,honest,passionate,faithful an...",31-40,Germany
2,other,engineering,divorced,I?€?m ready to explore the next chapter in my ...,51-60,United States
3,hispanic,other,single,"40 good vibes, very attentive, respectful, a l...",41-50,Mexico
4,hispanic,technology,single,"I am a normal, happy, fun person. I don't drin...",41-50,Ecuador
...,...,...,...,...,...,...
4770,white,other,divorced,well im a cool person who travels a lot i love...,51-60,United States
4771,hispanic,other,single,"I like them older, I want to have a good time.",21-30,United States
4772,white,other,divorced,I am a good humored single father hard working...,41-50,United States
4773,white,sales,divorced,"Hello, I am dual citizen living in New York an...",51-60,United States


# Predict using categorical features only

In [4]:
# Shared 10-fold splitter used by the cross-validation loops below; shuffling
# with a fixed seed keeps the fold assignment reproducible between runs.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [5]:
# 10-fold cross-validation of a random forest trained on the one-hot encoded
# categorical columns only.
metric_fns = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# Free-text columns are never one-hot encoded. `stem` only exists once the
# text-preprocessing cell further down has been run, hence errors="ignore":
# this keeps the cell's result the same no matter when it is (re-)executed.
text_columns = ["description", "stem"]

fold_scores = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the encoder on the training fold only; categories that appear only
    # in the validation fold are ignored instead of raising.
    encoder = OneHotEncoder(handle_unknown="ignore")
    X_train_ohe = encoder.fit_transform(X_train.drop(columns=text_columns, errors="ignore"))
    X_val_ohe = encoder.transform(X_val.drop(columns=text_columns, errors="ignore"))

    classifier = RandomForestClassifier(n_estimators=400, criterion="entropy", random_state=42)
    classifier.fit(X_train_ohe, y_train)

    y_pred = classifier.predict(X_val_ohe)

    # Collect one record per fold; the frame is built once after the loop so
    # the metric columns are numeric from the start.
    fold_scores.append({name: metric(y_val, y_pred) for name, metric in metric_fns.items()})

# One row per fold, one column per metric.
report = pd.DataFrame(fold_scores, columns=list(metric_fns))
report.describe()

## Get feature importance

In [None]:
# Rank the one-hot encoded categorical features by random-forest importance.
# NOTE: `classifier` and `encoder` are the objects left over from the LAST
# fold of the cross-validation loop above, so this ranking describes that
# single fold only.
importances = classifier.feature_importances_

# Feature indices ordered from most to least important
sorted_indices = np.argsort(importances)[::-1]

# One-hot feature names ("<column>_<category>"), aligned with `importances`
feature_names = encoder.get_feature_names_out()

# Print the most important features
top_k = 10  # You can change this to get more or fewer top features
# Never ask for more features than exist; indexing past the end would raise.
n_shown = min(top_k, len(importances))
print(f"Top {n_shown} important features:")
for rank, idx in enumerate(sorted_indices[:n_shown], start=1):
    print(f"Feature {rank}: {feature_names[idx]} (Importance: {importances[idx]:.4f})")

Top 10 important words:
Feature 1: ethnicity_hispanic (Importance: 0.1253)
Feature 2: country_United States (Importance: 0.0857)
Feature 3: status_widowed (Importance: 0.0619)
Feature 4: ethnicity_white (Importance: 0.0448)
Feature 5: occupation_military (Importance: 0.0419)
Feature 6: country_Colombia (Importance: 0.0292)
Feature 7: status_separated (Importance: 0.0243)
Feature 8: age_group_21-30 (Importance: 0.0228)
Feature 9: age_group_61-70 (Importance: 0.0223)
Feature 10: occupation_engineering (Importance: 0.0185)


# Predict using description only

In [None]:
# Single Porter stemmer instance shared by every call to `stem_words`.
stemmer = PorterStemmer()


def stem_words(text):
    """Return `text` with each whitespace-separated token replaced by its stem.

    The input is split on whitespace, every token is passed through the
    module-level Porter `stemmer`, and the stems are re-joined with single
    spaces.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())

def to_lower(value):
    """Return `value` converted to lowercase via its `.lower()` method."""
    lowered = value.lower()
    return lowered

# Stop-word sets keyed by language. Filled lazily so the NLTK corpus is read
# once instead of on every call (this function is applied row by row).
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words_for(language):
    """Return the cached frozenset of NLTK stop words for `language`."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stopwords(text):
    """Return `text` without English stop words.

    The text is split on whitespace; a token is dropped when its lowercase
    form is in NLTK's English stop-word list, and the remaining tokens are
    re-joined with single spaces. Tokens are compared as-is, so a stop word
    with attached punctuation (e.g. "the,") is kept.

    Requires the NLTK 'stopwords' corpus to be available (see the
    commented-out `nltk.download('stopwords')` in the imports cell).
    """
    stop_words = _stop_words_for('english')
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


In [None]:
# Normalise the free-text column: lowercase first, then strip English stop words.
# NOTE: X is mutated in place (`description` is overwritten and `stem` is
# added), so any earlier cell re-run after this one sees the modified frame.
X['description'] = X['description'].apply(to_lower).apply(remove_stopwords)

# Stemmed variant of the cleaned text, stored in its own column.
# NOTE(review): the uni-gram cell below vectorizes `description`, not `stem` -
# confirm which column is intended to feed the model.
X['stem'] = X['description'].apply(stem_words)

## Uni-gram


In [None]:
# 10-fold cross-validation of a random forest trained on uni-gram
# bag-of-words counts of the cleaned `description` text.
metric_fns = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

fold_scores = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The vocabulary (capped at 1000 terms) is learned from the training fold
    # only and then re-used to encode the validation fold.
    vectorizer = CountVectorizer(max_features=1000)
    X_train_bow = vectorizer.fit_transform(X_train["description"])
    X_val_bow = vectorizer.transform(X_val["description"])

    classifier_bow = RandomForestClassifier(n_estimators=400, criterion="entropy", random_state=42)
    classifier_bow.fit(X_train_bow, y_train)

    y_pred = classifier_bow.predict(X_val_bow)

    # Collect one record per fold; the frame is built once after the loop so
    # the metric columns are numeric from the start.
    fold_scores.append({name: metric(y_val, y_pred) for name, metric in metric_fns.items()})

# One row per fold, one column per metric.
report = pd.DataFrame(fold_scores, columns=list(metric_fns))
report.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,10.0,10.0,10.0,10.0
mean,0.86346,0.854615,0.874965,0.86412
std,0.012618,0.031878,0.021432,0.015304
min,0.843096,0.788235,0.834746,0.842767
25%,0.857741,0.840763,0.863762,0.852932
50%,0.86283,0.852822,0.87415,0.865975
75%,0.874932,0.877226,0.887402,0.8756
max,0.878407,0.894515,0.905405,0.884


In [None]:
# Rank the bag-of-words tokens by random-forest importance.
# NOTE: `classifier_bow` and `vectorizer` are the objects left over from the
# LAST fold of the uni-gram cross-validation cell above; that cell must have
# been run in this kernel, otherwise this cell fails with a NameError.
importances = classifier_bow.feature_importances_

# Feature indices ordered from most to least important
sorted_indices = np.argsort(importances)[::-1]

# The forest was trained on CountVectorizer columns, so the names must come
# from the vectorizer. The one-hot `encoder` belongs to the categorical
# experiment and its names do not line up with these importances.
feature_names = vectorizer.get_feature_names_out()

# Print the most important words
top_k = 10  # You can change this to get more or fewer top words
# Never ask for more words than the vocabulary holds.
n_shown = min(top_k, len(importances))
print(f"Top {n_shown} important words:")
for rank, idx in enumerate(sorted_indices[:n_shown], start=1):
    print(f"Feature {rank}: {feature_names[idx]} (Importance: {importances[idx]:.4f})")

NameError: name 'classifier_bow' is not defined