In [95]:

import pandas as pd
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import re



In [89]:
df_scams=pd.read_csv('/Users/benedicthalim/Documents/DSA4264_Dating_Fraud/data/output_scam_profiles.csv')
df_real=pd.read_csv('/Users/benedicthalim/Documents/DSA4264_Dating_Fraud/data/output_real_profiles.csv')

In [98]:
df_scams=df_scams.dropna()
df_scams['age'] = df_scams['age'].str.extract(r'^(\d+)', expand=False)
df_scams['age'] = df_scams['age'].astype(int)

In [100]:
df_combined = pd.concat([df_scams, df_real], ignore_index=True)

In [101]:
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [102]:
df_combined

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam
0,33,"Bollebygd, Sweden",asian,other,separated,"I'm a new migrant in Sweden, if you would like...",0
1,25,"Barcelona, Spain",white,other,single,am a loving and understanding girl very simple...,1
2,31,"California, United States",white,fashion,single,I am a single woman with no kids and seeking f...,1
3,21,"New York, United States",black,other,single,"Few things about me, I am Miss Florence Beka.I...",1
4,53,"Lima, Perú",middle eastern,self-employed,single,"Home, very affectionate sexually discreet",0
...,...,...,...,...,...,...,...
5531,32,"San Juan de Miraflores, Perú",hispanic,legal,single,I am a super friendly person with whom you can...,0
5532,56,"Fayetteville, NC, USA",black,service,single,"I'm educated, I'm retired U.S. Navy and financ...",0
5533,36,"Villars-sur-Ollon, 1884 Ollon, Switzerland",white,self-employed,single,Hello my name is Steven i'm from St-Petersburg...,0
5534,57,"South Shields, UK",white,manager,divorced,"Honest, Loyal, Loving man seeking the same in ...",0


In [103]:

# bin the ages value in scam dataframe
bins = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['0-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
df_combined['age_group'] = pd.cut(df_combined['age'], bins=bins, labels=labels, right=False)
df_combined['age_group'] = df_combined['age_group'].astype('object')

In [104]:
df_combined

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam,age_group
0,33,"Bollebygd, Sweden",asian,other,separated,"I'm a new migrant in Sweden, if you would like...",0,31-40
1,25,"Barcelona, Spain",white,other,single,am a loving and understanding girl very simple...,1,21-30
2,31,"California, United States",white,fashion,single,I am a single woman with no kids and seeking f...,1,31-40
3,21,"New York, United States",black,other,single,"Few things about me, I am Miss Florence Beka.I...",1,21-30
4,53,"Lima, Perú",middle eastern,self-employed,single,"Home, very affectionate sexually discreet",0,51-60
...,...,...,...,...,...,...,...,...
5531,32,"San Juan de Miraflores, Perú",hispanic,legal,single,I am a super friendly person with whom you can...,0,31-40
5532,56,"Fayetteville, NC, USA",black,service,single,"I'm educated, I'm retired U.S. Navy and financ...",0,51-60
5533,36,"Villars-sur-Ollon, 1884 Ollon, Switzerland",white,self-employed,single,Hello my name is Steven i'm from St-Petersburg...,0,31-40
5534,57,"South Shields, UK",white,manager,divorced,"Honest, Loyal, Loving man seeking the same in ...",0,51-60


In [106]:
df_combined['country'] = df_combined['location'].apply(lambda x: x.split(',')[-1].strip())

In [107]:
categorical = [var for var in df_combined.columns if df_combined[var].dtype=='O']
for var in categorical:
    
    print(var, ' contains ', len(df_combined[var].unique()), ' labels')

location  contains  2789  labels
ethnicity  contains  13  labels
occupation  contains  46  labels
status  contains  6  labels
description  contains  5363  labels
age_group  contains  8  labels
country  contains  237  labels


In [123]:
y=df_combined['scam']
X=df_combined.drop(columns=['scam','description','age','location','country'])

In [134]:
X

Unnamed: 0,ethnicity,occupation,status,age_group
0,asian,other,separated,31-40
1,white,other,single,21-30
2,white,fashion,single,31-40
3,black,other,single,21-30
4,middle eastern,self-employed,single,51-60
...,...,...,...,...
5531,hispanic,legal,single,31-40
5532,black,service,single,51-60
5533,white,self-employed,single,31-40
5534,white,manager,divorced,51-60


In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [136]:
import category_encoders as ce

In [137]:
categorical

['location',
 'ethnicity',
 'occupation',
 'status',
 'description',
 'age_group',
 'country']

In [138]:
X_train

Unnamed: 0,ethnicity,occupation,status,age_group
2559,white,military,single,31-40
1876,native american,military,single,51-60
1212,mixed,other,separated,41-50
831,asian,architect,single,41-50
1488,hispanic,housewife,single,41-50
...,...,...,...,...
3772,white,architect,widowed,41-50
5191,white,carer,single,31-40
5226,native american,business,divorced,41-50
5390,hispanic,self-employed,single,41-50


In [139]:
# encode remaining variables with one-hot encoding

encoder = ce.OneHotEncoder(cols=['ethnicity', 'occupation', 'status', 'age_group'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)

In [140]:

cols = X_train.columns

In [141]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [142]:
X_train = pd.DataFrame(X_train, columns=[cols])

In [143]:
X_test = pd.DataFrame(X_test, columns=[cols])

In [144]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB


# instantiate the model
gnb = GaussianNB()


# fit the model
gnb.fit(X_train, y_train)

In [145]:
y_pred = gnb.predict(X_test)

y_pred

array([0, 1, 0, ..., 1, 0, 1])

In [146]:
from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.7951
