In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

from sklearn.naive_bayes import MultinomialNB



In [6]:
file_path = 'Scam _ Not scam 200 - Sheet1.csv'
data = pd.read_csv(file_path, sep=',')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   about_me         200 non-null    object
 1   label            200 non-null    int64 
 2   contact details  200 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ KB


In [7]:
data.head(7)

Unnamed: 0,about_me,label,contact details
0,A︇︆︈l︇︆︈l︇︆︈ ︇︆︈m︇︆︈y︇︆︈ ︇︆︈s︇︆︈e︇︆︈x︇︆︈y︇︆︈ ︇...,1,1
1,solo busco que me traten como una princesa y m...,1,1
2,buenas tardes yo busco conocer personas yo qui...,1,0
3,Tex бO78OѲ4らч8,1,1
4,"soy madre soltera tengo un hijo de 13 años, so...",1,0
5,Мне 16 если хочешь познакомиться пиши мне в Ин...,1,1
6,"bin für ein Sextreffen zu haben, wenn du Inter...",1,0


In [23]:
# data['about_me_cleaned'] = data['about_me'].str.replace(r'[^a-zA-Zа-яА-Я\s]', '', regex=True).str.lower()
data['about_me_cleaned'] = data['about_me'].str.replace(r'[^\w\s]', '', regex=True).str.lower()
data.head(7)

Unnamed: 0,about_me,label,contact details,about_me_cleaned
0,A︇︆︈l︇︆︈l︇︆︈ ︇︆︈m︇︆︈y︇︆︈ ︇︆︈s︇︆︈e︇︆︈x︇︆︈y︇︆︈ ︇...,1,1,all my sexy photos and videos
1,solo busco que me traten como una princesa y m...,1,1,solo busco que me traten como una princesa y m...
2,buenas tardes yo busco conocer personas yo qui...,1,0,buenas tardes yo busco conocer personas yo qui...
3,Tex бO78OѲ4らч8,1,1,tex бo78oѳ4らч8
4,"soy madre soltera tengo un hijo de 13 años, so...",1,0,soy madre soltera tengo un hijo de 13 años soy...
5,Мне 16 если хочешь познакомиться пиши мне в Ин...,1,1,мне 16 если хочешь познакомиться пиши мне в ин...
6,"bin für ein Sextreffen zu haben, wenn du Inter...",1,0,bin für ein sextreffen zu haben wenn du intere...


In [31]:
# text to nums - TF-IDF
vectorizer = TfidfVectorizer(max_features=100)
X_text = vectorizer.fit_transform(data['about_me_cleaned']).toarray()

X = pd.DataFrame(X_text)
X.columns = X.columns.astype(str)
X['contact_details'] = data['contact details'].values
y = data['label']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [32]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_val_pred = nb_model.predict(X_val)
y_val_prob = nb_model.predict_proba(X_val)[:, 1]

accuracy_val = accuracy_score(y_val, y_val_pred)
roc_auc_val = roc_auc_score(y_val, y_val_prob)
classification_report_val = classification_report(y_val, y_val_pred)

print(f"accuracy_val: {accuracy_val}\n\nroc_auc_val: {roc_auc_val}\n\nclassification_report_val:\n\n{classification_report_val}")

accuracy_val: 0.75

roc_auc_val: 0.7777777777777778

classification_report_val:

              precision    recall  f1-score   support

           0       0.75      0.67      0.71         9
           1       0.75      0.82      0.78        11

    accuracy                           0.75        20
   macro avg       0.75      0.74      0.74        20
weighted avg       0.75      0.75      0.75        20



In [33]:
log_reg = LogisticRegression(max_iter=500, solver='liblinear')
log_reg.fit(X_train, y_train)

y_val_pred = log_reg.predict(X_val)
y_val_prob = log_reg.predict_proba(X_val)[:, 1]

accuracy_val = accuracy_score(y_val, y_val_pred)
roc_auc_val = roc_auc_score(y_val, y_val_prob)
classification_report_val = classification_report(y_val, y_val_pred)

print(f"accuracy_val: {accuracy_val}\n\nroc_auc_val: {roc_auc_val}\n\nclassification_report_val:\n\n{classification_report_val}")

accuracy_val: 0.75

roc_auc_val: 0.7777777777777778

classification_report_val:

              precision    recall  f1-score   support

           0       0.70      0.78      0.74         9
           1       0.80      0.73      0.76        11

    accuracy                           0.75        20
   macro avg       0.75      0.75      0.75        20
weighted avg       0.76      0.75      0.75        20

