# Baseline model

In [1]:
import pandas as pd

# File names of the pre-split datasets, resolved against the current working directory.
# NOTE: a commented-out alternative previously pointed at '../data/processed/smish/'.
TRAIN_CSV = 'train_df.csv'
TEST_CSV = 'test_df.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [2]:
# Row counts of the train and test splits, one per line.
print(len(train_df), len(test_df), sep='\n')

35331
7571


In [3]:
# import sys
# sys.path.append('/workspaces/hybrid-cse-system-v2-/')
# from src.hybrid_cse_system_v2.rbs.extractor import extract_rbs_features
from rbs.extractor import extract_rbs_features


# Run the rule-based extractor once per message.
# Iterating the TEXT column directly avoids DataFrame.iterrows(), which
# materialises a full Series for every row just to read a single field.
train_ = [extract_rbs_features(text) for text in train_df['TEXT']]
test_ = [extract_rbs_features(text) for text in test_df['TEXT']]

In [4]:
# Turn the per-message extractor results into tables (one row per message).
X_rbs_train, X_rbs_test = pd.DataFrame(train_), pd.DataFrame(test_)

In [5]:
# Preview the first rows of the rule-based feature table.
X_rbs_train.head()

Unnamed: 0,url_count,has_url,suspicious_tld_count,phone_count,has_phone,known_smishing_phrase_hits,urgency_term_hits,reward_term_hits,text_length,token_estimate
0,0,0,0,0,0,0,0,0,139,30
1,0,0,0,0,0,1,0,0,499,79
2,0,0,0,0,0,0,0,1,187,41
3,0,0,0,0,0,0,1,0,53,11
4,0,0,0,0,0,0,0,0,92,17


# TF-IDF Text representation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: lower-case the text, drop English stop words, ignore terms
# seen in fewer than 3 documents, and keep at most 1000 terms.
tfidf_vectorizer = TfidfVectorizer(
    min_df=3,
    max_features=1000,
    stop_words='english',
    lowercase=True,
)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is then projected onto that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['TEXT'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['TEXT'])

In [7]:
# Shapes of the TF-IDF matrices, train first, then test.
for tfidf_matrix in (X_train_tfidf, X_test_tfidf):
    print(tfidf_matrix.shape)

(35331, 1000)
(7571, 1000)


In [8]:
# Number of terms the fitted vectorizer kept (bounded by max_features=1000).
len(tfidf_vectorizer.vocabulary_)

1000

In [9]:
feature_names = tfidf_vectorizer.get_feature_names_out()
# NOTE(review): `[:-20]` drops the LAST 20 names and prints everything else.
# If a short preview was intended, `[:20]` (first 20) or `[-20:]` (last 20)
# is probably what was meant — confirm before changing.
print(feature_names[:-20])

['00' '000' '10' '100' '1000' '10k' '10p' '12' '123' '1234' '123456'
 '1234567890' '15' '150' '150p' '16' '18' '1st' '20' '200' '24' '25' '250'
 '2nd' '30' '456' '50' '500' '5000' '750' '7890' '800' 'abc' 'ac' 'access'
 'account' 'accounts' 'acount' 'act' 'action' 'activate' 'activities'
 'activity' 'actual' 'actually' 'ad' 'add' 'additional' 'address'
 'advantage' 'adventure' 'ah' 'ahead' 'ai' 'ake' 'al' 'alert' 'amazing'
 'amazon' 'amp' 'answer' 'app' 'appear' 'apply' 'ard' 'asap' 'ask' 'asked'
 'assistance' 'attached' 'attachments' 'attempt' 'attempts' 'attention'
 'authenticity' 'automated' 'available' 'ave' 'avoid' 'await' 'award'
 'awarded' 'away' 'awesome' 'ay' 'babe' 'bad' 'bait' 'balance' 'bank'
 'based' 'begin' 'believe' 'benefits' 'best' 'better' 'beware' 'bg' 'bi'
 'big' 'birthday' 'bit' 'bonus' 'book' 'box' 'boy' 'brand' 'brands'
 'broken' 'bt' 'bucks' 'buddy' 'business' 'buy' 'caim' 'cal' 'called'
 'calling' 'calls' 'camera' 'car' 'card' 'cards' 'care' 'carefully' 'cash'


In [10]:
# combine rbs features with tf-idf train values
from scipy.sparse import hstack

# Dense rule-based feature arrays, to be appended to the sparse TF-IDF matrices.
rbs_train_dense = X_rbs_train.values
rbs_test_dense = X_rbs_test.values

# Column-wise concatenation: TF-IDF columns first, rule-based columns after.
X_train_combined = hstack((X_train_tfidf, rbs_train_dense))
X_test_combined = hstack((X_test_tfidf, rbs_test_dense))

### Baseline System: RBS and Multinomial Naive Bayes

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class RBSTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that wraps a per-text feature extractor.

    Parameters
    ----------
    extractor : callable
        Called once for every input text. The collected results are passed
        to ``pd.DataFrame``, so each call is presumably expected to return a
        mapping of feature name to value — confirm against the extractor.
    """

    def __init__(self, extractor):
        self.extractor = extractor

    def fit(self, X, y=None):
        """Learn nothing; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, x):
        """Apply the extractor to each item of ``x`` and return a 2-D array."""
        rows = list(map(self.extractor, x))
        return pd.DataFrame(rows).values

In [12]:
# TF-IDF over unigrams and bigrams; otherwise lower-cased, English stop words
# removed, min document frequency 3, vocabulary capped at 1000 terms.
tfidf_vectorizer2 = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_features=1000,
    stop_words='english',
    lowercase=True,
)

In [13]:
from sklearn.pipeline import Pipeline, FeatureUnion

# Feature stage: TF-IDF columns and rule-based columns side by side.
hybrid_features = FeatureUnion(
    transformer_list=[
        ('tfidf', tfidf_vectorizer2),
        ('rbs', RBSTransformer(extract_rbs_features)),
    ]
)

# Full model: the combined features feed a Multinomial Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ('features', hybrid_features),
        ('classifier', MultinomialNB()),
    ]
)

In [14]:
# Single source of truth for the label encoding, shared by both splits.
LABEL_TO_ID = {'ham': 0, 'spam': 1, 'smish': 2}

y_train = train_df['LABEL']
y_test = test_df['LABEL']

y_train_mapped = y_train.map(LABEL_TO_ID)
y_test_mapped = y_test.map(LABEL_TO_ID)

# Series.map turns any label missing from the mapping into NaN; fail here
# instead of handing NaN targets to the classifier and metrics further down.
assert y_train_mapped.notna().all(), "unmapped LABEL value in train_df"
assert y_test_mapped.notna().all(), "unmapped LABEL value in test_df"

In [15]:
# Fit the hybrid pipeline (TF-IDF + rule-based features -> MultinomialNB) on the training text.
pipeline.fit(train_df['TEXT'], y_train_mapped)

In [16]:
# Per-class probabilities for every test message (one column per encoded label).
probs = pipeline.predict_proba(test_df['TEXT'])

In [17]:
# Probabilities for the first three test messages.
probs[:3]

array([[0.4164868 , 0.2546004 , 0.3289128 ],
       [0.99549226, 0.001553  , 0.00295474],
       [0.09919849, 0.02958039, 0.87122111]])

In [18]:
# True (string) labels of the first three test messages, for comparison with the probabilities.
y_test[:3]

Unnamed: 0,LABEL
0,ham
1,ham
2,smish


In [19]:
# Column 2 corresponds to 'smish' under the {'ham': 0, 'spam': 1, 'smish': 2} encoding.
smish_risk = probs[:, 2]

In [20]:
# Display the per-message smish probabilities.
smish_risk

array([0.3289128 , 0.00295474, 0.87122111, ..., 0.71223677, 0.08155653,
       0.04094618])

In [21]:
# Hard class predictions from the hybrid pipeline.
y_pred = pipeline.predict(test_df['TEXT'])
# Second name for the probabilities computed earlier (same array object, not a copy).
y_pred_proba = probs

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall/F1 on the test set; classes are the integer codes 0=ham, 1=spam, 2=smish.
print(classification_report(y_test_mapped, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      3625
           1       0.92      0.87      0.89      1797
           2       0.89      0.94      0.91      2149

    accuracy                           0.94      7571
   macro avg       0.93      0.93      0.93      7571
weighted avg       0.94      0.94      0.94      7571



In [23]:
# Confusion matrix for the hybrid pipeline (scikit-learn convention: rows = true class, columns = predicted class).
conf_matrix = confusion_matrix(y_test_mapped, y_pred)
conf_matrix

array([[3500,   54,   71],
       [  48, 1567,  182],
       [  35,   91, 2023]])

In [24]:
# Disabled diagnostic: mean predicted smish risk per true class.
# NOTE(review): this snippet originally compared `y_test` (string labels such
# as 'ham') against integer codes, which selects no rows. The integer-encoded
# `y_test_mapped` is used below so it works if re-enabled.
# import numpy as np

# print("Mean risk for true smish:", np.mean(smish_risk[y_test_mapped == 2]))
# print("Mean risk for ham:", np.mean(smish_risk[y_test_mapped == 0]))
# print("Mean risk for spam:", np.mean(smish_risk[y_test_mapped == 1]))

In [25]:
from sklearn.metrics import roc_auc_score

# One-vs-rest view of the task: smish (code 2) against everything else.
binary_true = y_test_mapped.eq(2).astype(int)
binary_scores = y_pred_proba[:, 2]

roc_auc_score(y_true=binary_true, y_score=binary_scores)

np.float64(0.9853939854159132)

### TF-IDF Only Pipeline
This verifies whether the RBS features actually help the model or just add noise.

In [26]:
from sklearn.base import clone

# Use a fresh, unfitted copy of the vectorizer. `tfidf_vectorizer2` itself is
# already a step inside `hybrid_features`, so fitting this pipeline with the
# shared instance would refit the hybrid pipeline's vectorizer in place.
tfidf_only_pipeline = Pipeline([
    ("tfidf", clone(tfidf_vectorizer2)),
    ("classifier", MultinomialNB())
])

tfidf_only_pipeline.fit(train_df["TEXT"], y_train_mapped)

In [27]:
# Hard predictions and per-class probabilities from the TF-IDF-only pipeline.
y_pred_tfidf = tfidf_only_pipeline.predict(test_df["TEXT"])
y_probs_tfidf = tfidf_only_pipeline.predict_proba(test_df["TEXT"])

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Classification report for the TF-IDF-only pipeline.
print(classification_report(y_true=y_test_mapped, y_pred=y_pred_tfidf))

# Smish-vs-rest ROC AUC, scored with the smish probability column (code 2).
binary_true = y_test_mapped.eq(2).astype(int)
binary_scores_tfidf = y_probs_tfidf[:, 2]
print(roc_auc_score(y_true=binary_true, y_score=binary_scores_tfidf))

# Confusion matrix over the three encoded classes.
conf_matrix = confusion_matrix(y_true=y_test_mapped, y_pred=y_pred_tfidf)
print(conf_matrix)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3625
           1       0.93      0.88      0.91      1797
           2       0.91      0.91      0.91      2149

    accuracy                           0.93      7571
   macro avg       0.93      0.92      0.93      7571
weighted avg       0.93      0.93      0.93      7571

0.9842082108995648
[[3522   38   65]
 [  74 1590  133]
 [ 110   76 1963]]


In detecting smish messages, the hybrid model has a slightly lower precision (0.89) than the TF-IDF-only model (0.91). In terms of recall, however, the hybrid model does better, scoring 0.94 against 0.91 for the TF-IDF-only model. This is likely because the rules in the hybrid model are built to improve recall.

In [29]:
# Leakage check: count message texts that appear in both the train and test splits.
train_texts, test_texts = set(train_df["TEXT"]), set(test_df["TEXT"])

overlap = train_texts & test_texts
len(overlap)

0

In [30]:
# NOTE(review): 1105 and 8400 are unexplained literals — document what ratio this is (prints 0.13).
print(f"{(1105 / 8400):.2f}")

0.13
