# Baseline model

In [1]:
import pandas as pd

# train_df = pd.read_csv('../data/processed/smish/train_df.csv', )
# test_df = pd.read_csv('../data/processed/smish/test_df.csv')
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

In [2]:
# import sys
# sys.path.append('/workspaces/hybrid-cse-system-v2-/')
# from src.hybrid_cse_system_v2.rbs.extractor import extract_rbs_features
from rbs.extractor import extract_rbs_features


train_ = []
test_ = []

for _, row in train_df.iterrows():
    features = extract_rbs_features(row['TEXT'])
    train_.append(features)

for _, row in test_df.iterrows():
    features = extract_rbs_features(row['TEXT'])
    test_.append(features)

In [3]:
X_rbs_train = pd.DataFrame(train_)
X_rbs_test = pd.DataFrame(test_)

In [4]:
X_rbs_train.head()

Unnamed: 0,url_count,has_url,suspicious_tld_count,phone_count,has_phone,known_smishing_phrase_hits,urgency_term_hits,reward_term_hits,text_length,token_estimate
0,1,1,0,0,0,0,1,2,335,50
1,0,0,0,0,0,0,0,0,24,5
2,0,0,0,0,0,0,2,6,451,78
3,0,0,0,1,1,0,1,0,140,22
4,0,0,0,0,0,0,0,0,58,12


# TD-IDF Text representation

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=1000,
    min_df=3
)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['TEXT'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['TEXT'])

In [11]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(39196, 1000)
(8400, 1000)


In [12]:
len(tfidf_vectorizer.vocabulary_)

1000

In [13]:
feature_names = tfidf_vectorizer.get_feature_names_out()
print(feature_names[:-20])

['00' '000' '10' '100' '1000' '10k' '10p' '12' '123' '1234' '1234567890'
 '15' '150' '150p' '150ppm' '16' '18' '1st' '20' '200' '2000' '24' '25'
 '250' '2nd' '30' '456' '50' '500' '5000' '750' '800' 'abc' 'able' 'ac'
 'access' 'account' 'accounts' 'acount' 'act' 'action' 'activate'
 'activities' 'activity' 'actual' 'actually' 'ad' 'add' 'additional'
 'address' 'advantage' 'adventure' 'ah' 'ahead' 'ai' 'aight' 'ake' 'al'
 'alert' 'amazing' 'amazon' 'ame' 'amp' 'answer' 'anytime' 'app' 'appear'
 'apply' 'ard' 'asap' 'ask' 'asked' 'attached' 'attachments' 'attempt'
 'attempts' 'attention' 'automated' 'available' 'ave' 'avoid' 'await'
 'award' 'awarded' 'away' 'awesome' 'ay' 'babe' 'baby' 'bad' 'bait'
 'balance' 'bank' 'based' 'begin' 'believe' 'benefits' 'best' 'bet'
 'better' 'beware' 'bg' 'bi' 'big' 'birthday' 'bit' 'bonus' 'book' 'box'
 'boy' 'brand' 'brands' 'bring' 'broken' 'bt' 'bucks' 'buddy' 'bus'
 'business' 'buy' 'caim' 'cal' 'called' 'calling' 'calls' 'came' 'camera'
 'car' 'ca

In [14]:
# combine rbs features with tf-idf train values
from scipy.sparse import hstack

X_train_combined  = hstack([X_train_tfidf, X_rbs_train.values])
X_test_combined   = hstack([X_test_tfidf, X_rbs_test.values])

### Baseline System: RBS and Multinomial Naive Bayes

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class RBSTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, extractor):
    self.extractor = extractor

  def fit(self, X, y=None):
    return self

  def transform(self, x):
    features = [self.extractor(text) for text in x]
    df = pd.DataFrame(features)
    return df.values

In [16]:
tfidf_vectorizer2 = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
    min_df=3
)

In [17]:
from sklearn.pipeline import Pipeline, FeatureUnion

hybrid_features = FeatureUnion([
    ('tfidf', tfidf_vectorizer2),
    ('rbs', RBSTransformer(extract_rbs_features)),
])
pipeline = Pipeline([
    ('features', hybrid_features),
    ('classifier', MultinomialNB()),
])

In [18]:
y_train = train_df['LABEL']
y_test = test_df['LABEL']

y_train_mapped = y_train.map({'ham': 0, 'spam': 1, 'smish': 2})
y_test_mapped = y_test.map({'ham': 0, 'spam': 1, 'smish': 2})

In [19]:
pipeline.fit(train_df['TEXT'], y_train_mapped)

In [20]:
probs = pipeline.predict_proba(test_df['TEXT'])

In [21]:
probs[:3]

array([[1.33115408e-10, 9.99567878e-01, 4.32122086e-04],
       [9.90251601e-01, 3.18725055e-03, 6.56114846e-03],
       [9.91072810e-01, 7.18805183e-03, 1.73913805e-03]])

In [22]:
y_test[:3]

Unnamed: 0,LABEL
0,spam
1,ham
2,ham


In [23]:
smish_risk = probs[:, 2]

In [25]:
smish_risk

array([4.32122086e-04, 6.56114846e-03, 1.73913805e-03, ...,
       9.99639177e-01, 1.16355479e-03, 5.12643104e-04])

In [26]:
y_pred = pipeline.predict(test_df['TEXT'])
y_pred_proba = probs

In [28]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test_mapped, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      4351
           1       0.92      0.86      0.89      1799
           2       0.89      0.95      0.92      2250

    accuracy                           0.94      8400
   macro avg       0.93      0.93      0.93      8400
weighted avg       0.94      0.94      0.94      8400



In [29]:
conf_matrix = confusion_matrix(y_test_mapped, y_pred)
conf_matrix

array([[4220,   62,   69],
       [  44, 1552,  203],
       [  37,   78, 2135]])

In [None]:
import numpy as np

print("Mean risk for true smish:", np.mean(smish_risk[y_test == 2]))
print("Mean risk for ham:", np.mean(smish_risk[y_test == 0]))
print("Mean risk for spam:", np.mean(smish_risk[y_test == 1]))