In [30]:
import pandas as pd

real = pd.read_csv('./spam_text_data/real_message.csv')
scam = pd.read_csv("./spam_text_data/scam_message.csv")
test = pd.read_csv("./spam_text_data/test_all.csv")

# Preprocessing

In [31]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

nltk.download('punkt_tab')
nltk.download('maxent_ne_chunker_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/avinanakarmi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/avinanakarmi/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/avinanakarmi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/avinanakarmi/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/avinanakarmi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
from nltk.tree import Tree
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))  # Load stopwords
    words = word_tokenize(text)  # Tokenize text
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
    return " ".join(filtered_words)
    
def remove_names_and_orgs(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    named_entities = ne_chunk(pos_tags, binary=False)
    
    filtered_words = []
    
    for chunk in named_entities:
        if isinstance(chunk, nltk.Tree):
            entity_name = " ".join(c[0] for c in chunk)
            entity_type = chunk.label()
            if entity_type in ["PERSON", "ORGANIZATION"]:  # Remove people and organizations
                filtered_words.append("[REDACTED]")
            else:
                filtered_words.append(entity_name)
        else:
            filtered_words.append(chunk[0])
    
    return " ".join(filtered_words)

In [35]:
def preprocess(text):
    cleaned_text = remove_person_org_entities(text)
    cleaned_text = remove_stopwords(text)
    return text.lower()


real["message"] = real["message"].apply(preprocess)
scam["message"] = scam["message"].apply(preprocess)
test["message"] = test["message"].apply(preprocess)
real.head()

Unnamed: 0,id,scam_flag,message
0,1,0,alert: homeowners insurance premium of $600 du...
1,2,0,auto-debit alert: your netflix subscription ($...
2,3,0,electricity bill due april 10. amount: $125. p...
3,4,0,"final notice: electricity bill due tomorrow, a..."
4,5,0,"friendly reminder: your rent payment of $1,200..."


In [36]:
bg = pd.read_csv("./spam_text_data/combo_message.csv")
bg["message"] = bg["message"].apply(preprocess)

# fit vectorizer

In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


documents = scam["message"] + real["message"]

vectorizer = TfidfVectorizer()

vectorizer.fit_transform(documents)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4970 stored elements and shape (200, 1164)>

# Vectorize

In [38]:
scam_tfidf_matrix = vectorizer.transform(scam["message"])
df_tfidf = pd.DataFrame(scam_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

df_tfidf

Unnamed: 0,00,000,10,11,110,112679,114998,12,120,123,...,www,xyz,year,yes,you,your,yours,zelle,zen,zpass
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.367835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.100986,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.190275,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.165270,0.079007,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.081516,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.159764,0.0,0.0,0.0,0.0
196,0.0,0.274081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
197,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.067747,0.0,0.0,0.0,0.0
198,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.143759,0.0,0.0,0.0,0.0


In [39]:
df_tfidf.to_csv('./spam_text_data/scam_message_vector.csv', index=False)

In [40]:
real_tfidf_matrix = vectorizer.transform(real["message"])
df_tfidf = pd.DataFrame(real_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

df_tfidf

Unnamed: 0,00,000,10,11,110,112679,114998,12,120,123,...,www,xyz,year,yes,you,your,yours,zelle,zen,zpass
0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.289386,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.077556,0.0,0.0,0.0,0.0
2,0.0,0.0,0.253844,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.259553,0.0,0.453305,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.076804,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.173218,0.0,0.0,0.0,0.0
196,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.172626,0.0,0.0,0.0,0.0
197,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.161958,0.0,0.0,0.0,0.0
198,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.169426,0.0,0.0,0.0,0.0


In [41]:
df_tfidf.to_csv('./spam_text_data/real_message_vector.csv', index=False)

In [42]:
test_tfidf_matrix = vectorizer.transform(test["message"])
df_tfidf = pd.DataFrame(test_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

df_tfidf

Unnamed: 0,00,000,10,11,110,112679,114998,12,120,123,...,www,xyz,year,yes,you,your,yours,zelle,zen,zpass
0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.408541,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.109489,0.0,0.0,0.0,0.0
1,0.0,0.0,0.253844,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.259553,0.0,0.453305,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.076804,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
109,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.114350,0.0,0.0,0.0,0.0
110,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
111,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.074505,0.0,0.0,0.0,0.0


In [43]:
df_tfidf.to_csv('./spam_text_data/test_message_vector.csv', index=False)

In [44]:
bg_tfidf_matrix = vectorizer.transform(bg["message"])
df_tfidf = pd.DataFrame(bg_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

df_tfidf

Unnamed: 0,00,000,10,11,110,112679,114998,12,120,123,...,www,xyz,year,yes,you,your,yours,zelle,zen,zpass
0,0.0,0.000000,0.253844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.095160,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.218035,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
196,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.064778,0.0,0.0,0.0,0.0
197,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
198,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.159764,0.0,0.0,0.0,0.0


In [45]:
df_tfidf.to_csv('./spam_text_data/bg_message_vector.csv', index=False)