In our last exploring NLP notebook we built an email spam detector using Natural Language Processing techniques and the Support Vector Machine (SVM) algorithm for classification.
In this project, we will again build a spam detector but this time using URLs instead of emails.

In [4]:
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
import nltk

from nltk.corpus import stopwords
import unicodedata

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import model_selection, svm
from sklearn.metrics import classification_report, accuracy_score

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/gitpod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [7]:
df.sample(10)

Unnamed: 0,url,is_spam
1175,https://www.zooniverse.org/projects/hiro-ono/a...,False
465,https://www.theskimm.com/skimmpicks,True
2245,https://www.voxmedia.com/legal/terms-of-use,True
328,https://shop.morningbrew.com/,True
2847,https://numlock.substack.com/p/numlock-sunday-...,True
2169,https://www.propublica.org/article/how-north-c...,False
1699,https://www.bbc.com/worklife/article/20200624-...,False
912,https://qz.com/quartzy/1300989/the-best-way-to...,False
2677,https://www.cnn.com/2020/06/28/business/facebo...,False
1746,https://thehustle.co/,True


In [8]:
df['is_spam'] = df['is_spam'].apply(lambda x: 1 if x == True else 0) #columna categorica la pasamos a 0 y 1

In [10]:
print("Cant spam: " +str(len(df.loc[df.is_spam==1])))
print("Cant q no es spam: " +str(len(df.loc[df.is_spam==0])))
print(df.shape)
#df['is_spam'] = df['is_spam'].astype(int) #otra forma!

Cant spam: 244
Cant q no es spam: 2125
(2369, 2)


In [9]:
df = df.drop_duplicates()
df = df.reset_index(inplace = False)[['url','is_spam']]

In [11]:
df['url'] = df['url'].str.lower() 


In [13]:
stop=stopwords.words('english')

def remove_stopwords(message):
    if message is not None:
        words = message.strip().split()
        words_filtered = []
        for word in words:
            if word not in stop:
                words_filtered.append(word)
        result = " ".join(words_filtered)  
    else:
        result=None
    return result

def clean_text_digits(texto):
    #'''Match all digits in the string and replace them by empty string.'''
    if texto is not None:
        pattern1 = r'[0-9]'
        pattern2 = '[^a-zA-Z]'
        pattern3 = "(\\d|\\W)+"
        pattern4 = r'http(s)'
        result = re.sub(pattern1, '', texto)
        result = re.sub(pattern2, '', result)
        result = re.sub(pattern3, '', result)
        result = re.sub(pattern4, '', result)
    else:
        result=None
    return result

#df['url']=df['url'].apply(remove_stopwords)
df['url']=df['url'].apply(clean_text_digits)

#df['url'] = df['url'].str.replace(r'http(s)', '', regex=True)

In [14]:
df['url'] 

0                   briefingdayuslistmanagecomunsubscribe
1                                             wwwhvpercom
2                                     briefingdaycommvnif
3                             briefingdaycomnmcommentform
4                                       briefingdaycomfan
                              ...                        
2364    wwwthevergecomdisneydeepfakefaceswappingresear...
2365    wwwsmartcitiesworldnetnewsnewsdeepfaketechnolo...
2366             techcrunchcomanoptimisticviewofdeepfakes
2367    wwwtechnologyreviewcomthisstartupclaimsitsdeep...
2368                              wwwbbccomnewstechnology
Name: url, Length: 2369, dtype: object

In [15]:
#limpio
def normalize_string(text_string):
    if text_string is not None:
        result = unicodedata.normalize('NFD',text_string).encode('ascii','ignore').decode()
    else:
        result = None
    return result

In [16]:
df['len_url'] = df['url'].apply(lambda x : len(x))
df['contains_subscribe'] = df['url'].apply(lambda x : 1 if "subscribe" in x else 0)
df['contains_hash'] = df['url'].apply(lambda x : 1 if "#" in x else 0)
df['num_digits'] = df['url'].apply(lambda x : len("".join(_ for _ in x if _.isdigit())) )
df['non_https'] = df['url'].apply(lambda x : 1 if "https" in x else 0)
df['num_words'] = df['url'].apply(lambda x : len(x.split("/")))

target = 'is_spam'
features = [f for f in df.columns if f not in ["url", target]]
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=0)

In [17]:
message_vectorizer = CountVectorizer().fit_transform(df['url'])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(message_vectorizer, df['is_spam'], test_size = 0.45, random_state = 42, shuffle = True)

In [19]:
classifier = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

In [20]:
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       959
           1       0.50      0.01      0.02       108

    accuracy                           0.90      1067
   macro avg       0.70      0.50      0.48      1067
weighted avg       0.86      0.90      0.85      1067



In [21]:
print("SVM Accuracy Score -> ",accuracy_score(predictions, y_test)*100)

SVM Accuracy Score ->  89.87816307403936


In [22]:
svm_accuracy_score = round(accuracy_score(predictions, y_test)*100,2)

print(f'Our model achieved {svm_accuracy_score}% accuracy!')

Our model achieved 89.88% accuracy!


Our model achieved 93% accuracy!