## Natural Language Processing and Support Vector Machines project

In [7]:
! pip install pandas
! pip install sklearn

Collecting pandas
  Downloading pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m98.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting pytz>=2020.1
  Using cached pytz-2022.1-py2.py3-none-any.whl (503 kB)
Collecting numpy>=1.18.5
  Using cached numpy-1.23.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.23.1 pandas-1.4.3 pytz-2022.1

[notice] A new release of pip available: 22.1.2 -> 22.2
[notice] To update, run: pip install --upgrade pip
Collecting sklearn
  Using cached sklearn-0.0-py2.py3-none-any.whl
Collecting scikit-learn
  Using cached scikit_learn-1.1.1-cp38-cp38-manylinux_2_17_x86_64

In [10]:
# Import libraries

import pandas as pd
import pickle
import numpy as np
import re
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

**1. Load dataset and do the necessary transformations**

In [11]:
# Location of the raw URL/spam CSV.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'

# Read the dataset straight from the remote CSV.
df_raw = pd.read_csv(DATA_URL)

In [12]:
# Summarize the raw frame: column names, dtypes and non-null counts.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [13]:
# Peek at 10 randomly chosen rows (no random_state is passed, so the rows differ on every run).
df_raw.sample(10)

Unnamed: 0,url,is_spam
2741,https://www.theskimm.com/picks/genius-products...,False
1879,https://www.morningbrew.com/daily/refer-a-friend,True
1959,https://www.cnbc.com/2020/06/27/the-facebook-a...,False
2856,https://numlock.substack.com/p/numlock-sunday-...,True
1483,http://us2.forward-to-friend.com/forward,True
143,https://www.realclearpolitics.com/articles/202...,False
2224,https://www.creosotemaps.com/blm2020/,False
2147,https://thehustle.co/06302020-Coupang-ecommerce/,False
2035,https://www.youtube.com/watch?v=S97MaG3kOMY&fe...,False
2250,https://www.vox.com/recode/2020/6/30/21287053/...,False


In [14]:
# Class balance of the target in the raw data (before duplicates are removed).
df_raw['is_spam'].value_counts()

False    2303
True      696
Name: is_spam, dtype: int64

In [15]:
# Work on a copy so that df_raw keeps the data exactly as it was loaded.
df_interin = df_raw.copy()

In [19]:
# Check duplicates: count fully duplicated rows, report the count, then drop them.
n_duplicated_rows = df_interin.duplicated().sum()
print('Number of duplicated rows:', n_duplicated_rows)

# Keep the first occurrence of each row and renumber the index from 0.
deduplicated = df_interin.drop_duplicates()
df_interin = deduplicated.reset_index(drop=True)

Number of duplicated rows: 630


In [20]:
# Re-check row count and dtypes after the duplicates were removed.
df_interin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2369 entries, 0 to 2368
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2369 non-null   object
 1   is_spam  2369 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 20.9+ KB


In [22]:
# Class balance of the target after de-duplication.
df_interin['is_spam'].value_counts()

False    2125
True      244
Name: is_spam, dtype: int64

In [24]:
# functions to clean the text

def comas(text):
    """
    Replace every comma in ``text`` with a single space.

    Commas are swapped for a space rather than deleted, so the result has
    the same length as the input.
    """
    comma_pattern = re.compile(',')
    return comma_pattern.sub(' ', text)

def espacios(text):
    """
    Collapse each run of two or more consecutive newlines into one newline.

    Single newlines are left as they are.
    """
    newline_run = re.compile(r'(\n{2,})')
    return newline_run.sub('\n', text)

def minuscula(text):
    """
    Return a lowercase copy of ``text``.
    """
    lowered = text.lower()
    return lowered

def numeros(text):
    """
    Replace every run of consecutive digits in ``text`` with a single space.

    Parameters
    ----------
    text : str
        String to process.

    Returns
    -------
    str
        ``text`` with each maximal digit run replaced by one space.
    """
    # Raw string on purpose: "\d" inside a plain literal is an invalid escape
    # sequence, which Python reports with a warning.
    return re.sub(r'\d+', ' ', text)

def caracteres_no_alfanumericos(text):
    """
    Replace each run of non-word characters with a single space.

    Word characters (letters, digits and the underscore) are kept; anything
    else, including whitespace and punctuation, is folded into one space
    per run.
    E.g. hola 'pepito' como le va? -> "hola pepito como le va " (note the
    trailing space left by the final "?").
    """
    non_word_run = re.compile(r"(\W)+")
    return non_word_run.sub(" ", text)

def comillas(text):
    """
    Replace every single-quote character (') with a space.

    Each quote becomes one space, so a quote that sits next to a space
    leaves two spaces behind.
    E.g. hola 'pepito' como le va? -> "hola  pepito  como le va?"
    """
    single_quote = re.compile("'")
    return single_quote.sub(" ", text)

def palabras_repetidas(text):
    """
    Collapse a word that is immediately repeated into a single occurrence.

    Only repetitions separated by exactly one space are matched.
    E.g. hola hola, como les va? a a ustedes -> hola, como les va? a ustedes
    """
    repeated_word = re.compile(r'\b(\w+)( \1\b)+')
    return repeated_word.sub(r'\1', text)

def esp_multiple(text):
    """
    Squeeze each run of one or more space characters into a single space.

    Only the space character is affected; leading and trailing spaces are
    reduced to one space, not removed.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)


def url(text):
    """
    Remove every occurrence of "https://www" or "https://" from ``text``.

    The pattern is not anchored, so matches anywhere in the string are
    removed. The dot that follows "www" is kept, and plain "http://"
    prefixes are left untouched.
    """
    https_prefix = re.compile(r'(https://www|https://)')
    return https_prefix.sub('', text)

In [25]:
# clean url: strip the https prefix, turn runs of non-word characters into
# spaces, then squeeze repeated spaces.
df_interin['url_limpia'] = (
    df_interin['url']
    .apply(url)
    .apply(caracteres_no_alfanumericos)
    .apply(esp_multiple)
)

In [26]:
# Show the first rows to compare each original url with its cleaned version.
df_interin.head()

Unnamed: 0,url,is_spam,url_limpia
0,https://briefingday.us8.list-manage.com/unsubs...,True,briefingday us8 list manage com unsubscribe
1,https://www.hvper.com/,True,hvper com
2,https://briefingday.com/m/v4n3i4f3,True,briefingday com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,False,briefingday com n 20200618 m commentform
4,https://briefingday.com/fan,True,briefingday com fan


In [27]:
# Transform target variable: encode the boolean label as integers (True -> 1, False -> 0).
# astype(int) is the vectorized equivalent of the per-row `1 if x == True else 0`.
df_interin['is_spam'] = df_interin['is_spam'].astype(int)

**2. NLP techniques to preprocess data before modeling**

In [28]:
# Take a copy of the cleaned frame to use for the modeling steps.
df = df_interin.copy()

In [48]:
# Features are the cleaned URL text; the target is the spam label.
X = df['url_limpia']
y = df['is_spam']

# Stratify on y and fix the seed so the split is reproducible.
# test_size is not passed, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=2207,
)

In [38]:
# Number of samples in the training partition.
len(X_train)

1776

In [42]:
# Number of samples in the test partition.
len(X_test)

593

In [49]:
vec = CountVectorizer()

# X_train / X_test are overwritten below with count matrices, so re-running
# this cell on them would feed numbers to the vectorizer. Re-select the text
# rows from X through the split's index labels to keep the cell re-runnable.
X_train_text = X.loc[y_train.index]
X_test_text = X.loc[y_test.index]

# Learn the vocabulary on the training text only, then reuse it for the test text.
X_train = vec.fit_transform(X_train_text).toarray()
X_test = vec.transform(X_test_text).toarray()

In [45]:
# (rows, columns) of the vectorized training matrix.
X_train.shape

(1776, 5650)

In [46]:
# (rows, columns) of the vectorized test matrix.
X_test.shape

(593, 5650)

**3. SVM to construct a classifier for URLs**

3.1 No hyperparameter tuning

In [56]:
# Baseline: linear-kernel SVM with hand-picked hyperparameters.
# NOTE(review): gamma presumably has no effect with kernel='linear' - confirm against the SVC docs.
classifier = SVC(kernel='linear', C=1.0, gamma='auto')
classifier.fit(X_train, y_train)

# Evaluate on the held-out partition.
predictions = classifier.predict(X_test)
baseline_report = classification_report(y_test, predictions)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       532
           1       0.71      0.64      0.67        61

    accuracy                           0.94       593
   macro avg       0.83      0.80      0.82       593
weighted avg       0.93      0.94      0.93       593



3.2 Hyperparameter tuning with GridSearchCV

In [51]:
# Search space: 4 values of C x 4 values of gamma x 3 kernels = 48 candidates.
# The 'linear' kernel used by the baseline is not part of this grid, and no
# scoring argument is passed to the search.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

svc_base = SVC(random_state=1234)
grid = GridSearchCV(svc_base, param_grid, verbose=2)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   5.7s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   5.6s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   5.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   5.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   4.9s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   1.6s
[CV] END .....................C=0.1, gamma=1, k

In [52]:
# Best hyperparameters: the parameter combination selected by the grid search.

grid.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [53]:
# Best model: the estimator the grid search exposes for its selected parameters.

grid.best_estimator_

In [55]:
# Performance of best_estimator on the held-out test partition
tuned_svc = grid.best_estimator_
pred_grid = tuned_svc.predict(X_test)

tuned_report = classification_report(y_test, pred_grid)
print(tuned_report)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       532
           1       0.67      0.64      0.66        61

    accuracy                           0.93       593
   macro avg       0.82      0.80      0.81       593
weighted avg       0.93      0.93      0.93       593



In [57]:
# The performance (accuracy and f1 for classes 0 and 1) in the test set
# was better in the previous model, with the ad-hoc hyperparameters
# Save it as best model

best_model = classifier

# Save it for future use. The context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('../models/best_model.pickle', 'wb') as model_file:
    pickle.dump(best_model, model_file)  # save the model

# To read the model back in the future:
# with open('../models/best_model.pickle', 'rb') as model_file:
#     modelo = pickle.load(model_file)
# NOTE(review): the fitted CountVectorizer (`vec`) is not saved in this cell;
# new URLs cannot be encoded for this model without it - confirm whether it
# should be persisted as well.