In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.metrics import f1_score  #ytrue, ypred
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# NLP toolkits
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def grid_SVC(X_train, y_train, performance_metric='f1', resultsGrid=False):
    model = SVC()
    C = np.linspace(0.000001 , 1000, 10)
    kernels = ['poly', 'rbf', 'linear', 'sigmoid']
    gamma = ['scale', 'auto']
    grid = dict(C = C, kernel = kernels, gamma = gamma)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring=performance_metric,error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    if resultsGrid==True:
        return grid_result.cv_results_
    else:
        return  grid_result.best_estimator_


def lemmatize_text(text): #Lematización del texto.
    tokens = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text



def clean_text(string):
    string = string.lower()
    string = re.sub(r"http(s)?:*", '', string)
    string = re.sub(r"[-/.#&]", ' ', string)
    string = re.sub(r"w{3}", ' ', string)
    string = string.strip()
    string = ' '.join([word for word in string.split() if word not in stop_words])
    string = lemmatize_text(string)
    return string

In [3]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"

In [4]:
df  = pd.read_csv(url)

In [5]:
samples = [df['url'].loc[np.random.randint(0,df.shape[0])] for _ in range(30)]

In [6]:
samples

['https://www.morningbrew.com/account/qUyyLJmF4fNg2Xyvy89idCPX/edit',
 'https://briefingday.com/n/20200701/m',
 'https://www.hollywoodreporter.com/news/bellagio-mgm-resorts-las-vegas-require-guests-wear-face-masks-1300314',
 'https://www.morningbrew.com/daily/stories/2020/06/09/employees-demand-companies-fight-racism',
 'https://janefriedman.us2.list-manage.com/unsubscribe',
 'https://docs.google.com/forms/d/e/1FAIpQLSew3gtG3kLZYHeQK1KPSx-GrsgSOSA-_XgSR8rm8lrZyduqBw/viewform',
 'https://shop.morningbrew.com/',
 'https://www.theatlantic.com/ideas/archive/2020/06/dudes-who-wont-wear-masks/613375/',
 'https://techcrunch.com/2020/06/26/tim-oreilly-makes-a-persuasive-case-for-why-venture-capital-is-starting-to-do-more-harm-than-good/',
 'https://www.liveintent.com/ad-choices/',
 'https://mashable.com/article/amazon-prime-video-watch-parties-feature/',
 'https://www.nytimes.com/2020/06/18/us/american-airlines-mask-brandon-straka.html',
 'https://www.joinhoney.com/',
 'https://mailchi.mp/aust

In [7]:
X, y = df['url'], df['is_spam']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    shuffle=True,
                                                    test_size = 0.3,
                                                    random_state=123)

In [9]:
X_train

1099    https://www.reuters.com/article/us-usa-trump-o...
1448    https://www.eventbrite.com/e/big-friendship-bo...
2327    https://news.rice.edu/2020/06/29/laser-welded-...
1412    https://creativemornings.com/companies/sdco-pa...
1224    https://www.nytimes.com/2020/06/25/world/afric...
                              ...                        
1147    https://en.wikipedia.org/wiki/Tim_O%27Brien_(a...
2154    https://www.washingtonpost.com/privacy-policy/...
1766    https://www.cnbc.com/2020/06/26/amazon-buys-se...
1122    https://www.amazon.com/Rivers-Tides-Andy-Golds...
1346            https://www.gao.gov/assets/710/707839.pdf
Name: url, Length: 2099, dtype: object

In [10]:
# cleaning data
X_train = X_train.apply(lambda x : clean_text(x))
X_test = X_test.apply(lambda x : clean_text(x))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/vscode/nltk_data'
    - '/usr/local/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
