In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
import pandas_profiling

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

from urllib.parse import urlparse
from nltk.tokenize import RegexpTokenizer 
from nltk.stem.snowball import SnowballStemmer

In [3]:
pd.set_option('display.max_colwidth', None)

In [36]:
def split_url(url):
    tokenizer = RegexpTokenizer(r'[A-Za-z]+')
    stemmer = SnowballStemmer(language="english")
    sent = tokenizer.tokenize(url)
    sent = [stemmer.stem(word) for word in sent]
    sent = ' '.join(sent)
    return sent

def add_features(df):
    df['label'] = df['label'].apply(lambda label: 1 if label == 'Unsafe' else 0)
    df['stemmed'] = df['url'].apply(split_url)
    df['words_count'] = df['stemmed'].apply(lambda url: len(url.split()))
    
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df = 3, lowercase=True, max_features=150000, stop_words=['url', 'label', 'stemmed', 'words_count', 'len'])
    url_vec = vectorizer.fit_transform(df['stemmed'])
    temp_df = pd.DataFrame(url_vec.toarray(), columns=vectorizer.get_feature_names())
    df['len'] = df['url'].apply(lambda url: len(url))
    df = pd.concat([df, temp_df], axis=1)
    return df
    
def split(df):
    train_text, temp_text, \
    train_labels, temp_labels = train_test_split(df.drop(['url', 'stemmed'], axis=1),   
                                                 df['label'],
                                                 random_state = 42,
                                                 test_size = 0.3,
                                                 stratify=df['label'])
    val_text, test_text, \
    val_labels, test_labels = train_test_split(temp_text,
                                              temp_labels,
                                              random_state = 42,
                                              test_size = 0.5,
                                              stratify=temp_labels)
   
    return train_text.drop(['label'], axis=1), \
           train_labels, \
           val_text.drop(['label'], axis=1), \
           val_labels, \
           test_text.drop(['label'], axis=1), \
           test_labels

def preprocess(df):
    df = add_features(df)
    return split(df)

def train_gbm(train_data, train_labels, val_data, val_labels, test_data, test_labels):
    gbm = CatBoostClassifier(task_type="GPU", logging_level='Silent', 
                             loss_function='Logloss',od_type='Iter')
    eval_pool = Pool(val_data, val_labels)
    gbm.fit(train_data, train_labels, eval_set=eval_pool, use_best_model=True)
    gbm.save_model('catboost_1',
                   format="cbm",
                   export_parameters=None,
                   pool=None)
    pred_labels = gbm.predict(test_data)
    score = roc_auc_score(pred_labels, test_labels)
    print(score)
    return gbm

def get_cross_val(model, train_data, train_labels):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, train_data, train_labels, cv=skf, scoring='roc_auc')
    print(cv_results)

In [5]:
df = pd.read_csv('urls.csv')
train_data, train_labels, val_data, val_labels, test_data, test_labels = preprocess(df)

In [None]:
gbm = CatBoostClassifier(task_type="GPU", logging_level='Silent', loss_function='Logloss',
                        od_type='Iter')
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(model, train_data, train_labels, cv=skf, scoring='roc_auc')
print(cv_results)

In [37]:
gbm = train_gbm(train_data, train_labels, val_data, val_labels, test_data, test_labels)

0.9696997605254716
