In [1]:
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns # used for plot interactive graph.
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV

In [2]:
# Load the labelled website corpus and report its (rows, columns) size.
dataset = pd.read_csv(filepath_or_buffer='website_classification.csv')
dataset.shape

(1408, 4)

In [3]:
# Preview the first rows of the raw dataset.
dataset.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [4]:
# Work on an independent copy that holds only the URL, the cleaned text and the label.
kept_columns = ['website_url', 'cleaned_website_text', 'Category']
df = dataset.loc[:, kept_columns].copy()
df.head()

Unnamed: 0,website_url,cleaned_website_text,Category
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [5]:
# Encode every category as an integer, numbered in order of first appearance.
category_codes, _ = pd.factorize(df['Category'])
df['category_id'] = category_codes

# One row per distinct (Category, category_id) pair.
category_id_df = df[['Category', 'category_id']].drop_duplicates()

# Lookup tables in both directions, used later to turn predictions into names.
category_to_id = dict(category_id_df.to_numpy())
id_to_category = dict(category_id_df[['category_id', 'Category']].to_numpy())

# Dataframe with the new column
df.head()

Unnamed: 0,website_url,cleaned_website_text,Category,category_id
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel,0
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel,0
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel,0
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel,0
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel,0


## Vectorize Websites Classifications
Prepare text description to train the model

In [6]:
# TF-IDF over unigrams and bigrams; terms seen in fewer than 5 documents are dropped.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
)

# Turn every cleaned text into a dense TF-IDF row vector.
tfidf_matrix = tfidf.fit_transform(df['cleaned_website_text'])
features = tfidf_matrix.toarray()

labels = df['category_id']

print("Each of the %d text is represented by %d features (TF-IDF score of unigrams and bigrams)" % features.shape)

Each of the 1408 text is represented by 18865 features (TF-IDF score of unigrams and bigrams)


Splitting the data into train and test sets: the original data was divided into features (X) and target (y), which were then split into train (75%) and test (25%) sets. Thus, the algorithms are trained on one set of data and tested on a completely different set of data (not seen before by the algorithm).

In [7]:
# Raw texts are the inputs, category names are the targets.
X = df['cleaned_website_text']
y = df['Category']

# 75% / 25% split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [8]:
# Candidate classifiers compared by cross-validated accuracy.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    GaussianNB()
]

# 5-fold cross-validation.
# NOTE: `features` was produced by a TF-IDF vectorizer fitted on the whole
# corpus, so each validation fold has already influenced the vocabulary and
# IDF weights; the scores below are therefore slightly optimistic.
CV = 5

# One (model name, fold number, accuracy) record per model and fold.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

# Build the result frame once, from the collected records.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df



Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.719858
1,RandomForestClassifier,1,0.751773
2,RandomForestClassifier,2,0.716312
3,RandomForestClassifier,3,0.736655
4,RandomForestClassifier,4,0.679715
5,LinearSVC,0,0.858156
6,LinearSVC,1,0.932624
7,LinearSVC,2,0.939716
8,LinearSVC,3,0.903915
9,LinearSVC,4,0.879004


In [9]:
# Summarise the per-fold accuracies of each model by mean and standard deviation.
accuracy_by_model = cv_df.groupby('model_name')['accuracy']
mean_accuracy = accuracy_by_model.mean()
std_accuracy = accuracy_by_model.std()

acc = pd.DataFrame({
    'Mean Accuracy': mean_accuracy,
    'Standard deviation': std_accuracy,
})
acc

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
GaussianNB,0.728651,0.055462
LinearSVC,0.902683,0.034687
MultinomialNB,0.852979,0.034688
RandomForestClassifier,0.720863,0.027017


In [10]:
# Hold out 25% of the TF-IDF rows; df.index is split alongside the data so
# that every train/test row can be traced back to its original record.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features,
    labels,
    df.index,
    test_size=0.25,
    random_state=1,
)

# Fit a linear SVM on the training part and predict the held-out part.
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# A probability-calibrated wrapper around a LinearSVC is built later in the
# notebook (CalibratedClassifierCV), so the disabled experiment that used to
# live here has been dropped.



In [11]:
# Refit the linear SVM on the complete corpus before inspecting its weights.
model.fit(features, labels)

# Number of top-weighted terms reported per category and n-gram size.
N = 4

# The vocabulary does not change between categories, so fetch it once
# instead of rebuilding the array on every loop iteration.
# NOTE: `tfidf` must still be the vectorizer that produced `features`; a later
# cell rebinds `tfidf`, so do not re-run this cell after that one.
vocabulary = np.asarray(tfidf.get_feature_names_out())

for Category, category_id in sorted(category_to_id.items()):
    # Terms ordered by ascending coefficient for this category ...
    indices = np.argsort(model.coef_[category_id])
    feature_names = vocabulary[indices]
    # ... walked backwards so that the most indicative terms come first.
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("\n==> '{}':".format(Category))
    print("  * Top unigrams: %s" %(', '.join(unigrams)))
    print("  * Top bigrams: %s" %(', '.join(bigrams)))




==> 'Adult':
  * Top unigrams: bdsm, sex, erotic, bondage
  * Top bigrams: erotic story, sex toy, sex toys, live sex

==> 'Business/Corporate':
  * Top unigrams: investment, client, company, business
  * Top bigrams: real estate, audio visual, year experience, site work

==> 'Computers and Technology':
  * Top unigrams: software, file, web, windows
  * Top bigrams: source code, web hosting, come soon, easy use

==> 'E-Commerce':
  * Top unigrams: shipping, grocery, item, gift
  * Top bigrams: free shipping, gift card, grocery shopping, add cart

==> 'Education':
  * Top unigrams: university, science, student, chemistry
  * Top bigrams: open access, multiple choice, annual meeting, state university

==> 'Food':
  * Top unigrams: recipe, cake, restaurant, tasty
  * Top bigrams: order online, gluten free, home menu, breakfast lunch

==> 'Forums':
  * Top unigrams: forums, forum, answers, google
  * Top bigrams: share knowledge, mon dec, hour ago, post hour

==> 'Games':
  * Top unigrams:

In [12]:
# Split the raw texts (not the pre-computed features) so that the vectorizer
# below is fitted on training documents only.
X_train, X_test, y_train, y_test = train_test_split(X, df['category_id'],
                                                    test_size=0.25,
                                                    random_state = 0)

# NOTE: this rebinds `tfidf`. The earlier cell that pairs `tfidf` with
# `model.coef_` must not be re-run after this cell.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

# Plain (uncalibrated) linear SVM, kept for direct use.
m = LinearSVC().fit(tfidf_vectorizer_vectors, y_train)

# Probability-calibrated SVM used for predict / predict_proba later on.
# Calibrating an already-fitted model (cv="prefit") on the very rows it was
# trained on gives over-confident probabilities, because the calibrator only
# sees scores for samples the SVM has memorised. With cv=5 each calibrator is
# fitted on rows its SVM has not seen.
m1 = CalibratedClassifierCV(estimator=LinearSVC(), cv=5).fit(tfidf_vectorizer_vectors, y_train)



## Scraping Tool

In this section, I am building a scraping tool that scrapes the description of a domain and uses the model to predict its category.

In [13]:
from bs4 import BeautifulSoup
import bs4 as bs4
from urllib.parse import urlparse
import requests
from collections import Counter
import pandas as pd
import os
class ScrapTool:
    '''
    Download a web page and extract the text used to classify the website.

    The class keeps no state; every method works only on its arguments.
    '''

    def visit_url(self, website_url):
        '''
        Visit URL. Download the content. Initialize the BeautifulSoup object.
        Call the parsing methods. Return a Series object.

        Parameters
        ----------
        website_url : str
            Absolute URL, including the scheme (e.g. "https://example.com").

        Returns
        -------
        pd.Series
            Entries "website_url", "website_name" and "website_text"
            (title, meta, heading and body text separated by spaces).

        Raises
        ------
        requests.RequestException
            On connection failure, on timeout (60 s) or when the server
            answers with an HTTP error status.
        '''
        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
        response = requests.get(website_url, timeout=60)
        # A 4xx/5xx page describes the failure, not the website, so it must
        # not be parsed and classified as if it were the site's content.
        response.raise_for_status()

        #lxml is apparently faster than other settings.
        soup = BeautifulSoup(response.content, "lxml")
        text_parts = [
            self.get_html_title_tag(soup),
            self.get_html_meta_tags(soup),
            self.get_html_heading_tags(soup),
            self.get_text_content(soup),
        ]
        result = {
            "website_url": website_url,
            "website_name": self.get_website_name(website_url),
            # Join with spaces: plain concatenation fused the last word of one
            # section with the first word of the next into a single token.
            "website_text": " ".join(part for part in text_parts if part),
        }

        #Convert to Series object and return
        return pd.Series(result)

    def get_website_name(self,website_url):
        '''
        Return the second-to-last label of the URL's host.

        Example: returns "google" from "https://www.google.com".
        A host without any dot (e.g. "localhost") is returned unchanged, and
        a URL without a scheme has no host, which yields an empty string.
        '''
        host_labels = urlparse(website_url).netloc.split(".")
        if len(host_labels) < 2:
            return host_labels[0]
        return host_labels[-2]

    def get_html_title_tag(self,soup):
        '''Return the text content of <title> tag from a webpage ('' when the page has no title)'''
        title = soup.title
        if title is None:
            return ''
        # stripped_strings yields text only, so markup nested inside <title>
        # cannot break the join.
        return '. '.join(title.stripped_strings)

    def get_html_meta_tags(self,soup):
        '''Returns the text content of <meta> tags related to keywords and description from a webpage'''
        wanted_names = ('keywords', 'description')
        tags = soup.find_all(
            lambda tag: tag.name == "meta" and tag.has_attr('name') and tag.has_attr('content')
        )
        # Compare lower-cased names so that e.g. name="Description" is matched too.
        content = [str(tag["content"]) for tag in tags if str(tag["name"]).lower() in wanted_names]
        return ' '.join(content)

    def get_html_heading_tags(self,soup):
        '''returns the text content of heading tags. The assumption is that headings might contain relatively important text.'''
        tags = soup.find_all(["h1","h2","h3","h4","h5","h6"])
        content = [" ".join(tag.stripped_strings) for tag in tags]
        return ' '.join(content)

    def get_text_content(self,soup):
        '''returns the text content of the whole page with some exception to tags. See tags_to_ignore.'''
        tags_to_ignore = {'style', 'script', 'head', 'title', 'meta', '[document]',"h1","h2","h3","h4","h5","h6","noscript"}
        # `string=True` selects every text node; it replaces the deprecated
        # `text=True` spelling of the same argument.
        text_nodes = soup.find_all(string=True)
        result = []
        for node in text_nodes:
            stripped = node.strip()
            # Skip ignored containers, HTML comments, bare numbers and blanks.
            if node.parent.name in tags_to_ignore:
                continue
            if isinstance(node, bs4.element.Comment):
                continue
            if stripped.isnumeric() or len(stripped) == 0:
                continue
            result.append(stripped)
        return ' '.join(result)

import spacy as sp
from collections import Counter
# Ask spaCy to use a GPU when one is available; placed before the model is loaded.
sp.prefer_gpu()
import en_core_web_sm
# If the model package is missing, open an Anaconda prompt as administrator
# and run: python -m spacy download en_core_web_sm
# Shared spaCy pipeline used by clean_text below.
nlp = en_core_web_sm.load()
import re
def clean_text(doc):
    '''
    Normalise raw page text for the classifier.

    The text is run through the spaCy pipeline `nlp`. Stop words, punctuation,
    numeric tokens, tokens that are not purely alphanumeric and the literal
    token "nan" are dropped. Every remaining token is replaced by its
    lower-cased, stripped lemma, and the lemmas are joined with single spaces.
    '''
    skip_words = ["nan"]
    kept_lemmas = []
    for tok in nlp(doc):
        raw = tok.text
        is_noise = (
            tok.is_stop
            or tok.is_punct
            or raw.isnumeric()
            or not raw.isalnum()
            or raw in skip_words
        )
        if not is_noise:
            kept_lemmas.append(str(tok.lemma_.lower().strip()))
    return " ".join(kept_lemmas)

## Labeling Clickstreams Data

In this section, I am iterating over every TLD to label its category. I am counting only those websites whose streaming time was more than 5 seconds.

In [14]:
# Demo: scrape one website and classify it with the calibrated model.
website='https://icloud.com'
scrapTool = ScrapTool()
try:
    web=dict(scrapTool.visit_url(website))
except Exception:
    # `Exception` (not a bare `except:`) lets Ctrl-C / kernel interrupt through.
    # The message text is kept unchanged because later cells look for it.
    print("Connection Timedout!")
else:
    # Classification runs outside the try block, so a bug in cleaning or in
    # the model is reported as itself instead of as a connection problem.
    text=(clean_text(web['website_text']))
    t=fitted_vectorizer.transform([text])
    print(id_to_category[m1.predict(t)[0]])
    # predict_proba columns follow m1.classes_; name them from that order
    # rather than relying on the order in which categories appear in df.
    data=pd.DataFrame(m1.predict_proba(t)*100,
                      columns=[id_to_category[class_id] for class_id in m1.classes_])
    data=data.T
    data.columns=['Probability']
    data.index.name='Category'
    # Per-category probability in percent, highest first, rounded to 2 decimals.
    a=data.sort_values(['Probability'],ascending=False)
    a['Probability']=a['Probability'].round(2)

Social Networking and Messaging


  tags = soup.find_all(text=True)


In [15]:
# Load the clickstream records from the HDF5 file and preview them.
clickstreams = pd.read_hdf(path_or_buf='click_df.h5')
clickstreams.head()

Unnamed: 0,mdn_hash,clickDuration,tld,total_data,zipcode
0,f1d33b2f73242334,120,ibytedtos.com,4915,94901
1,eb5da755091e7756,1,apple.com,8821,94301
2,8316e5886a85ad58,134,tiktokcdn-us.com,1537,93314
3,1d2f9134899da04e,31,apple.com,7750,95670
4,ba3e39bb4b8a9091,0,apple.com,47814,92805


In this section, I am creating a column `category`, which is `None` by default. Then, I will use the `m1` model to assign a category to each TLD.

In [16]:
# Distinct TLDs, in order of first appearance in the clickstream data.
tlds = clickstreams['tld'].unique()

# One row per TLD; the category stays empty until the site has been classified.
categories = dict(tld=tlds, category=[None for _ in tlds])
categories_df = pd.DataFrame(categories)
categories_df.head()


Unnamed: 0,tld,category
0,ibytedtos.com,
1,apple.com,
2,tiktokcdn-us.com,
3,garmin.com,
4,icloud.com,


In [43]:
# Spot check: scrape and classify the TLD stored at position 24186 of `tlds`.
website='https://' + tlds[24186]
scrapTool = ScrapTool()
try:
    web=dict(scrapTool.visit_url(website))
except Exception:
    # `Exception` (not a bare `except:`) lets Ctrl-C / kernel interrupt through.
    # The message text is kept unchanged because later cells look for it.
    print("Connection Timedout!")
else:
    # Classification runs outside the try block, so a bug in cleaning or in
    # the model is reported as itself instead of as a connection problem.
    text=(clean_text(web['website_text']))
    t=fitted_vectorizer.transform([text])
    print(id_to_category[m1.predict(t)[0]])
    # predict_proba columns follow m1.classes_; name them from that order
    # rather than relying on the order in which categories appear in df.
    data=pd.DataFrame(m1.predict_proba(t)*100,
                      columns=[id_to_category[class_id] for class_id in m1.classes_])
    data=data.T
    data.columns=['Probability']
    data.index.name='Category'
    # Per-category probability in percent, highest first, rounded to 2 decimals.
    a=data.sort_values(['Probability'],ascending=False)
    a['Probability']=a['Probability'].round(2)

Connection Timedout!


In [19]:
# Scrape and classify every TLD, writing the predicted category into
# categories_df. ScrapTool holds no state, so one instance serves all requests.
scrapTool = ScrapTool()
for tld in tlds:
    website = 'https://' + str(tld)
    try:
        web = dict(scrapTool.visit_url(website))
        text = clean_text(web['website_text'])
        t = fitted_vectorizer.transform([text])
        # Predict once and reuse the result for both the table and the log.
        category = id_to_category[m1.predict(t)[0]]
    except Exception:
        # Best effort: one unreachable or unparsable site must not stop the
        # run. Catching `Exception` instead of using a bare `except:` keeps
        # the loop interruptible (KeyboardInterrupt is not swallowed).
        # The message text is unchanged because later cells look for it.
        print("Connection Timedout!")
        continue
    print(category)
    categories_df.loc[categories_df['tld'] == tld, 'category'] = category



Connection Timedout!
Connection Timedout!
Connection Timedout!
Connection Timedout!
Connection Timedout!


In [48]:
# Read the saved classification results, one entry per line.
# NOTE: scraped.txt is not written anywhere in the visible notebook; it has to
# exist before this cell runs.
# The `with` block closes the file even if reading fails.
with open("scraped.txt", "r") as my_file:
    data = my_file.read()

# Split on newline ('\n') to get one list element per line.
scraped = data.split("\n")

In [49]:
# Number of entries read from the file; the next cell pairs them with `tlds`.
len(scraped)

69557

In [50]:
# Pair every TLD with the entry read from scraped.txt at the same position.
categories = dict(tld=tlds, category=scraped)
categories_df = pd.DataFrame(data=categories)
categories_df.head()

Unnamed: 0,tld,category
0,ibytedtos.com,Connection Timedout!
1,apple.com,Business/Corporate
2,tiktokcdn-us.com,Connection Timedout!
3,garmin.com,E-Commerce
4,icloud.com,Social Networking and Messaging


In [51]:
# Entries holding the failure message carry no category: turn them into NaN.
failure_to_missing = {"Connection Timedout!": np.nan}
categories_df['category'] = categories_df['category'].replace(failure_to_missing)
categories_df.head()

Unnamed: 0,tld,category
0,ibytedtos.com,
1,apple.com,Business/Corporate
2,tiktokcdn-us.com,
3,garmin.com,E-Commerce
4,icloud.com,Social Networking and Messaging


In [52]:
# List every distinct value of the category column (NaN included).
unique_category_values = pd.unique(categories_df['category'])
print(unique_category_values)

[nan 'Business/Corporate' 'E-Commerce' 'Social Networking and Messaging'
 'Streaming Services' 'Computers and Technology' 'Sports' 'Photography'
 'Travel' 'News' 'Law and Government' 'Games' 'Health and Fitness'
 'Forums' 'Education' 'Food' 'Adult']


## Saving data
In this section, I am saving analyzed categories to incorporate them into the prediction model.


In [54]:
# Save the TLD -> category table to a local HDF5 file under the key 'data'.
# The key is passed by keyword: positional arguments after the path are
# deprecated in recent pandas versions.
categories_df.to_hdf('classified_tld.h5', key='data')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['tld', 'category'], dtype='object')]

  categories_df.to_hdf('classified_tld.h5', 'data')


Then, I will apply one-hot encoding to the website categories. Also, I am applying the labels 'applicable/non-applicable', which indicate whether the TLDs are real and can be accessed.

In [57]:
# One 0/1 indicator column per category; rows whose category is missing get
# zeros in every indicator column.
encoded = pd.get_dummies(data=categories_df, columns=['category'], dtype=int)

In [58]:
# Preview the one-hot encoded table.
encoded.head()

Unnamed: 0,tld,category_Adult,category_Business/Corporate,category_Computers and Technology,category_E-Commerce,category_Education,category_Food,category_Forums,category_Games,category_Health and Fitness,category_Law and Government,category_News,category_Photography,category_Social Networking and Messaging,category_Sports,category_Streaming Services,category_Travel
0,ibytedtos.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,apple.com,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,tiktokcdn-us.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,garmin.com,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,icloud.com,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [59]:
# Save the one-hot encoded table to a local HDF5 file under the key 'data'.
# The key is passed by keyword: positional arguments after the path are
# deprecated in recent pandas versions.
encoded.to_hdf('encoded.h5', key='data')

## Merge datasets
In this section, I am merging the one-hot encoded website classifications with the general clickstream data.

In [61]:
# Reload the one-hot encoded TLD categories saved above.
encoded_websites = pd.read_hdf(path_or_buf='encoded.h5')

In [62]:
# Preview the reloaded encoded table.
encoded_websites.head()

Unnamed: 0,tld,category_Adult,category_Business/Corporate,category_Computers and Technology,category_E-Commerce,category_Education,category_Food,category_Forums,category_Games,category_Health and Fitness,category_Law and Government,category_News,category_Photography,category_Social Networking and Messaging,category_Sports,category_Streaming Services,category_Travel
0,ibytedtos.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,apple.com,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,tiktokcdn-us.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,garmin.com,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,icloud.com,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [63]:
# Load the clickstream records that will receive the category columns.
clickstream_data = pd.read_hdf(path_or_buf='click_df.h5')

In [64]:
# Preview the clickstream records.
clickstream_data.head()

Unnamed: 0,mdn_hash,clickDuration,tld,total_data,zipcode
0,f1d33b2f73242334,120,ibytedtos.com,4915,94901
1,eb5da755091e7756,1,apple.com,8821,94301
2,8316e5886a85ad58,134,tiktokcdn-us.com,1537,93314
3,1d2f9134899da04e,31,apple.com,7750,95670
4,ba3e39bb4b8a9091,0,apple.com,47814,92805


In [65]:
# Attach the category indicator columns to every click by TLD. The left join
# keeps all clickstream rows. `validate` makes pandas raise if a TLD occurs
# more than once in encoded_websites, which would otherwise silently
# duplicate clickstream rows.
clickstreams_classified = clickstream_data.merge(
    encoded_websites,
    on='tld',
    how='left',
    validate='many_to_one',
)

In [66]:
# Preview the merged table.
clickstreams_classified.head()

Unnamed: 0,mdn_hash,clickDuration,tld,total_data,zipcode,category_Adult,category_Business/Corporate,category_Computers and Technology,category_E-Commerce,category_Education,...,category_Forums,category_Games,category_Health and Fitness,category_Law and Government,category_News,category_Photography,category_Social Networking and Messaging,category_Sports,category_Streaming Services,category_Travel
0,f1d33b2f73242334,120,ibytedtos.com,4915,94901,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,eb5da755091e7756,1,apple.com,8821,94301,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8316e5886a85ad58,134,tiktokcdn-us.com,1537,93314,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1d2f9134899da04e,31,apple.com,7750,95670,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ba3e39bb4b8a9091,0,apple.com,47814,92805,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Size of the merged table, for comparison with the input table's row count.
clickstreams_classified.shape

(51546102, 21)

In [68]:
# Size of the clickstream table before the merge.
clickstream_data.shape

(51546102, 5)

In [None]:
# Save the merged clickstream table to a local HDF5 file under the key 'data'.
# The key is passed by keyword: positional arguments after the path are
# deprecated in recent pandas versions.
clickstreams_classified.to_hdf('clickstreams_classified.h5', key='data')