## Preprocess raw web content

In [12]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import nltk
from langdetect import detect
import re, string, unicodedata
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import datetime


#### Cleanin text content

In [13]:
def remove_non_ascii(words):

    """Remove non-ASCII characters from list of tokenized words"""

    new_words = []

    for word in words:

        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        new_words.append(new_word)

    return new_words


def remove_punctuation(words):

    """Remove punctuation from list of tokenized words"""

    new_words = []

    for word in words:

        new_word = re.sub(r'[^\w\s]', ' ', word)

        if new_word != '' and new_word != " ":

            new_words.append(new_word)

    return new_words



stop_words = stopwords.words('english')

def remove_stopwords(doc):

    """Remove stop words from list of tokenized words"""
    for word in stop_words:
      
        doc = doc.replace(" "+word+" ", ' ')
        
    return doc



def stem_words(words):

    """Stem words in list of tokenized words"""

    stemmer = LancasterStemmer()

    stems = []

    for word in words:

        stem = stemmer.stem(word)

        stems.append(stem)

    return stems



def lemmatize_verbs(words):

    """Lemmatize verbs in list of tokenized words"""

    lemmatizer = WordNetLemmatizer()

    lemmas = []

    for word in words:

        lemma = lemmatizer.lemmatize(word, pos='v')

        lemmas.append(lemma)

    return lemmas

def isLatin(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

    
def clean_data(content):
    content_str = str(content)   #convert content from object type to string
    words = word_tokenize(content_str)
    
    print("tokenized")
    
    #remove digits, lowercase words
    words = [''.join([i.lower() for i in w if not i.isdigit()]) for w in words]
    
    #remove punctuation
    words = remove_punctuation(words)
    print("punctuation")
    
    #remove empty strings
    while '' in words:
        words.remove('')
    
    #remove non ascii
    words = remove_non_ascii(words)
    print("ascii")
    
    #lemmatization
    words = lemmatize_verbs(words)
    print("lemma")
    doc = " ".join(w for w in words)
    
    #remove stop words
    doc = remove_stopwords(doc)
    print("stopwords")
    #remove special characters
    doc = re.sub('[^A-Za-z0-9]+', ' ', doc)
    
    #print("balloon ", datetime.datetime.now() )
    
    return doc


### Read mcc2clusters

In [14]:
mcc2clusters = {}
with open("mcc2cluster") as f:
    lines = f.read().splitlines()
    for line in lines:
        token = line.split(":")
        
        mcc2clusters[int(token[0])] = token[1]

In [15]:
mcc2clusters[5941]

'marketplace1'

## read raw content & process

In [16]:
def process_raw_content(fname, i):   
    ## Read text file
    #fname = 'raw_content/text_content_web'
    filename = fname + i + ".txt"

    with open(filename, 'r', encoding="utf-8") as myfile:
        data = myfile.read()
    myfile.close()


    ## split into websites
    document = data.split("\n\nNEW WEBSITE\n\n") 

    ##create list of dataframe columns
    website = []
    content = []
    mcc = []

    for doc in document:
        splitted = doc.split(",")
        if len(splitted) >= 3:
            website.append(splitted[0][1:])
            mcc.append(int(splitted[1]))
            content.append(" ".join(splitted[2:]))

    docs = {'website': website ,'mcc': mcc,  'content': content}

    documents = pd.DataFrame(docs, columns = ['website', 'mcc', 'content'])

    print("Dataframe created")

    ## Add Language Column

    taal = []

    for row in documents.iterrows():
        try:
            taal.append(detect(row[1].content))
            #print(detect(row[1].content))
        except:
            taal.append("none")

    df = pd.DataFrame(taal)
    df = df.rename(columns={0: "taal"})
    docs = pd.concat([documents, df], axis=1, sort=False)

    print("Language column added")

    clusters = []

    for row in docs.iterrows():
        code = int(row[1].mcc)
        #print(code)
        try:
            cluster = mcc2clusters[code]
            clusters.append(cluster)
        except:
            #print(code, "unknown")
            clusters.append("unknown")

    #Add cluster column to dataset

    df = pd.DataFrame(clusters)
    df = df.rename(columns={0: "cluster"})
    docs = pd.concat([docs, df], axis=1, sort=False)

    print("Cluster column added")


    documents_en = docs[docs.taal == "en"]
    #return documents_en
    print("english websites: ", len(documents_en))
    
    
    f = open(fname+i+"processed.txt", "w", encoding="utf-8")
    j = 0
    for row in documents_en.iterrows():
        web = row[1].website
        content = clean_data(row[1].content)
        mcc = row[1].mcc
        #cluster = row[1].cluster
        taal = row[1].taal
        f.write('%s, %s, %s, %s\n' % (web, mcc, taal,content))
        print(web, j)
        j+=1
        
    f.close()
    
    

In [43]:
process_raw_content("raw_content/silver_data/attraction-content", "")

Dataframe created
Language column added
Cluster column added
english websites:  37
tokenized
punctuation
ascii
lemma
stopwords
https://boatamsterdam.com/en/ 0
tokenized
punctuation
ascii
lemma
stopwords
https://www.lindbergh.nl/ 1
tokenized
punctuation
ascii
lemma
stopwords
https://www.feyenoord.com/tickets/tour-tickets 2
tokenized
punctuation
ascii
lemma
stopwords
https://tickets.heinekenexperience.com/ 3
tokenized
punctuation
ascii
lemma
stopwords
https://www.johancruijffarena.nl/stadiumtour/tour/visitors-information.htm 4
tokenized
punctuation
ascii
lemma
stopwords
https://tickets.holland.com/en/tours/anne-frank-tour/ 5
tokenized
punctuation
ascii
lemma
stopwords
https://amsterdamlightfestival.com/en/book-a-tour 6
tokenized
punctuation
ascii
lemma
stopwords
https://www.splashtours.nl/en/tickets-prices/ 7
tokenized
punctuation
ascii
lemma
stopwords
https://www.bigbustours.com/en/sydney/sydney-tour-tickets-and-passes/sydney-bus-tours/ 8
tokenized
punctuation
ascii
lemma
stopwords
http

In [13]:
documents_en = documents_en.reset_index()

In [15]:

f = open("raw_content/text_content_web"+"8"+"processed_2.txt", "w", encoding="utf-8")
j = 0
for row in documents_en.iterrows():
    if row[0] > 42:
        web = row[1].website
        content = clean_data(row[1].content)
        mcc = row[1].mcc
        #cluster = row[1].cluster
        taal = row[1].taal
        f.write('%s, %s, %s, %s\n' % (web, mcc, taal,content))
        print(web, j)
        j+=1
f.close()

tokenized
punctuation
ascii
lemma
stopwords
http://www.newchic.com 0
tokenized
punctuation
ascii
lemma
stopwords
http://www.taylorgowns.de.com 1
tokenized
punctuation
ascii
lemma
stopwords
http://www.vmx-racing.com 2
tokenized
punctuation
ascii
lemma
stopwords
http://www.reelhd.com 3
tokenized
punctuation
ascii
lemma
stopwords
http://www.airyexpo.com 4
tokenized
punctuation
ascii
lemma
stopwords
http://www.faerydress.com 5
tokenized
punctuation
ascii
lemma
stopwords
http://www.privatecheatz.com 6
tokenized
punctuation
ascii
lemma
stopwords
http://www.auhl.com.hk 7
tokenized
punctuation
ascii
lemma
stopwords
http://www.singpet.com 8
tokenized
punctuation
ascii
lemma
stopwords
http://www.lightinthebox.com 9
tokenized
punctuation
ascii
lemma
stopwords
http://www.dressonlinemall.com 10
tokenized
punctuation
ascii
lemma
stopwords
http://www.digimatchy.com 11
tokenized
punctuation
ascii
lemma
stopwords
http://www.pomeet.top 12
tokenized
punctuation
ascii
lemma
stopwords
http://www.fantasyhai

In [None]:
for row in r.iterrows():
    if row[0] >= 85:
        web = row[1].website
        content = clean_data(row[1].content)
        mcc = row[1].mcc
        #cluster = row[1].cluster
        taal = row[1].taal
        print(web)
        

tokenized
punctuation
ascii
lemma
stopwords
http://www.slh.com


#### Reading preprocessed data

In [8]:
def read_processed(fname, i):
    filename = fname + i + "processed.txt"
    
    with open(filename, 'r', encoding="utf-8") as myfile:
        lines = myfile.readlines()
    myfile.close()
    
    website = []
    mcc = []
    #cluster = []
    taal = []
    content = []
    
    #print(lines[1].split(", ")[4])
    
    for line in lines:
        splitted = line.split(", ")
        website.append(splitted[0])
        mcc.append(splitted[1])
        taal.append(splitted[2])
        #cluster.append(splitted[3])
        content.append(" ".join(splitted[3:]))
    
    docs = {'website': website ,'mcc': mcc,  "taal" : taal, 'content': content}

    documents = pd.DataFrame(docs, columns = ['website', 'mcc', "taal", 'content'])
    
    return documents



In [20]:
# d = read_processed("raw_content/text_content_web", "")
# print("d")
# d2 = read_processed("raw_content/text_content_web", "2")
# d2 = d.append(d2, ignore_index=True)
# print("d2")
# d3 = read_processed("raw_content/text_content_web", "3")
# print("d3")
# d3 = d2.append(d3, ignore_index=True)
# d4 = read_processed("raw_content/text_content_web", "4")
# print("d4")
# d4 = d3.append(d4, ignore_index=True)
# d5 = read_processed("raw_content/text_content_web", "5")
# print("d5")
# d5 = d4.append(d5, ignore_index=True)
# d6 = read_processed("raw_content/text_content_web", "6")
# print("d6")
# d6 = d5.append(d6, ignore_index=True)
# d7 = read_processed("raw_content/text_content_web", "7")
# print("d7")
# d7 = d6.append(d7, ignore_index=True)
# d8 = read_processed("raw_content/text_content_web", "8")
# print("d8")
# d8 = d7.append(d8, ignore_index=True)


In [48]:
d = read_processed("raw_content/silver_data/attraction-content", "")
d

Unnamed: 0,website,mcc,taal,content
0,https://boatamsterdam.com/en/,7991,en,houseboats canal cruise eng nl book find us bo...
1,https://www.lindbergh.nl/,7991,en,lindbergh tour operators group contact holland...
2,https://www.feyenoord.com/tickets/tour-tickets,7991,en,x fanshop naar feyenoord nl menu fanshop naar ...
3,https://tickets.heinekenexperience.com/,7991,en,you use old browser please update browser view...
4,https://www.johancruijffarena.nl/stadiumtour/t...,7991,en,en arena mobility portal inside arena login ti...
5,https://tickets.holland.com/en/tours/anne-fran...,7991,en,search exact match search title search conten...
6,https://amsterdamlightfestival.com/en/book-a-tour,7991,en,festival artworks stories organisation call co...
7,https://www.splashtours.nl/en/tickets-prices/,7991,en,nl en de search search would rather speak some...
8,https://www.bigbustours.com/en/sydney/sydney-t...,7991,en,big bus tours ticket select day edit total au...
9,https://irishtourtickets.com/,7991,en,giants causeway tour game throne tour tour en ...


In [38]:
media = pd.read_csv("media.csv") 

In [49]:
clusters = []
for row in d.iterrows():
    code = int(row[1].mcc)
    #print(code)
    try:
        cluster = mcc2clusters[code]
        clusters.append(cluster)
    except:
        print(code, "unknown")
        clusters.append("unknown")

#Add cluster column to dataset

df = pd.DataFrame(clusters)
df = df.rename(columns={0: "cluster"})
docs = pd.concat([d, df], axis=1, sort=False)

print("Cluster column added")


Cluster column added


In [50]:
docs

Unnamed: 0,website,mcc,taal,content,cluster
0,https://boatamsterdam.com/en/,7991,en,houseboats canal cruise eng nl book find us bo...,attraction
1,https://www.lindbergh.nl/,7991,en,lindbergh tour operators group contact holland...,attraction
2,https://www.feyenoord.com/tickets/tour-tickets,7991,en,x fanshop naar feyenoord nl menu fanshop naar ...,attraction
3,https://tickets.heinekenexperience.com/,7991,en,you use old browser please update browser view...,attraction
4,https://www.johancruijffarena.nl/stadiumtour/t...,7991,en,en arena mobility portal inside arena login ti...,attraction
5,https://tickets.holland.com/en/tours/anne-fran...,7991,en,search exact match search title search conten...,attraction
6,https://amsterdamlightfestival.com/en/book-a-tour,7991,en,festival artworks stories organisation call co...,attraction
7,https://www.splashtours.nl/en/tickets-prices/,7991,en,nl en de search search would rather speak some...,attraction
8,https://www.bigbustours.com/en/sydney/sydney-t...,7991,en,big bus tours ticket select day edit total au...,attraction
9,https://irishtourtickets.com/,7991,en,giants causeway tour game throne tour tour en ...,attraction


In [51]:
docs.to_csv("attraction_processed.csv", encoding='utf-8', index=False)

In [42]:
docs

Unnamed: 0,website,mcc,taal,content,cluster
0,https://365playing.com,5815,en,play play home register support login unique g...,games
1,https://411playz.com,5815,en,toggle navigation question email protected log...,games
2,https://777streams.com,5815,en,home popular price support login signup take ...,entertainment
3,https://agamemix.com,5815,en,simple interface interface easy use anyone na...,games
4,https://amazingbundle.com,5815,en,amazingbundle home support login join home sup...,entertainment
5,https://amusicheaven.com,5815,en,please enable js home popular support privacy ...,entertainment
6,https://backupflix.com,5815,en,backupflix flix home music game movies registe...,entertainment
7,https://beatshd.com,5815,en,we open hours day days week contact us telepho...,entertainment
8,https://bestgamesfun.com,5815,en,bestgamesfun home support signup login home su...,games
9,https://bestwebfun.com,5815,en,please enable js we open hours day days week e...,entertainment
