# Dataset preprocessing

In [1]:
import pandas as pd
from nltk.corpus import stopwords
import nltk
from langdetect import detect
import re, string, unicodedata
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import datetime


Core version: 5.3.0
Pillow version: 5.1.0


### Convert the raw dataset to dataframe

In [2]:
## Read text file
filename = 'raw_content/text_content_web.txt'
with open(filename, 'r', encoding="utf-8") as myfile:
    data = myfile.read()
myfile.close()


## split into websites
document = data.split("\n\nNEW WEBSITE\n\n") 

##create list of dataframe columns
website = []
content = []
mcc = []

for doc in document:
    splitted = doc.split(",")
    if len(splitted) >= 3:
        website.append(splitted[0][1:])
        mcc.append(int(splitted[1]))
        content.append(" ".join(splitted[2:]))
        

In [3]:
docs = {'website': website ,'mcc': mcc,  'content': content}

documents = pd.DataFrame(docs, columns = ['website', 'mcc', 'content'])

#### Read the data file

In [4]:
documents = pd.read_csv("dataset_frame.csv")

## Cleaning the text content (sample dataset)
###### tokenization, empty string, lower case, special characters

In [4]:
def remove_non_ascii(words):

    """Remove non-ASCII characters from list of tokenized words"""

    new_words = []

    for word in words:

        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        new_words.append(new_word)

    return new_words


def remove_punctuation(words):

    """Remove punctuation from list of tokenized words"""

    new_words = []

    for word in words:

        new_word = re.sub(r'[^\w\s]', ' ', word)

        if new_word != '' and new_word != " ":

            new_words.append(new_word)

    return new_words



stop_words = stopwords.words('english')

def remove_stopwords(doc):

    """Remove stop words from list of tokenized words"""
    for word in stop_words:
      
        doc = doc.replace(" "+word+" ", ' ')
        
    return doc



def stem_words(words):

    """Stem words in list of tokenized words"""

    stemmer = LancasterStemmer()

    stems = []

    for word in words:

        stem = stemmer.stem(word)

        stems.append(stem)

    return stems



def lemmatize_verbs(words):

    """Lemmatize verbs in list of tokenized words"""

    lemmatizer = WordNetLemmatizer()

    lemmas = []

    for word in words:

        lemma = lemmatizer.lemmatize(word, pos='v')

        lemmas.append(lemma)

    return lemmas



In [5]:
def isLatin(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

    
def clean_data(content):
    content_str = str(content)   #convert content from object type to string
    words = word_tokenize(content_str)
    
    #remove digits, lowercase words
    words = [''.join([i.lower() for i in w if not i.isdigit()]) for w in words]
    
    #remove punctuation
    words = remove_punctuation(words)

    #remove empty strings
    while '' in words:
        words.remove('')
    
    #remove non ascii
    words = remove_non_ascii(words)
    
    #remove stop words   
    #words = [w for w in words if not w in stopwords.words('english')]
    
    #lemmatization
    words = lemmatize_verbs(words)

    doc = " ".join(w for w in words)
    
    #remove stop words
    doc = remove_stopwords(doc)
    
    print("balloon ", datetime.datetime.now() )
    
    return doc


#### Language filter

In [12]:
from langdetect import detect


def filter_en(content):
    try :
        if detect(content) == "en":
            return content
        else:
            return "nan"
    except:
        return "nan"
    
#documents["content"] = documents["content"].apply(filter_en)

#### Add language column to the dataset

In [6]:
taal = []

for row in documents.iterrows():
    try:
        taal.append(detect(row[1].content))
        #print(detect(row[1].content))
    except:
        taal.append("none")
        

#### Save the preprocessed data to file

In [10]:
#save to file
docs.to_csv("dataset_taal.csv", encoding='utf-8', index=False)

AttributeError: 'dict' object has no attribute 'to_csv'

In [3]:
docs = pd.read_csv("dataset_taal.csv")

### Add cluster column to dataframe

#### Read mcc2clusters

In [8]:
mcc2clusters = {}
with open("mcc2cluster") as f:
    lines = f.read().splitlines()
    for line in lines:
        token = line.split(":")
        
        mcc2clusters[int(token[0])] = token[1]

#### Add clusters to dataframe

In [9]:
clusters = []

for row in docs.iterrows():
    code = int(row[1].mcc)
    #print(code)
    try:
        cluster = mcc2clusters[code]
        clusters.append(cluster)
    except:
        #print(code, "unknown")
        clusters.append("unknown")

AttributeError: 'dict' object has no attribute 'iterrows'

In [38]:
#Add cluster column to dataset

df = pd.DataFrame(clusters)
df = df.rename(columns={0: "cluster"})
docs = pd.concat([docs, df], axis=1, sort=False)

### Filter english data

In [4]:
documents_en = docs[docs.taal == "en"]

In [5]:
documents_en

Unnamed: 0,website,mcc,content,taal
0,http://www.max-c-e.com,5967,Sorry your browser doesn't support JavaScript...,en
1,http://www.tradacasino.com,7995,Register Forgot password?\r\n\r\n * GAMES\r\...,en
2,http://bensbingo.com,7995,Login Forgot Password\r\n\r\nToggle navigation...,en
4,http://towa-esutenet.com,7298,Language : __\r\n\r\n * English\r\n * 日本語\r\...,en
5,http://www.mi-porn.com,5967,Login or Create an account.\r\n\r\nWish List \...,en
7,http://getdownxxx.com,5967,* Member Login\r\n\r\nX\r\n\r\nUsername\r\n\...,en
10,http://www.dezzel.com,7999,Skip to content\r\n\r\nDezzel\r\n\r\nGame Chan...,en
11,http://mycharitybingo.com,7995,### Error. Page cannot be displayed. Please co...,en
12,http://www.Resumenow.com,5817,**SUPPORT 7 DAYS A WEEK**\r\n\r\nPHONE EMAIL ...,en
13,http://www.redclouds.com,5967,* Explicit RedClouds\r\n * Free VoyeurWeb\r...,en


#### Apply clean_data to dataset

In [None]:
f = open("text_content_web2_processed.txt", "w", encoding="utf-8")
i = 0
for row in documents_en.iterrows():
    web = row[1].website
    content = clean_data(row[1].content)
    mcc = row[1].mcc
    cluster = row[1].cluster
    taal = row[1].taal
    print(web)
    f.write('%s, %s, %s, %s, %s\n' % (web, mcc, taal, cluster,content))


balloon  2018-12-18 12:08:57.382927
http://www.max-c-e.com
balloon  2018-12-18 12:08:57.450775
http://www.tradacasino.com
balloon  2018-12-18 12:08:57.486679
http://bensbingo.com
balloon  2018-12-18 12:08:57.736030
http://towa-esutenet.com
balloon  2018-12-18 12:09:04.949346
http://www.mi-porn.com
balloon  2018-12-18 12:09:04.999201
http://getdownxxx.com
balloon  2018-12-18 12:09:05.042087
http://www.dezzel.com
balloon  2018-12-18 12:09:05.043084
http://mycharitybingo.com
balloon  2018-12-18 12:09:05.051062
http://www.Resumenow.com
balloon  2018-12-18 12:09:05.149811
http://www.redclouds.com
balloon  2018-12-18 12:09:05.161780
http://www.oudessentials.com
balloon  2018-12-18 12:09:05.351302
http://www.secondlife.com
balloon  2018-12-18 12:09:05.360279
http://vigo-online.com
balloon  2018-12-18 12:09:05.426886
http://www.bookdrivingtestaustralia.com
balloon  2018-12-18 12:09:05.426886
http://getexcellenthealth.com
balloon  2018-12-18 12:09:05.426886
http://www.ecosway.com
balloon  2018-

balloon  2018-12-18 12:09:28.068308
http://www.globalexecutiveevents.com
balloon  2018-12-18 12:09:28.217094
http://www.stolenriches.com
balloon  2018-12-18 12:09:28.388409
http://www.snapfinancial.com
balloon  2018-12-18 12:09:28.388409
http://realfilthyfreaks.com
balloon  2018-12-18 12:09:28.404068
http://thefriendlypal.com
balloon  2018-12-18 12:09:28.404068
http://www.blackrodswhitebods.com
balloon  2018-12-18 12:09:28.404068
http://www.hardfuckhotel.com
balloon  2018-12-18 12:09:28.404068
http://vegasdays.com
balloon  2018-12-18 12:10:44.833858
http://www.router-switch.com
balloon  2018-12-18 12:10:45.188100
http://PokerShares.com
balloon  2018-12-18 12:10:45.188100
http://pickupanygirl.com
balloon  2018-12-18 12:10:45.425720
http://www.turn-page.com
balloon  2018-12-18 12:10:48.640078
http://hornywivescheatcams.com
balloon  2018-12-18 12:10:49.060389
https://www.tradovest.com
balloon  2018-12-18 12:10:49.096081
http://ellabingo.com
balloon  2018-12-18 12:10:49.111702
http://www.i

balloon  2018-12-18 12:11:37.678158
http://www.BestWebVault.com
balloon  2018-12-18 12:11:41.896643
http://www.maleimage-shop.co.uk
balloon  2018-12-18 12:11:42.150686
http://www.paywith.com
balloon  2018-12-18 12:11:42.419109
http://www.gsimarkets.com
balloon  2018-12-18 12:11:42.434726
http://www.sheinside.com
balloon  2018-12-18 12:11:42.450348
http://main.balletbingo.com
balloon  2018-12-18 12:11:42.460883
http://massagecreep.com
balloon  2018-12-18 12:11:42.561182
http://www.uxpin.com
balloon  2018-12-18 12:11:42.592460
http://kellybingo.com
balloon  2018-12-18 12:11:43.277518
http://www.hubfootwear.com
balloon  2018-12-18 12:11:43.293141
http://www.fhtpay.hk
balloon  2018-12-18 12:11:44.030508
http://www.pharmahorse.co.uk
balloon  2018-12-18 12:11:44.569800
http://www.PracticeWealth.com
balloon  2018-12-18 12:11:44.705828
http://gggdevot.com
balloon  2018-12-18 12:11:44.768301
http://pushplay.com
balloon  2018-12-18 12:11:44.864252
http://www.brazzers.com
balloon  2018-12-18 12:1

In [None]:
#documents_en["content"] = documents_en["content"].apply(clean_data)

In [22]:
c= documents_en.content[5]

In [16]:
words = remove_punctuation(words)

In [31]:
c = docs.content[5]

In [32]:
clean_data(c)

balloon  2018-12-18 12:07:48.317742


'login create account wish list currency shop cart item sex toy lingerie main categories popular categories men women couple lingerie dress essentials fetish fun game masturbators sex dolls lube dildos dong bondage clit stimulation vibrators anal toy sleeves extenders adult sex toy lingerie abi favourite adult sex toy abi favourite rabbit vibes abi favourite anal sex toy abi favourite male sex toy abi favourite vibrators abi favourite naughty lingerie abi must accessories abi favourite dildos dvd bluray main categories popular categories fantasy fetish sex act movie genre ethnicity nationality misc anal blow job lesbian orgy big boob cum shots milf parody black gangbang oral sex teen adult dvd bluray film bluray film recent release miporn top rat abi favourite adult movies miporn best sellers time best sellers avn winners abi favourite compilations abi watch porn movie classics vod pornstars z top star b c e f g h j k l n p q r u v w x z carter cruise keisha grey eva lovia riley reid j

In [29]:
remove_stopwords("i me the basak dark")

'i basak dark'

In [17]:
s="me the basak dark"

In [18]:
s.replace("me", "")

' the basak dark'