In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, roc_auc_score


import json
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize.punkt import PunktToken
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import lda
from sklearn.decomposition import TruncatedSVD


import warnings
warnings.filterwarnings('ignore')

In [2]:
def info_columns(data_):
    total_na = data_.isna().sum().sum()
    print("Dimensional:", data_.shape[0], "rows,", data_.shape[1], "columns")
    print("Total NA values: %d" %(total_na))
    print("%38s%10s%10s%10s" %("Column Name", "Data type", "#Distinct", "#NaN"))
    col_name = data_.columns
    dtype = data_.dtypes
    uniq = data_.nunique()
    
    for i in range(len(col_name)):
        print("%38s%10s%10s%10s" %(col_name[i], dtype[i], uniq[i], data_[col_name[i]].isna().sum()))

In [3]:
dataTrain = pd.read_csv('train.tsv', delimiter='\t',encoding='utf-8')
dataTest = pd.read_csv('test.tsv', delimiter='\t',encoding='utf-8')

dataAll = dataTrain.append(dataTest, ignore_index=True)
dataAll['Type'] = 'training'
dataAll['Type'][np.isnan(dataAll.label)] = 'testing'
dataAll

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,Type
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,24,0,5424,170,8,0.152941,0.079130,0.0,training
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,...,1,40,0,4973,187,9,0.181818,0.125448,1.0,training
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,1,55,0,2240,258,11,0.166667,0.057613,1.0,training
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,...,0,24,0,2737,120,5,0.041667,0.100858,1.0,training
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,...,1,14,0,12032,162,10,0.098765,0.082569,0.0,training
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10561,http://busy-mommy.com/2012/02/peep-brownie-smo...,7264,"{""title"":""Peep Brownie S mores Busy Mommy An I...",?,?,1.666667,0.376623,0.129870,0.116883,0.090909,...,0,16,0,2772,77,3,0.012987,0.063401,,testing
10562,http://www.cannabissearch.com/edibles/cheesecake/,9714,"{""url"":""cannabissearch edibles cheesecake"",""ti...",?,?,1.305556,0.654321,0.123457,0.024691,0.000000,...,0,6,0,6058,81,2,0.333333,0.061995,,testing
10563,http://www.tastespotting.com/popular/views/all...,5903,"{""title"":""Most Viewed Submissions All Time mos...",?,?,0.717277,0.291667,0.182292,0.000000,0.000000,...,0,19,0,2876,192,4,0.177083,0.117647,,testing
10564,http://lifehacker.com/5839197/how-to-get-a-ful...,3176,"{""title"":""How to Get a Complete Workout with N...",sports,0.424304,0.940000,0.183333,0.066667,0.016667,0.016667,...,1,3,0,21029,180,12,0.333333,0.111966,,testing


In [4]:
info_columns(dataAll)

Dimensional: 10566 rows, 28 columns
Total NA values: 3171
                           Column Name Data type #Distinct      #NaN
                                   url    object     10566         0
                                 urlid     int64     10566         0
                           boilerplate    object     10565         0
                      alchemy_category    object        14         0
                alchemy_category_score    object      6776         0
                           avglinksize   float64      7634         0
                     commonlinkratio_1   float64      5731         0
                     commonlinkratio_2   float64      5119         0
                     commonlinkratio_3   float64      4118         0
                     commonlinkratio_4   float64      3347         0
                     compression_ratio   float64      9085         0
                           embed_ratio   float64       485         0
                            framebased     in

In [5]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def new_link_pre(df_train):
    linktrain1 = set(df_train[df_train['label'] == 1]['linkwordscore'].values)
    linktrain0 = set(df_train[df_train['label'] == 0]['linkwordscore'].values)
    link = list(linktrain1.intersection(linktrain0))
    
    val_count1 = []
    val_count0 = []
    for x in link:
        ah = df_train[df_train['linkwordscore'] == x]['label'].value_counts()
        val_count0.append(ah[0])
        val_count1.append(ah[1])
        
    for x in list(linktrain0 - linktrain1):
        link.append(x)
        val_count0.append(df_train[df_train['linkwordscore'] == x]['label'].value_counts()[0])
        val_count1.append(0)
    for x in list(linktrain1 - linktrain0):
        link.append(x)
        val_count1.append(df_train[df_train['linkwordscore'] == x]['label'].value_counts()[1])
        val_count0.append(0)
        
    rel = []
    for x in range(len(link)):
        prob_1 = val_count1[x] / (val_count0[x] + val_count1[x])
        if prob_1 <= 0.4:
            rel.append(sigmoid(-prob_1 - 2*(1-prob_1)))
            
        elif prob_1 >= 0.6:
            prob_1 = val_count1[x] / (val_count0[x] + val_count1[x]) 
            rel.append(sigmoid(prob_1*2 + (1-prob_1)))
            
        else:
            prob_1 = val_count1[x] / (val_count0[x] + val_count1[x])
            rel.append(sigmoid(2*(prob_1 - (1-prob_1))))
    
    linkpd = pd.DataFrame({'linkscore': link, 'label = 1': val_count1, 'label = 0': val_count0, 'result_prob': rel})

    relsprob = []
    for x in df_train['linkwordscore'].values:
        relsprob.append(linkpd[linkpd['linkscore'] == x]['result_prob'].values[0])
    
    return relsprob, linkpd

In [6]:
def lld_nol_pre(df_train):
    linktrain1 = set(df_train[df_train['lengthyLinkDomain'] == 1]['numberOfLinks'].values)
    linktrain0 = set(df_train[df_train['lengthyLinkDomain'] == 0]['numberOfLinks'].values)
    link = list(linktrain1.intersection(linktrain0))
    
    val_count1 = []
    val_count0 = []
    for x in link:
        ah = df_train[df_train['numberOfLinks'] == x]['lengthyLinkDomain'].value_counts()
        val_count0.append(ah[0])
        val_count1.append(ah[1])
        
    for x in list(linktrain0 - linktrain1):
        link.append(x)
        val_count0.append(df_train[df_train['numberOfLinks'] == x]['lengthyLinkDomain'].value_counts()[0])
        val_count1.append(0)
    for x in list(linktrain1 - linktrain0):
        link.append(x)
        val_count1.append(df_train[df_train['numberOfLinks'] == x]['lengthyLinkDomain'].value_counts()[1])
        val_count0.append(0)
        
    rel = []
    for x in range(len(link)):
        prob_1 = val_count1[x] / (val_count0[x] + val_count1[x])
        if prob_1 <= 0.4:
            rel.append(sigmoid(-prob_1 - 2*(1-prob_1)))
            
        elif prob_1 >= 0.6:
            prob_1 = val_count1[x] / (val_count0[x] + val_count1[x]) 
            rel.append(sigmoid(prob_1*2 + (1-prob_1)))
            
        else:
            prob_1 = val_count1[x] / (val_count0[x] + val_count1[x]) 
            rel.append(sigmoid(2*(prob_1 - (1-prob_1))))
    
    linkpd = pd.DataFrame({'numberOfLinks': link, 'lengthyLinkDomain = 1': val_count1, 'lengthyLinkDomain = 0': val_count0, 'result_prob': rel})

    relsprob = []
    for x in df_train['numberOfLinks'].values:
        relsprob.append(linkpd[linkpd['numberOfLinks'] == x]['result_prob'].values[0])
    
    return relsprob

In [7]:
from urllib.parse import urlparse

def getUrlDomain(url):
    parsed = urlparse(url)
    domain = re.sub('^www\.', '', parsed.netloc).split('.')
    result = domain[len(domain)-1]
    return result

def getName(url):
    parsed = urlparse(url)
    domain = re.sub('^www\.', '', parsed.netloc).split('.')
    result = domain[0]
    return result

def countPaths(url):
    parsed = urlparse(url)
    domain = re.findall('\/',parsed.path)
    return len(domain)

In [9]:
dataAll['url'].head(20)

0     http://www.bloomberg.com/news/2010-12-23/ibm-p...
1     http://www.popsci.com/technology/article/2012-...
2     http://www.menshealth.com/health/flu-fighting-...
3     http://www.dumblittleman.com/2007/12/10-foolpr...
4     http://bleacherreport.com/articles/1205138-the...
5     http://www.conveniencemedical.com/genital-herp...
6     http://gofashionlane.blogspot.tw/2012/06/ameri...
7     http://www.insidershealth.com/article/racing_f...
8     http://www.valetmag.com/the-handbook/features/...
9     http://www.howsweeteats.com/2010/03/24/cookies...
10                              http://www.reuters.com/
11    http://www.midwestsportsfans.com/2010/12/photo...
12    http://www.ivillage.com/our-dirty-mouths/4-b-1...
13    http://www.thedailygreen.com/print-this/health...
14    http://www.phillyburbs.com/blogs/type-a-kitche...
15    http://sportsillustrated.cnn.com/2012_swimsuit...
16    http://theawesomer.com/liquid-mountaineering/3...
17    http://www.guardian.co.uk/fashion/gallery/

In [18]:
shape = dataAll['url'].shape[0]
domain_1st, domain_2nd, domain_3rd = ['' for x in range(shape)], ['' for x in range(shape)], ['' for x in range(shape)]
domain_4th, domain_5th = ['' for x in range(shape)], ['' for x in range(shape)]
websitetype, websitetype2 = ['' for x in range(shape)], ['' for x in range(shape)]
for i, url in enumerate(dataAll['url']):
    parsed = urlparse(url)
    
    domainname = parsed.netloc.lower().split('.')
    if len(domainname) == 2: 
        domain_2nd[i] = domainname[0]
        domain_1st[i] = domainname[1]
    if len(domainname) == 3:
        domain_3rd[i], domain_2nd[i] = domainname[0], domainname[1]
        domain_1st[i] = domainname[2]
    if len(domainname) == 4:
        domain_4th[i], domain_3rd[i] = domainname[0], domainname[1]
        domain_2nd[i], domain_1st[i] = domainname[2], domainname[3]
    if len(domainname) == 5:
        domain_5th[i], domain_4th[i], domain_3rd[i] = domainname[0], domainname[1], domainname[2]
        domain_2nd[i], domain_1st[i] = domainname[3], domainname[4]
    
    pathresub = parsed.path
    punctuation = ['/', '-', '_', '\.', ':']
    for punc in punctuation:
        pathresub = re.sub(punc, ' ', pathresub)
    pathresub = pathresub.lower().split(' ')
    if len(pathresub) == 2:
        websitetype[i] = pathresub[1]
    else:
        websitetype[i], websitetype2[i] = pathresub[1], pathresub[2]

In [19]:
dataAll['news_front_page']

0        0
1        0
2        0
3        0
4        0
        ..
10561    0
10562    0
10563    0
10564    0
10565    0
Name: news_front_page, Length: 10566, dtype: object

In [28]:
dataAll['news_front_page'] = pd.to_numeric(dataAll['news_front_page'], errors= 'coerce')
dataAll['news_front_page'].fillna(-1, inplace= True)

In [33]:
(dataAll[dataAll['news_front_page'] == 0]['label'] == 1).sum()

3039

In [44]:
x = dataAll[dataAll['news_front_page'] == 0]['url']
y = dataAll[dataAll['news_front_page'] == 0]['label']
z = pd.DataFrame({'x':x, 'y':y})

In [45]:
z[y==1]['x']

1       http://www.popsci.com/technology/article/2012-...
2       http://www.menshealth.com/health/flu-fighting-...
3       http://www.dumblittleman.com/2007/12/10-foolpr...
6       http://gofashionlane.blogspot.tw/2012/06/ameri...
8       http://www.valetmag.com/the-handbook/features/...
                              ...                        
7382    http://www.allthingscupcake.com/2007/12/05/mou...
7383                    http://randomrr.com/80s-kids.html
7386    http://slice.seriouseats.com/archives/2010/06/...
7387    http://achicdirection.com/2012/01/23/vintage-f...
7392    http://eatthis.menshealth.com/slide/sweet-pota...
Name: x, Length: 3039, dtype: object

In [46]:
z[y==0]['x']

0       http://www.bloomberg.com/news/2010-12-23/ibm-p...
4       http://bleacherreport.com/articles/1205138-the...
10                                http://www.reuters.com/
11      http://www.midwestsportsfans.com/2010/12/photo...
14      http://www.phillyburbs.com/blogs/type-a-kitche...
                              ...                        
7377    http://www.forbes.com/sites/greatspeculations/...
7381                         http://www.caravanstyle.com/
7388    http://www.villagevoice.com/bestof/2009/award/...
7390    http://techcrunch.com/2010/09/08/kno-raises-46...
7391    http://www.uncoached.com/category/why-i-miss-c...
Name: x, Length: 2814, dtype: object

In [36]:
(dataAll[dataAll['news_front_page'] == 0]['label'] == 0).sum()

2814

In [34]:
(dataAll[dataAll['news_front_page'] == 1]['label'] == 1).sum()

138

In [37]:
(dataAll[dataAll['news_front_page'] == 1]['label'] == 0).sum()

156

In [35]:
(dataAll[dataAll['news_front_page'] == -1]['label'] == 1).sum()

619

In [38]:
(dataAll[dataAll['news_front_page'] == -1]['label'] == 0).sum()

629

In [5]:
def fill_new_front_page(data_):
    dic = {x: 0 for x in data_['website_type'].values}
    for x in dic.keys():
        if (data_[data_['news_front_page'] == 1]['website_type'].values == x).sum() > (data_[data_['news_front_page'] == 0]['website_type'].values == x).sum():
            dic[x] = 1
        elif (data_[data_['news_front_page'] == 1]['website_type'].values == x).sum() == (data_[data_['news_front_page'] == 0]['website_type'].values == x).sum():
            dic[x] = np.random.randint(2)
        else:
            dic[x] = 0
    
    for i in range(data_.shape[0]):
        if data_[i:i+1]['news_front_page'].values == -1:
            data_[i:i+1]['news_front_page'].replace(-1, dic.get(data_[i:i+1]['website_type'].values[0]), inplace= True)
    
    return data_

In [20]:
tes = pd.DataFrame({'domain_1st': domain_1st, 'domain_2nd':domain_2nd, 'domain_3rd':domain_3rd,
                   'domain_4th':domain_4th, 'domain_5th':domain_5th})
info_columns(tes)

Dimensional: 10566 rows, 5 columns
Total NA values: 0
                           Column Name Data type #Distinct      #NaN
                            domain_1st    object        54         0
                            domain_2nd    object      3733         0
                            domain_3rd    object       639         0
                            domain_4th    object        37         0
                            domain_5th    object         2         0


In [24]:
tes[tes['domain_4th'] != ''].head(60)

Unnamed: 0,domain_1st,domain_2nd,domain_3rd,domain_4th,domain_5th
17,uk,co,guardian,www,
21,uk,co,bbc,www,
36,com,nytimes,blogs,bitten,
40,uk,co,second-opinions,www,
48,uk,co,dailymail,www,
63,uk,co,independent,www,
66,uk,co,newsnow,www,
90,com,msn,ca,lifestyle,
95,uk,co,bbc,www,
115,uk,co,bbc,news,


In [15]:
dataAll['url'][1320]

'http://www.chatelaine.com/en/article/32328--four-ways-to-stop-a-cold-or-flu-in-its-tracks'

In [23]:
dataAll['url'][48
              ]

'http://www.dailymail.co.uk/news/article-2007972/300lbs-American-footballer-Danous-Estenor-lifts-car-save-life-trapped-truck-driver.html?ITO=1490'

In [21]:
parsed = urlparse('http://bitten.blogs.nytimes.com/2008/05/06/microwave-popcorn-minus-the-ripoff/')
domain = re.sub('^www\.', '', parsed.netloc).split('.')
parsed, domain

(ParseResult(scheme='http', netloc='bitten.blogs.nytimes.com', path='/2008/05/06/microwave-popcorn-minus-the-ripoff/', params='', query='', fragment=''),
 ['bitten', 'blogs', 'nytimes', 'com'])

In [25]:
parsed.netloc.split('.')

['bitten', 'blogs', 'nytimes', 'com']

In [15]:
re.sub('', '',parsed.path).split('.')

['20110606grilled-peaches']

In [52]:
parsed = urlparse('http://bitten.blogs.nytimes.com/')
parsed.path.split('/')

['', '']

In [84]:
shape = dataAll['url'].shape[0]
websitetype, websitetype2 = [-1 for x in range(shape)], [-1 for x in range(shape)]
for i, url in enumerate(dataAll['url']):
    parsed = urlparse(url)
    pathresub = parsed.path
    punctuation = ['/', '-', '_', '\.', ':']
    for punc in punctuation:
        pathresub = re.sub(punc, ' ', pathresub)
        
    pathresub = pathresub.lower().split(' ')
    if len(pathresub) == 2:
        websitetype[i] = pathresub[1]
    else:
        websitetype[i], websitetype2[i] = pathresub[1], pathresub[2]


In [108]:
shape = dataAll['url'].shape[0]
subdomain, webname = ['' for x in range(shape)], ['' for x in range(shape)]
botdomain, topdomain = ['' for x in range(shape)], ['' for x in range(shape)]
websitetype, websitetype2 = ['' for x in range(shape)], ['' for x in range(shape)]
for i, url in enumerate(dataAll['url']):
    parsed = urlparse(url)
    
    domainname = parsed.netloc.lower().split('.')
    if len(domainname) == 2: 
        webname[i] = domainname[0]
        topdomain[i] = domainname[1]
    if len(domainname) == 3:
        subdomain[i], webname[i] = domainname[0], domainname[1]
        topdomain[i] = domainname[2]
    if len(domainname) == 4:
        subdomain[i], webname[i] = domainname[0], domainname[1]
        botdomain[i], topdomain[i] = domainname[2], domainname[3]
    if len(domainname) == 5:
        subdomain[i], webname[i] = domainname[0], domainname[1]
        botdomain[i], topdomain[i] = domainname[3], domainname[4]
    
    pathresub = parsed.path
    punctuation = ['/', '-', '_', '\.', ':']
    for punc in punctuation:
        pathresub = re.sub(punc, ' ', pathresub)
    pathresub = pathresub.lower().split(' ')
    if len(pathresub) == 2:
        websitetype[i] = pathresub[1]
    else:
        websitetype[i], websitetype2[i] = pathresub[1], pathresub[2]

In [109]:
websitetype2

['2010',
 'article',
 'flu',
 '12',
 '1205138',
 'herpes',
 '06',
 'racing',
 'handbook',
 '03',
 '',
 '12',
 'dirty',
 'this',
 'type',
 'swimsuit',
 'mountaineering',
 'gallery',
 '06',
 'degrunge',
 '12',
 'recipes',
 'php',
 '08',
 'kitchen',
 '10',
 '',
 '2008',
 'supermodels',
 'swimsuit',
 'bagels',
 'wheat',
 '01',
 'asian',
 'holidays',
 '',
 '05',
 '1806201',
 'vegetarian',
 'health',
 'html',
 '',
 'nail',
 'cholesterol',
 'swimsuit',
 'news',
 '',
 'apple',
 'article',
 'hubfc',
 'tunk',
 'recipes',
 '',
 '02',
 'health',
 '03',
 '03',
 'betting',
 '',
 '',
 'cups',
 'afternoon',
 '20',
 'style',
 'guide',
 '10',
 '',
 '01',
 '1163450',
 'recipe',
 '149193',
 'and',
 'disorders',
 'clean',
 '03',
 '2006',
 '07',
 'kitchen',
 '',
 'fitness',
 '',
 'cheese',
 '07',
 '02',
 'news',
 '',
 'show',
 '10',
 '',
 'foodie',
 'fitness',
 'fitness',
 '',
 '2008',
 'stuff',
 'recipes',
 '01',
 '',
 '2007',
 '1248591',
 '1479',
 'rosh',
 'combo',
 'common',
 '06',
 'chicken',
 'on',
 'h

In [111]:
tes = ['websitetype', 'websitetype2', 'subdomain', 'webname']
for i, name in enumerate(websitetype2):
    if '<' in name:
        print(name)

In [10]:
def featurizes(data_):
    text = data_['url'].str.split('//',n = -1, expand = True)[1]

    text2 = []
    text3 = []
    for x in text:
        spli = x.split('/')
        text2.append(spli[0])
        text3.append(x.split(spli[0] + '/')[1])

    subdomain = []
    for x in text2:
        spli = x.split('.')
        if len(spli) == 2:
            subdomain.append('')
        if len(spli) == 3:
            subdomain.append(spli[0])
        if len(spli) == 4:
            subdomain.append(spli[0]) 
        if len(spli) > 4:
            subdomain.append(spli[0])
            
    domain = []
    webname = []
    path = []
    for url in data_['url'].values:
        domain.append(getUrlDomain(url))
        webname.append(getName(url))
        path.append(countPaths(url))
    
    text4 = []
    text5 = []
    for x in text3:
        spli = x.split('/')
        if len(spli) == 1:
            text4.append(spli[0])
            text5.append('')
        else:
            text4.append(spli[0])
            text5.append(spli[1])   

    punctuation = ['.', '?', '!', '=', '-', '_', '%', ':']
    for i in range(len(text4)):
        for j in punctuation:
            text4[i] = text4[i].replace(j, ' ')
            text5[i] = text5[i].replace(j, ' ')

    website_type = []
    website_type2 = [-1 for i in range(len(text5))]
    for i, x in enumerate(text4):
        spli = x.split(' ')
        if len(spli) == 1:
            website_type.append(spli[0])
        else:
            website_type.append(spli[0])
            website_type2[i] = spli[1]

    for i, x in enumerate(text5):
        if x == '':
            website_type2[i] = ''
        if website_type2[i] == -1:
            website_type2[i] = x.split(' ')[0]  
    
#     rels_prob, rels_quan = new_link_pre(data_)
    
    # Features from URL
    data_['subdomain'] = subdomain
    data_['webname'] = webname
    data_['domain'] = domain
    data_['countPath'] = path
    data_['website_type'] = website_type
    data_['website_type'] = data_['website_type'].replace({'2006':'YEAR', '2007':'YEAR', '2008':'YEAR',
                                '2009':'YEAR','2010':'YEAR', '2011':'YEAR', '2012':'YEAR','2013':'YEAR',
                            '01':'MONTH', '02':'MONTH','03':'MONTH','04':'MONTH','05':'MONTH','06':'MONTH',
                            '07':'MONTH','08':'MONTH','09':'MONTH','10':'MONTH','11':'MONTH','12':'MONTH'})
    
    data_['website_type2'] = website_type2
    data_['website_type2'] = data_['website_type2'].replace({'2006':'YEAR', '2007':'YEAR', '2008':'YEAR', 
                              '2009':'YEAR', '2010':'YEAR','2011':'YEAR', '2012':'YEAR','2013':'YEAR',
                            '01':'MONTH', '02':'MONTH','03':'MONTH','04':'MONTH','05':'MONTH','06':'MONTH',
                            '07':'MONTH','08':'MONTH','09':'MONTH','10':'MONTH','11':'MONTH','12':'MONTH',})
    
#     rels_prob = new_link_pre(data_)
#     data_['linkscorepredprob'] = rels_prob
#     data_['linkscorepredquan'] = rels_quan

    data_['wordInHLT'] = data_['non_markup_alphanum_characters'] * data_['linkwordscore'] / 100
    data_['wordError'] = data_['non_markup_alphanum_characters'] * data_['spelling_errors_ratio']
    
    lld = lld_nol_pre(data_)
    data_['lld_nol_prob'] = lld
    
    # Other Features
    data_['alchemy_category_score'] = pd.to_numeric(data_['alchemy_category_score'], errors= 'coerce')
    data_['alchemy_category_score'].fillna(data_['alchemy_category_score'].mean(), inplace= True)
    
#     data_['is_news'] = pd.to_numeric(data_['is_news'], errors= 'coerce')
#     data_['is_news'].fillna(0, inplace= True)
    
    data_['news_front_page'] = pd.to_numeric(data_['news_front_page'], errors= 'coerce')
    data_['news_front_page'].fillna(-1, inplace= True)
    
    data_.drop(columns= ['framebased','url', 'urlid', 'boilerplate', 'is_news'], inplace= True)
    
    # Encoder
    le = LabelEncoder()
    fea = ['alchemy_category', 'subdomain', 'webname', 'domain', 'website_type', 'website_type2']
    for x in fea:
        le.fit(data_[x])
        data_[x] = le.transform(data_[x])
    
    data_ = fill_new_front_page(data_)
    return data_

In [11]:
newData_train, newData_test = dataTrain.copy(), dataTest.copy()
newData_train = featurizes(newData_train)
newData_test = featurizes(newData_test)

In [12]:
rels_prob, linkpd = new_link_pre(newData_train)
newData_train['linkscorepredprob'] = rels_prob

prob = []
for x in newData_test['linkwordscore'].values:
    prob.append(linkpd[linkpd['linkscore'] == x]['result_prob'].values[0])
    
newData_test['linkscorepredprob'] = prob

In [13]:
info_columns(newData_train)

Dimensional: 7395 rows, 32 columns
Total NA values: 0
                           Column Name Data type #Distinct      #NaN
                      alchemy_category     int64        14         0
                alchemy_category_score   float64      4806         0
                           avglinksize   float64      5710         0
                     commonlinkratio_1   float64      4476         0
                     commonlinkratio_2   float64      4038         0
                     commonlinkratio_3   float64      3266         0
                     commonlinkratio_4   float64      2695         0
                     compression_ratio   float64      6453         0
                           embed_ratio   float64       366         0
                         frameTagRatio   float64      5911         0
                         hasDomainLink     int64         2         0
                            html_ratio   float64      7376         0
                           image_ratio   float64 

In [14]:
info_columns(newData_test)

Dimensional: 3171 rows, 31 columns
Total NA values: 0
                           Column Name Data type #Distinct      #NaN
                      alchemy_category     int64        12         0
                alchemy_category_score   float64      2036         0
                           avglinksize   float64      2656         0
                     commonlinkratio_1   float64      2228         0
                     commonlinkratio_2   float64      2040         0
                     commonlinkratio_3   float64      1725         0
                     commonlinkratio_4   float64      1381         0
                     compression_ratio   float64      2798         0
                           embed_ratio   float64       172         0
                         frameTagRatio   float64      2784         0
                         hasDomainLink     int64         2         0
                            html_ratio   float64      3165         0
                           image_ratio   float64 

In [65]:
def getColumnsFromBoilerplate(data_):
    boilerplate = data_['boilerplate']

    dicts = {
        'title' : [],
        'body' : [],
        'url' : []
    }
    
    for i in range(len(boilerplate)):
        temp = json.loads(boilerplate[i])
        for j in dicts.keys():
            dicts[j].append(temp.get(j, None))

    boilerplate_df = pd.DataFrame()
    boilerplate_df['title'] = dicts['title']
    boilerplate_df['body'] = dicts['body']
    
    return boilerplate_df

In [66]:
boilerplateTrain_df = getColumnsFromBoilerplate(dataTrain)
boilerplateTest_df = getColumnsFromBoilerplate(dataTest)

In [67]:
info_columns(boilerplateTrain_df)

Dimensional: 7395 rows, 2 columns
Total NA values: 70
                           Column Name Data type #Distinct      #NaN
                                 title    object      6834        13
                                  body    object      6616        57


In [68]:
info_columns(boilerplateTest_df)

Dimensional: 3171 rows, 2 columns
Total NA values: 25
                           Column Name Data type #Distinct      #NaN
                                 title    object      2951         5
                                  body    object      2833        20


In [71]:
def fillNanAndNone(data_):
    fea = ['title', 'body']
    for name in fea:
        data_[name].fillna(" ", inplace= True)
    return data_

boilerplateTrain_df = fillNanAndNone(boilerplateTrain_df)
boilerplateTest_df = fillNanAndNone(boilerplateTest_df)

In [72]:
info_columns(boilerplateTrain_df)

Dimensional: 7395 rows, 2 columns
Total NA values: 0
                           Column Name Data type #Distinct      #NaN
                                 title    object      6834         0
                                  body    object      6616         0


In [73]:
info_columns(boilerplateTest_df)

Dimensional: 3171 rows, 2 columns
Total NA values: 0
                           Column Name Data type #Distinct      #NaN
                                 title    object      2951         0
                                  body    object      2833         0


In [92]:
stop_words = stopwords.words('english')
stop_word_add = ['becau','abov', 'ani', 'becaus', 'befor', 'doe', 'dure', 'ha', 'hi', 'onc', 'onli', 'ourselv', 'themselv', 'thi', 'veri', 'wa', 'whi', 'yourselv']
for i in stop_word_add:
    stop_words.append(i)
    
def preprocessor(text):
    text = re.sub('[\W0-9]+', ' ', text.lower())
    return text

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

def tfidf(data_train, data_test):
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', sublinear_tf=True,
                token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf= True, smooth_idf= True, binary=False,
                stop_words= stop_words, tokenizer= tokenizer_porter, preprocessor= preprocessor)
#     tfv = TfidfVectorizer(stop_words= stop_words, tokenizer= tokenizer_porter, preprocessor= preprocessor)
    tfv.fit(data_train['body'])
    tfv.fit(data_test['body'])
    bodyVectTrain = tfv.transform(data_train['body'])
    bodyVectTest = tfv.transform(data_test['body'])
    
    tfv.fit(data_train['title'])
    tfv.fit(data_test['title'])
    titleVectTrain = tfv.transform(data_train['title'])
    titleVectTest = tfv.transform(data_test['title'])
    
    return bodyVectTrain, bodyVectTest, titleVectTrain, titleVectTest

In [93]:
bodyVectTrain, bodyVectTest, titleVectTrain, titleVectTest = tfidf(boilerplateTrain_df, boilerplateTest_df)

In [76]:
def SVD(bodyVect, titleVect):
    titleDecomposed = TruncatedSVD(random_state = 123, n_components = 10).fit_transform(titleVect)
    bodyDecomposed = TruncatedSVD(random_state = 123, n_components = 50).fit_transform(bodyVect)    
    textDecomposed = np.hstack([titleDecomposed,bodyDecomposed])
    return textDecomposed

In [94]:
textTrainDecomposed = SVD(bodyVectTrain, titleVectTrain)
textTestDecomposed = SVD(bodyVectTest, titleVectTest)

In [95]:
y_train = dataTrain.label
newData_train.drop(columns= ['label'], inplace= True)
X_train = np.hstack([np.array(newData_train), textTrainDecomposed])
X_test = np.hstack([np.array(newData_test), textTestDecomposed])

X_train.shape, y_train.shape, X_test.shape

((7395, 91), (7395,), (3171, 91))

In [79]:
base_classifiers = [
                    ('Random Forest', RandomForestClassifier(n_estimators=1000, n_jobs= 4, min_samples_leaf= 2, bootstrap= True, min_samples_split= 5, max_features= .1, max_samples= 30)),
                    ('AdaBoost Classifier', AdaBoostClassifier()),
                    ('Gradient Boosting Classifier', GradientBoostingClassifier(n_estimators= 300, learning_rate= 0.08, max_features= 4, min_samples_leaf= 2, min_samples_split= 5, max_depth= 7)),
                    ('Extra Trees Classifier', ExtraTreesClassifier()),
                    ('LightGBM', LGBMClassifier(n_estimators= 300, learning_rate= 0.08, num_leaves= 50,boosting_type= 'gbdt', objective= 'binary', sub_feature= 0.5, max_depth= 7)),
                    ('XGBM', XGBClassifier())]

# ('KNN', KNeighborsClassifier()),
# ('Naive Bayes', GaussianNB()),
# ('Decision Tree', DecisionTreeClassifier()),

In [54]:
def voting_prob(lis):
    return np.mean(lis)

In [96]:
pred_prob = []
for x in range(3):
    clf = StackingClassifier(estimators= base_classifiers, final_estimator= LogisticRegression(random_state= 102))
    clf.fit(X_train.astype(np.float), y_train)
    pred_prob.append(clf.predict_proba(X_test.astype(np.float))[:, 1])



In [97]:
lis = [[] for i in range(dataTest.shape[0])]
for i in range(dataTest.shape[0]):
    for j in range(len(pred_prob)):
            lis[i].append(pred_prob[j][i])
            
rel = []
for i in range(dataTest.shape[0]):
    rel.append(voting_prob(lis[i]))

In [98]:
rel

[0.7858079618435418,
 0.5480799043076902,
 0.37053804734853535,
 0.81230268800668,
 0.7678207126593172,
 0.8227732121994903,
 0.7360314937831011,
 0.6502747071915529,
 0.7096401072046428,
 0.5452134958018946,
 0.5483212627091808,
 0.4609657495265255,
 0.6930605814871167,
 0.5382285372628024,
 0.8040397688276837,
 0.742836061266216,
 0.5875139077028276,
 0.7917041167085719,
 0.8051393171891664,
 0.7232876327035909,
 0.5940635598918864,
 0.7191397180171686,
 0.6520093019263342,
 0.849367059111696,
 0.6280446366312061,
 0.5802872858270425,
 0.8175198945710099,
 0.7085476678078884,
 0.7629049874949446,
 0.8219473045835247,
 0.6899605490564769,
 0.8008914531897351,
 0.5876861047094505,
 0.6908083250493625,
 0.7885723697327434,
 0.7455125541657822,
 0.8126118779833508,
 0.8995459148901249,
 0.7342824296874331,
 0.6536356510023547,
 0.6352893811798116,
 0.7380212637441667,
 0.32666066603937466,
 0.7580280522072801,
 0.8041013897825963,
 0.8694886003245094,
 0.8143518561392774,
 0.832750121069

In [99]:
xx = pd.DataFrame({'urlid': dataTest['urlid'],'label': rel})
compression_opts = dict(method='zip',
                        archive_name='out123.csv')  
xx.to_csv('he.zip', index=False,
          compression=compression_opts)  