In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from random import shuffle
import zipfile
from sklearn.model_selection import train_test_split
import pickle

pd.set_option('display.max_colwidth', -1)

In [2]:
train=pd.read_csv('train.csv')

In [3]:
def preprocess(url,title):
    
    pattern=r'//.*'
    urls=[]
    for i in url:
        b=re.findall(pattern,i)
        c=b[0][2:]
        d=re.split('\.|/|-|_',c)
        e=''
        for i in d:
            e+=str(i).lower()
            e+=' '
        e = ''.join([i for i in e if not i.isdigit()])
        f=''
        for i in e:
            if i.isalpha() or i==' ':
                f+=i
            else:
                f+=' '
        urls.append(f)
        
    titles=[]
    for e in title:    
        e = ''.join([i for i in e if not i.isdigit()])
        f=''
        for i in e:
            if i.isalpha() or i==' ':
                #print(i)
                f+=i
            else:
                f+=' '
        f = re.sub(' +',' ',f) # replace series of spaces with single space
        titles.append(f)
    
    data=[]
    for i in range(len(title)):
        s=titles[i]+' '+urls[i]
        data.append(str(s)
                    
    return(data)

In [4]:
def preprocess_title():
    pattern=r'<title>.*</title>'

    zf = zipfile.ZipFile('train.zip') 
    df = pd.read_csv(zf.open('html_data.csv'),chunksize=1)

    title=[]
    chunksize = 1
    for chunk in pd.read_csv(zf.open('html_data.csv'), chunksize=chunksize):
        a=chunk.Html
        idd=chunk.Webpage_id
        c=re.findall(pattern,a.iloc[0])
        if len(c)!=0:
            title.append([idd.iloc[0],c[0][7:-8]])
        else:
            title.append([idd.iloc[0],' '])
    return(title)

In [5]:
%%time
title=preprocess_title()

Wall time: 12min 47s


In [7]:
title_data=pd.DataFrame(title)
title_data.columns=['Webpage_id','Html']

new_data=pd.merge(train,title_data,on=['Webpage_id'],how='left')
new_data.to_csv('processed_train_data.csv',index=False)

In [5]:
data=pd.read_csv('processed_train_data.csv',lineterminator='\n')

In [6]:
data.head()

Unnamed: 0,Webpage_id,Domain,Url,Tag,Html
0,1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfidera-gilenya-and-aubagio-s-3-way-battle-for-ms-share-about-to-get-more-interesting,news,"Tecfidera, Gilenya and Aubagio&#039;s 3-way battle for MS share is about to heat up | FiercePharma\r"
1,2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipped-to-weather-storm-u-s-diabetes-market-ceo-says,news,"Novo equipped to weather the storm in the U.S. diabetes market, CEO says | FiercePharma\r"
2,3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exec-departs-troubled-endo-and-time-it-s-for-another-drugmaker,news,"Another exec departs troubled Endo--and this time, it&#039;s for another drugmaker | FiercePharma\r"
3,4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-biosim-specialist-celltrion-it-wouldn-t-say-no,news,Would Teva buy Korea&#039;s Celltrion to beef up in biosimilars? It wouldn&#039;t say no | FiercePharma\r
4,5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-marissa-tomei-partners-allergan-restasis-to-drive-dry-eye-awareness,news,Restasis-maker Allergan recruits actress Marisa Tomei to drive dry eye awareness | FiercePharma\r


In [7]:
data.columns

Index(['Webpage_id', 'Domain', 'Url', 'Tag', 'Html\r'], dtype='object')

In [8]:
def train_val_split(train):
    grp=train.groupby('Tag')
    
    test_set=[]
    train_set=[]
    test_tag_set=[]
    train_tag_set=[]
    count=0
    for unique_tags in train.Tag.unique():
        tag=grp.get_group(unique_tags)
        s=set(tag.Domain)
        count+=len(s)
        a=list(s)
        #shuffle(a)
        for i in range(len(a)):
            if (i+1)%3==0:
                test_set.append(a[i])
                test_tag_set.append(unique_tags)
            else:
                train_set.append(a[i])
                train_tag_set.append(unique_tags)
    
    
    train_domain=pd.DataFrame(train_set)
    train_domain.columns=['Domain']
    train_domain['Tag']=train_tag_set
    train_data=pd.merge(train_domain,train,on=['Domain','Tag'],how='left')
    
    val_domain=pd.DataFrame(test_set)
    val_domain.columns=['Domain']
    val_domain['Tag']=test_tag_set
    val_data=pd.merge(val_domain,train,on=['Domain','Tag'],how='left')
    
    return(train_data,val_data)
    

In [9]:
train_data,val_data = train_val_split(data)

In [10]:
train_urls=preprocess(train_data.Url,train_data['Html\r'])
train_target=train_data.Tag.values

val_urls = preprocess(val_data.Url,val_data['Html\r'])
val_target = val_data.Tag.values

# SVM

In [41]:
from sklearn.linear_model import SGDClassifier

In [45]:
%%time
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=0.0001, n_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(train_urls, train_target)



Wall time: 1.61 s


In [46]:
pred_train = text_clf_svm.predict(train_urls)
print('train_score:',np.mean(pred_train == train_target))

train_score: 0.9683721720417929


In [47]:
pred_val = text_clf_svm.predict(val_urls)
print(np.mean(pred_val == val_target))

0.6977937120794264


In [29]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(urls, target)


gs_clf_svm.best_score_
gs_clf_svm.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}