In [48]:
import pandas as pd
from json import loads
import spacy

In [52]:
nlp = spacy.load('en_core_web_trf')

In [66]:
data = pd.read_csv('train.tsv', sep='\t', header=0)
boilerplate = pd.DataFrame([loads(v) for v in data['boilerplate'].to_dict().values()])
data = data.drop('boilerplate', axis=True)
data = pd.concat([boilerplate, data], axis=1)

In [33]:
pd.set_option('max_colwidth', 400)

In [38]:
data.columns

Index(['title', 'body', 'url', 'related', 'url', 'urlid', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio', 'label'],
      dtype='object')

In [34]:
info="""FieldName	Type	Description
url	string	Url of the webpage to be classified
urlid	integer	StumbleUpon's unique identifier for each url
boilerplate	json	Boilerplate text
alchemy_category	string	Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score	double	Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize	double	Average number of words in each link
commonLinkRatio_1	double	# of links sharing at least 1 word with 1 other links / # of links
commonLinkRatio_2	double	# of links sharing at least 1 word with 2 other links / # of links
commonLinkRatio_3	double	# of links sharing at least 1 word with 3 other links / # of links
commonLinkRatio_4	double	# of links sharing at least 1 word with 4 other links / # of links
compression_ratio	double	Compression achieved on this page via gzip (measure of redundancy)
embed_ratio	double	Count of number of <embed>  usage
frameBased	integer (0 or 1)	A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio	double	Ratio of iframe markups over total number of markups
hasDomainLink	integer (0 or 1)	True (1) if it contains an <a>  with an url with domain
html_ratio	double	Ratio of tags vs text in the page
image_ratio	double	Ratio of <img> tags vs text in the page
is_news	integer (0 or 1)	True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain	integer (0 or 1)	True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore	double	Percentage of words on the page that are in hyperlink's text
news_front_page	integer (0 or 1)	True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters	integer	Page's text's number of alphanumeric characters
numberOfLinks	integer	Number of <a>  markups
numwords_in_url	double	Number of words in url
parametrizedLinkRatio	double	A link is parametrized if it's url contains parameters  or has an attached onClick event
spelling_errors_ratio	double	Ratio of words not found in wiki (considered to be a spelling mistake)
label	integer (0 or 1)	User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only"""
pd.DataFrame([line.split('\t') for line in info.splitlines()])

Unnamed: 0,0,1,2
0,FieldName,Type,Description
1,url,string,Url of the webpage to be classified
2,urlid,integer,StumbleUpon's unique identifier for each url
3,boilerplate,json,Boilerplate text
4,alchemy_category,string,Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
5,alchemy_category_score,double,Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
6,avglinksize,double,Average number of words in each link
7,commonLinkRatio_1,double,# of links sharing at least 1 word with 1 other links / # of links
8,commonLinkRatio_2,double,# of links sharing at least 1 word with 2 other links / # of links
9,commonLinkRatio_3,double,# of links sharing at least 1 word with 3 other links / # of links


In [35]:
boilerplate.head()

Unnamed: 0,title,body,url,related
0,"IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries",A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone ...,bloomberg news 2010 12 23 ibm predicts holographic calls air breathing batteries by 2015 html,
1,"The Fully Electronic Futuristic Starting Gun That Eliminates Advantages in Races the fully electronic, futuristic starting gun that eliminates advantages in races the fully electronic, futuristic starting gun that eliminates advantages in races",And that can be carried on a plane without the hassle too The Omega E Gun Starting Pistol Omega It s easy to take for granted just how insanely close some Olympic races are and how much the minutiae of it all can matter The perfect example is the traditional starting gun Seems easy You pull a trigger and the race starts Boom What people don t consider When a conventional gun goes off the sound...,popsci technology article 2012 07 electronic futuristic starting gun eliminates advantages races,
2,Fruits that Fight the Flu fruits that fight the flu | cold & flu | men's health,Apples The most popular source of antioxidants in our diet one apple has an antioxidant effect equivalent to 1 500 mg of vitamin C Apples are loaded with protective flavonoids which may prevent heart disease and cancer Next Papayas With 250 percent of the RDA of vitamin C a papaya can help kick a cold right out of your system The beta carotene and vitamins C and E in papayas reduce inflammatio...,menshealth health flu fighting fruits cm mmc Facebook Mens Health Content Health Fight Flu With Fruit,
3,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot of problems with sleep It took me very long to fall asleep I was easily awaken and I simply wasn t getting enough of rest at night I didn t want to take medication and this led me to learn several tips and tricks that really helped me to overcome my insomnia Some of these tips I try to follow regularly Don t worry about not getting enough sleep Tr...,dumblittleman 2007 12 10 foolproof tips for better sleep html,
4,The 50 Coolest Jerseys You Didn t Know Existed coolest jerseys you haven't seen,Jersey sales is a curious business Whether you re buying the stylish top to represent your favorite team player or color you re always missing out on better artwork With No 18 Colts jerseys continuing to flood the streets it s about time we educate the sports public about the real masterpieces that have yet to be embraced Forget importance or legacy these upcoming selections will be based sole...,bleacherreport articles 1205138 the 50 coolest jerseys you didnt know existed show_full,


In [39]:
data.head()

Unnamed: 0,title,body,url,related,url.1,urlid,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,"IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries",A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone ...,bloomberg news 2010 12 23 ibm predicts holographic calls air breathing batteries by 2015 html,,http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html,4042,business,0.789131,2.055556,0.676471,...,1,1,24,0,5424,170,8,0.152941,0.07913,0
1,"The Fully Electronic Futuristic Starting Gun That Eliminates Advantages in Races the fully electronic, futuristic starting gun that eliminates advantages in races the fully electronic, futuristic starting gun that eliminates advantages in races",And that can be carried on a plane without the hassle too The Omega E Gun Starting Pistol Omega It s easy to take for granted just how insanely close some Olympic races are and how much the minutiae of it all can matter The perfect example is the traditional starting gun Seems easy You pull a trigger and the race starts Boom What people don t consider When a conventional gun goes off the sound...,popsci technology article 2012 07 electronic futuristic starting gun eliminates advantages races,,http://www.popsci.com/technology/article/2012-07/electronic-futuristic-starting-gun-eliminates-advantages-races,8471,recreation,0.574147,3.677966,0.508021,...,1,1,40,0,4973,187,9,0.181818,0.125448,1
2,Fruits that Fight the Flu fruits that fight the flu | cold & flu | men's health,Apples The most popular source of antioxidants in our diet one apple has an antioxidant effect equivalent to 1 500 mg of vitamin C Apples are loaded with protective flavonoids which may prevent heart disease and cancer Next Papayas With 250 percent of the RDA of vitamin C a papaya can help kick a cold right out of your system The beta carotene and vitamins C and E in papayas reduce inflammatio...,menshealth health flu fighting fruits cm mmc Facebook Mens Health Content Health Fight Flu With Fruit,,http://www.menshealth.com/health/flu-fighting-fruits?cm_mmc=Facebook-_-MensHealth-_-Content-Health-_-FightFluWithFruit,1164,health,0.996526,2.382883,0.562016,...,1,1,55,0,2240,258,11,0.166667,0.057613,1
3,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot of problems with sleep It took me very long to fall asleep I was easily awaken and I simply wasn t getting enough of rest at night I didn t want to take medication and this led me to learn several tips and tricks that really helped me to overcome my insomnia Some of these tips I try to follow regularly Don t worry about not getting enough sleep Tr...,dumblittleman 2007 12 10 foolproof tips for better sleep html,,http://www.dumblittleman.com/2007/12/10-foolproof-tips-for-better-sleep.html,6684,health,0.801248,1.543103,0.4,...,1,0,24,0,2737,120,5,0.041667,0.100858,1
4,The 50 Coolest Jerseys You Didn t Know Existed coolest jerseys you haven't seen,Jersey sales is a curious business Whether you re buying the stylish top to represent your favorite team player or color you re always missing out on better artwork With No 18 Colts jerseys continuing to flood the streets it s about time we educate the sports public about the real masterpieces that have yet to be embraced Forget importance or legacy these upcoming selections will be based sole...,bleacherreport articles 1205138 the 50 coolest jerseys you didnt know existed show_full,,http://bleacherreport.com/articles/1205138-the-50-coolest-jerseys-you-didnt-know-existed?show_full=,9006,sports,0.719157,2.676471,0.5,...,1,1,14,0,12032,162,10,0.098765,0.082569,0


In [68]:
data.columns = ['title', 'body', 'url', 'related', 'href', 'urlid', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio', 'label']

In [71]:
data = data.drop(['related', 'href', 'urlid'],axis=1)

In [72]:
data.isna().sum()

title                             13
body                              57
url                               60
alchemy_category                   0
alchemy_category_score             0
avglinksize                        0
commonlinkratio_1                  0
commonlinkratio_2                  0
commonlinkratio_3                  0
commonlinkratio_4                  0
compression_ratio                  0
embed_ratio                        0
framebased                         0
frameTagRatio                      0
hasDomainLink                      0
html_ratio                         0
image_ratio                        0
is_news                            0
lengthyLinkDomain                  0
linkwordscore                      0
news_front_page                    0
non_markup_alphanum_characters     0
numberOfLinks                      0
numwords_in_url                    0
parametrizedLinkRatio              0
spelling_errors_ratio              0
label                              0
d

In [73]:
title_nlp = data['title'].dropna().apply(nlp)

In [112]:
text = title_nlp.apply(lambda doc : [token.text for token in doc]).rename("text")
lemma = title_nlp.apply(lambda doc : [token.lemma_ for token in doc]).rename("lemma")
pos = title_nlp.apply(lambda doc : [token.pos_ for token in doc]).rename("pos")
tag = title_nlp.apply(lambda doc : [token.tag_ for token in doc]).rename("tag")
is_alpha = title_nlp.apply(lambda doc : [token.is_alpha for token in doc]).rename("is_alpha")
is_stop = title_nlp.apply(lambda doc : [token.is_stop for token in doc]).rename("is_stop")
noun_chunks = title_nlp.apply(lambda doc : list(doc.noun_chunks)).rename("noun_chunks")
ent_text = title_nlp.apply(lambda doc : [ent.text for ent in doc.ents]).rename("ent_text")
ent_label = title_nlp.apply(lambda doc : [ent.label_ for ent in doc.ents]).rename("ent_label")
pd.concat([text, lemma, pos, tag, is_alpha, is_stop, noun_chunks, ent_text, ent_label],axis=1)

Unnamed: 0,text,lemma,pos,tag,is_alpha,is_stop,noun_chunks,ent_text,ent_label
0,"[IBM, Sees, Holographic, Calls, Air, Breathing, Batteries, ibm, sees, holographic, calls, ,, air, -, breathing, batteries]","[IBM, see, holographic, call, air, breathe, battery, ibm, see, holographic, call, ,, air, -, breathe, battery]","[PROPN, VERB, ADJ, NOUN, NOUN, VERB, NOUN, PROPN, VERB, ADJ, NOUN, PUNCT, NOUN, PUNCT, VERB, NOUN]","[NNP, VBZ, JJ, NNS, NN, VBG, NNS, NNP, VBZ, JJ, NNS, ,, NN, HYPH, VBG, NNS]","[True, True, True, True, True, True, True, True, True, True, True, False, True, False, True, True]","[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]","[(IBM), (Holographic, Calls), (Air, Breathing, Batteries), (ibm), (holographic, calls), (air, -, breathing, batteries)]","[IBM, ibm]","[ORG, ORG]"
1,"[The, Fully, Electronic, Futuristic, Starting, Gun, That, Eliminates, Advantages, in, Races, the, fully, electronic, ,, futuristic, starting, gun, that, eliminates, advantages, in, races, the, fully, electronic, ,, futuristic, starting, gun, that, eliminates, advantages, in, races]","[the, fully, electronic, futuristic, starting, gun, that, eliminate, advantage, in, race, the, fully, electronic, ,, futuristic, starting, gun, that, eliminate, advantage, in, race, the, fully, electronic, ,, futuristic, starting, gun, that, eliminate, advantage, in, race]","[DET, ADV, ADJ, ADJ, NOUN, NOUN, DET, VERB, NOUN, ADP, NOUN, DET, ADV, ADJ, PUNCT, ADJ, NOUN, NOUN, DET, VERB, NOUN, ADP, NOUN, DET, ADV, ADJ, PUNCT, ADJ, NOUN, NOUN, DET, VERB, NOUN, ADP, NOUN]","[DT, RB, JJ, JJ, NN, NN, WDT, VBZ, NNS, IN, NNS, DT, RB, JJ, ,, JJ, NN, NN, WDT, VBZ, NNS, IN, NNS, DT, RB, JJ, ,, JJ, NN, NN, WDT, VBZ, NNS, IN, NNS]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True]","[True, False, False, False, False, False, True, False, False, True, False, True, False, False, False, False, False, False, True, False, False, True, False, True, False, False, False, False, False, False, True, False, False, True, False]","[(The, Fully, Electronic, Futuristic, Starting, Gun), (Advantages), (Races), (the, fully, electronic, ,, futuristic, starting, gun), (advantages), (races), (the, fully, electronic, ,, futuristic, starting, gun), (advantages), (races)]",[],[]
2,"[Fruits, that, Fight, the, Flu, fruits, that, fight, the, flu, |, cold, &, flu, |, men, 's, health]","[fruit, that, fight, the, flu, fruit, that, fight, the, flu, |, cold, &, flu, |, man, 's, health]","[NOUN, DET, VERB, DET, NOUN, NOUN, DET, VERB, DET, NOUN, PUNCT, NOUN, CCONJ, NOUN, PUNCT, NOUN, PART, NOUN]","[NNS, WDT, VBP, DT, NN, NNS, WDT, VBP, DT, NN, :, NN, CC, NN, :, NNS, POS, NN]","[True, True, True, True, True, True, True, True, True, True, False, True, False, True, False, True, False, True]","[False, True, False, True, False, False, True, False, True, False, False, False, False, False, False, False, True, False]","[(Fruits), (the, Flu), (fruits), (the, flu), (cold), (flu), (men, 's, health)]",[],[]
3,"[10, Foolproof, Tips, for, Better, Sleep]","[10, foolproof, tip, for, well, sleep]","[NUM, ADJ, NOUN, ADP, ADJ, NOUN]","[CD, JJ, NNS, IN, JJR, NN]","[False, True, True, True, True, True]","[False, False, False, True, False, False]","[(10, Foolproof, Tips), (Better, Sleep)]",[10],[CARDINAL]
4,"[The, 50, Coolest, Jerseys, You, Didn, t, Know, Existed, coolest, jerseys, you, have, n't, seen]","[the, 50, cool, jersey, you, didn, t, know, exist, cool, jersey, you, have, n't, see]","[DET, NUM, ADJ, NOUN, PRON, AUX, ADV, VERB, VERB, ADJ, NOUN, PRON, AUX, PART, VERB]","[DT, CD, JJS, NNS, PRP, VBD, RB, VB, VBD, JJS, NNS, PRP, VBP, RB, VBN]","[True, False, True, True, True, True, True, True, True, True, True, True, True, False, True]","[True, False, False, False, True, False, False, False, False, False, False, True, True, True, False]","[(The, 50, Coolest, Jerseys), (You), (coolest, jerseys), (you)]",[50],[CARDINAL]
...,...,...,...,...,...,...,...,...,...
7390,"[Kno, Raises, 46, Million, More, To, Build, Most, Powerful, Tablet, Anyone, Has, Ever, Made, kno, raises, $, 46, million, more, to, build, &, #, 8220;most, powerful, tablet, anyone, has, ever, made&#8221, ;]","[Kno, raise, 46, million, More, to, build, most, powerful, tablet, anyone, have, ever, make, kno, raise, $, 46, million, more, to, build, &, #, 8220;most, powerful, tablet, anyone, have, ever, made&#8221, ;]","[PROPN, VERB, NUM, NUM, ADJ, PART, VERB, ADV, ADJ, NOUN, PRON, AUX, ADV, VERB, PROPN, VERB, SYM, NUM, NUM, ADJ, PART, VERB, X, X, X, ADJ, NOUN, PRON, AUX, ADV, X, PUNCT]","[NNP, VBZ, CD, CD, JJR, TO, VB, RBS, JJ, NN, NN, VBZ, RB, VBN, NNP, VBZ, $, CD, CD, JJR, TO, VB, XX, XX, XX, JJ, NN, NN, VBZ, RB, XX, :]","[True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, False, False, False, True, True, True, True, True, False, False]","[False, False, False, False, True, True, False, True, False, False, True, True, True, True, False, False, False, False, False, True, True, False, False, False, False, False, False, True, True, True, False, False]","[(Kno), (Most, Powerful, Tablet), (Anyone), (kno), (powerful, tablet), (anyone)]","[46 Million, $46 million]","[CARDINAL, MONEY]"
7391,"[Why, I, Miss, College]","[why, I, miss, college]","[ADV, PRON, VERB, NOUN]","[WRB, PRP, VBP, NN]","[True, True, True, True]","[True, True, False, False]","[(I), (College)]",[],[]
7392,"[Sweet, Potatoes, Eat, This, Not, That, , i, 'm, eating, this, ,, not, that]","[sweet, potato, eat, this, not, that, , I, be, eat, this, ,, not, that]","[ADJ, NOUN, VERB, DET, PART, DET, SPACE, PRON, AUX, VERB, DET, PUNCT, PART, DET]","[JJ, NNS, VBP, DT, RB, DT, _SP, PRP, VBP, VBG, DT, ,, RB, DT]","[True, True, True, True, True, True, False, True, False, True, True, False, True, True]","[False, False, False, True, True, True, False, True, True, False, True, False, True, True]",[(i)],[],[]
7393,"[Naturally, Ella]","[naturally, Ella]","[ADV, PROPN]","[RB, NNP]","[True, True]","[False, False]","[(Naturally, Ella)]",[Ella],[PERSON]
