In [1]:
import numpy as np
import pandas as pd
import json

data = pd.read_csv("stumbleupon.tsv", sep='\t')
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))
data.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,24,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,40,0,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,55,0,2240,258,11,0.166667,0.057613,1,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,...,24,0,2737,120,5,0.041667,0.100858,1,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,...,14,0,12032,162,10,0.098765,0.082569,0,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...


## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender.  

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
title|string|Title of the article
body|string|Body text of article
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonlinkratio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonlinkratio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonlinkratio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonlinkratio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

> ### Let's try extracting some of the text content.
> ### Create a feature for the title containing 'recipe'. Is the % of evegreen websites higher or lower on pages that have recipe in the the title?

In [3]:
data['recipe'] = data['title'].str.contains('recipe')

 ### Demo: Use of the Count Vectorizer

In [4]:
# this line converts rows with empty titles to titles that are of the string variable type
titles = data['title'].fillna('')

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.85,
                             min_df=5,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=True)

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `transform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles)

In [5]:
len(data)

7395

In [6]:
pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())

Unnamed: 0,000,10,10 best,10 foods,10 things,10 ways,100,100 hottest,101,101 cookbooks,...,york fashion,york slideshows,york village,youbeauty,youbeauty com,young,youtube,yummy,zoom,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


 ### Demo: Build a random forest model to predict evergreeness of a website using the title features

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 20)
    
# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles).todense()
y = data['label']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.78917292  0.80298715  0.79880434], Average AUC 0.796988137898


### Exercise: Build a random forest model to predict evergreeness of a website using the title features and quantitative features

In [21]:
# data = pd.concat([data, pd.get_dummies(data.alchemy_category)], axis=1)

In [9]:
model = RandomForestClassifier(n_estimators = 100)

In [11]:
# Use `transform` to generate the sample X word matrix - one column per feature (word or n-grams)
titles_transformed = vectorizer.transform(titles)
X_text_features = pd.DataFrame(titles_transformed.todense(), columns=vectorizer.get_feature_names())

# Identify the features you want from the original dataset
# dummy_categories = data.columns[-14: ]
other_features_columns = ['image_ratio', 
          'html_ratio', 
          'non_markup_alphanum_characters',
          'compression_ratio',
          'image_ratio',
          'html_ratio',
          'frameTagRatio']

# predictor_list = np.hstack([other_features_columns, dummy_categories])
other_features = data[other_features_columns]

# Stack them horizontally together
# This takes all of the word/n-gram columns and appends on two more columns for `html_ratio` and `image_ratio`
X = pd.concat([X_text_features, other_features], axis=1)

scores = cross_val_score(model, X.values, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

# What features of these are most important?
model.fit(X, y)

# create a dataframe out of the predictors used to fit the model
all_feature_names = vectorizer.get_feature_names() + other_features_columns
feature_importances = pd.DataFrame({'Features' : all_feature_names, 'Importance Score': model.feature_importances_})
print feature_importances.sort('Importance Score', ascending=False).head(20)

CV AUC [ 0.81372235  0.82102437  0.81465813], Average AUC 0.816468286278
                            Features  Importance Score
2047  non_markup_alphanum_characters          0.067081
2051                   frameTagRatio          0.064980
2050                      html_ratio          0.050906
2046                      html_ratio          0.049955
2048               compression_ratio          0.044064
2045                     image_ratio          0.039158
2049                     image_ratio          0.037028
1483                          recipe          0.028962
1503                         recipes          0.016558
380                        chocolate          0.009796
707                          fashion          0.007809
365                          chicken          0.007452
1260                            news          0.007095
1708                          sports          0.006477
755                             food          0.006298
297                             cake          0

 ### Exercise: Build a random forest model to predict evergreeness of a website using the body features

In [12]:
body_text = data['body'].fillna('')

vectorizer = CountVectorizer(max_features=2000,
                             max_df=0.20,
                             min_df=5,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=True)

# Use `fit` to learn the vocabulary
vectorizer.fit(body_text)

# # Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(body_text).todense()

model = RandomForestClassifier(n_estimators = 100)
scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.84415745  0.86097365  0.84849957], Average AUC 0.851210224642


In [56]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort('Importance Score', ascending=False).head(20)

         Features  Importance Score
1420      recipes          0.025939
266        butter          0.011931
906   ingredients          0.010886
463          cups          0.010785
1499         salt          0.010117
316        cheese          0.010022
163        baking          0.009349
1224         oven          0.009061
409          cook          0.008791
161          bake          0.008463
416       cooking          0.008318
1723        sugar          0.007945
232          bowl          0.007883
1126      mixture          0.007687
697         flour          0.006806
1262       pepper          0.006333
1508        sauce          0.006327
438         cream          0.006062
650       fashion          0.005797
1121          mix          0.005176


In [13]:
len(vectorizer.get_feature_names())

2000

 ### Exercise: Use `TfIdfVectorizer` instead of `CountVectorizer` - is this an improvement? Try with title and body

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
body_text = data['body'].fillna('')

vectorizer = TfidfVectorizer(max_features=2000,
                             max_df=0.20,
                             min_df=5,
                             ngram_range=[1, 2], 
                             stop_words='english')

# Use `fit` to learn the vocabulary
vectorizer.fit(body_text)

# # Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(body_text).todense()

model = RandomForestClassifier(n_estimators = 100)
scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.84852093  0.86020158  0.85356209], Average AUC 0.854094868452
