In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import json
%matplotlib inline

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 300

## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender and was made available [here](https://www.kaggle.com/c/stumbleupon/download/train.tsv)

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonLinkRatio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonLinkRatio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonLinkRatio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonLinkRatio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### What are 'evergreen' sites?
- These are websites that always relevant like recipies or reviews (as opposed to current events)
- Look at some examples

In [5]:
df_data = pd.read_csv('../../assets/datasets/train.csv', sep='\t', na_values='?')

df_data.head()


Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html,4042,"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1.0,1,24,0.0,5424,170,8,0.152941,0.07913,0
1,http://www.popsci.com/technology/article/2012-07/electronic-futuristic-starting-gun-eliminates-advantages-races,8471,"{""title"":""The Fully Electronic Futuristic Starting Gun That Eliminates Advantages in Races the fully electronic, futuristic starting gun that eliminates advantages in races the fully electronic, futuristic starting gun that eliminates advantages in races"",""body"":""And that can be carried on a pla...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.20349,0.088652,1.0,1,40,0.0,4973,187,9,0.181818,0.125448,1
2,http://www.menshealth.com/health/flu-fighting-fruits?cm_mmc=Facebook-_-MensHealth-_-Content-Health-_-FightFluWithFruit,1164,"{""title"":""Fruits that Fight the Flu fruits that fight the flu | cold & flu | men's health"",""body"":""Apples The most popular source of antioxidants in our diet one apple has an antioxidant effect equivalent to 1 500 mg of vitamin C Apples are loaded with protective flavonoids which may prevent hea...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1.0,1,55,0.0,2240,258,11,0.166667,0.057613,1
3,http://www.dumblittleman.com/2007/12/10-foolproof-tips-for-better-sleep.html,6684,"{""title"":""10 Foolproof Tips for Better Sleep "",""body"":""There was a period in my life when I had a lot of problems with sleep It took me very long to fall asleep I was easily awaken and I simply wasn t getting enough of rest at night I didn t want to take medication and this led me to learn sever...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1.0,0,24,0.0,2737,120,5,0.041667,0.100858,1
4,http://bleacherreport.com/articles/1205138-the-50-coolest-jerseys-you-didnt-know-existed?show_full=,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Know Existed coolest jerseys you haven't seen"",""body"":""Jersey sales is a curious business Whether you re buying the stylish top to represent your favorite team player or color you re always missing out on better artwork With No 18 Colts jerseys continu...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1.0,1,14,0.0,12032,162,10,0.098765,0.082569,0


In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# Fill NAs
df_data['alchemy_category'].fillna('Unknown', inplace=True)
df_data['alchemy_category_score'].fillna(0.40, inplace=True)
df_data[df_data.alchemy_category == 'Unknown']['alchemy_category_score'].value_counts()

# Feature Selection
X = df_data[[x for x in df_data.columns if x not in ['label','url','is_news','news_front_page','url_id']]]
X = pd.get_dummies(X)
y = df_data['label']


X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7395 entries, 0 to 7394
Columns: 7429 entries, urlid to alchemy_category_weather
dtypes: float64(7421), int64(8)
memory usage: 419.1 MB


In [7]:
from sklearn import preprocessing

# Normalizing data and getting coefeccients
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)
model = LogisticRegression()
model.fit(X_norm, y)
coeffs = pd.DataFrame(model.coef_, columns = [X.columns.values])
coeffs = coeffs.transpose()
           
coeffs.head()


Unnamed: 0,0
urlid,-0.15605
alchemy_category_score,-0.064963
avglinksize,0.010578
commonlinkratio_1,0.137689
commonlinkratio_2,0.117916


In [9]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=10)
selector.fit(X_norm, y)

keep = pd.DataFrame(selector.get_support())
keeper = keep.transpose()
keeper.columns = [x for x in X.columns.values]
keep = keeper.transpose()

print 'Based on K best here is what I would keep...'
keep_2 = keep[keep[0] == True]

keep_2

Based on K best here is what I would keep...


Unnamed: 0,0
commonlinkratio_2,True
commonlinkratio_3,True
frameTagRatio,True
linkwordscore,True
non_markup_alphanum_characters,True
alchemy_category_arts_entertainment,True
alchemy_category_business,True
alchemy_category_computer_internet,True
alchemy_category_recreation,True
alchemy_category_sports,True


In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=10, step=1)
rfe.fit(X_norm, y)

rfe_values = rfe.support_
rfe_values = X_norm.columns[rfe_values].values

print "Final Features to Keep According to RFE"
print rfe_values

In [None]:
# Feature engineering to construct new features
from sklearn.feature_extraction.text import CountVectorizer

vector_1 = CountVectorizer(
    binary=True,  
    stop_words='english', 
    max_features=50, 
)

X_1 = vector_1.fit_transform(df_data.boilerplate).todense()
df_X_1 = pd.DataFrame(X_1, columns=vector_1.get_feature_names()).head()

In [None]:
# Train a logit model

def model(features):
    X = df_data[features]
    y = df_data['label']

    logreg = LogisticRegression()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    logreg.fit(X_train, y_train)
    
    predictions = logreg.predict(X_test)
    
    confustion_matrix = confusion_matrix(y_test, predictions)
   
    df_final = pd.DataFrame(confusion_matrix, index=['Green','Not Green'], columns=['Green Predicted', 'Green NOT Predicted'])
                                       
    print df_final
    print classification_report(y_test, predictions)


print "K_Best: ",
modeler(keep_2)

print "RFE: ",
modeler(rfe_values)