In [112]:
import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from selenium import webdriver
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

import actionability

Using the <a href="https://code.google.com/p/word2vec/">Google News Word2Vec model</a>:

In [2]:
# Pretrained Google News embeddings, stored in the binary word2vec format.
# NOTE(review): recent gensim releases provide this loader on KeyedVectors --
# confirm against the installed gensim version before upgrading.
GNEWS_VECTORS_PATH = '../data/GoogleNews-vectors-negative300.bin'
gnews_model = Word2Vec.load_word2vec_format(GNEWS_VECTORS_PATH, binary=True)

In [3]:
# Load the stopword list used when vectorizing page text.
# split() with no argument splits on any run of whitespace, so newlines
# (including a trailing one at the end of the file) never stay glued to a
# token and repeated separators never produce empty-string entries, both of
# which split(' ') would allow.
with open('../data/stopwords.txt') as f:
    stopwords = f.read().split()

Load up the CrowdFlower data:

In [39]:
# CrowdFlower judgements; the first CSV column is used as the row index.
RATINGS_CSV_PATH = '../data/actiondataset1.csv'
df = pd.read_csv(RATINGS_CSV_PATH, index_col=0)

In [148]:
# Preview the raw judgements: several rows can share one _unit_id, each with
# its own _created_at timestamp and Yes/No actionrating.
df.head()

Unnamed: 0,_unit_id,_created_at,_golden,actionrating,action,action_inst,domain,site
24,716816272,5/7/2015 16:55:09,False,No,TicketsAction,TicketsAction: Does the page have a Web Link t...,todaytix.com,http://blog.todaytix.com/tagged/Shoshana+Bean?...
25,716816272,5/7/2015 16:55:48,False,No,TicketsAction,TicketsAction: Does the page have a Web Link t...,todaytix.com,http://blog.todaytix.com/tagged/Shoshana+Bean?...
26,716816272,5/7/2015 16:56:04,False,No,TicketsAction,TicketsAction: Does the page have a Web Link t...,todaytix.com,http://blog.todaytix.com/tagged/Shoshana+Bean?...
27,716816272,5/7/2015 16:57:08,False,Yes,TicketsAction,TicketsAction: Does the page have a Web Link t...,todaytix.com,http://blog.todaytix.com/tagged/Shoshana+Bean?...
28,716816272,5/7/2015 16:57:17,False,Yes,TicketsAction,TicketsAction: Does the page have a Web Link t...,todaytix.com,http://blog.todaytix.com/tagged/Shoshana+Bean?...


Aggregate ratings by averaging across users per website to obtain a percentage action rating.

In [149]:
# Collapse the individual judgements with actionability.aggregate_ratings.
# The displayed result is indexed by _unit_id and carries a fractional
# actionrating plus a rating_count column.
agg_df = actionability.aggregate_ratings(df)
agg_df.head()

Unnamed: 0_level_0,action,action_inst,actionrating,domain,site,rating_count
_unit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
716816272,TicketsAction,TicketsAction: Does the page have a Web Link t...,0.461538,todaytix.com,http://blog.todaytix.com/tagged/Shoshana+Bean?...,13
716816295,ListenAction,ListenAction: Does the web page offer you the ...,0.866667,rdio.com,http://www.rdio.com/browse/?ppKey\u003dstation...,15
716816300,TicketsAction,TicketsAction: Does the page have a Web Link t...,0.555556,todaytix.com,http://blog.todaytix.com/tagged/Jason+Robert+B...,9
716816328,ListenAction,ListenAction: Does the web page offer you the ...,0.5,spotify.com,https://press.spotify.com/int/2013/12/17/spoti...,10
716817009,ListenAction,ListenAction: Does the web page offer you the ...,0.0,rhapsody.com,http://api.rhapsody.com/v1/genres/g.397/posts?...,9


In [152]:
# Count the aggregated pages per domain to see how evenly the domains are covered.
agg_df.domain.value_counts()

spotify.com         101
rdio.com            101
songkick.com        100
stubhub.com         100
razorgator.com      100
livenation.com      100
pandora.com         100
soundcloud.com      100
seatgeek.com        100
bandsintown.com     100
ticketmaster.com    100
rhapsody.com        100
thrillcall.com      100
todaytix.com          2
gametime.co           1
dtype: int64

We'll exclude todaytix and gametime in our cross-validation.

In [153]:
# Drop the domains that have too few rated pages to serve as a held-out fold.
excluded_domains = ['todaytix.com', 'gametime.co']
agg_df = agg_df[~agg_df.domain.isin(excluded_domains)]

Create a DataFrame of features <code>vec_df</code> by retrieving the source of each webpage and vectorizing it with <code>actionability.vectorize_from_source</code>. This takes a while, so I include a precomputed CSV file (<code>vec_df.csv</code>).

In [None]:
# Feature matrix for the rated pages. Fetching and vectorizing every page is
# slow, so the result is cached in VEC_DF_PATH and only recomputed when that
# file is missing (delete the file to force a refresh).
VEC_DF_PATH = 'vec_df.csv'

if not os.path.exists(VEC_DF_PATH):
    driver = webdriver.PhantomJS()
    try:
        source_ser = actionability.get_source_series(agg_df.site, driver=driver)
        vec_series = source_ser.apply(
            lambda source: actionability.vectorize_from_source(source, gnews_model, stopwords))
    finally:
        # Always shut the headless browser down, even if a fetch fails.
        driver.quit()
    # NOTE(review): assumes vector_series_to_df returns a pandas DataFrame -- confirm.
    actionability.vector_series_to_df(vec_series).to_csv(VEC_DF_PATH)

# Always load from the CSV so freshly computed and cached features go through
# the same parsing and end up with identical index/column labels.
vec_df = pd.read_csv(VEC_DF_PATH, index_col=0)

In [154]:
# Preview the feature matrix: rows are indexed by _unit_id and the columns
# 0-299 hold the components of each page vector.
vec_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
_unit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
716816272,10.069473,9.321749,-4.064606,48.464424,-41.245636,5.038231,-3.107137,-41.918358,37.715382,13.004594,...,15.889442,33.140083,-34.716434,9.090375,-1.326906,-58.531166,3.250105,-54.855858,-36.859932,4.588878
716816295,-11.258841,59.295261,19.907583,115.833015,-101.33934,18.915688,18.909508,-97.30146,123.350487,57.115135,...,-0.428824,115.269157,-94.980164,26.671297,-9.354056,-77.142555,-27.885612,-69.142197,-76.349159,35.548969
716816300,8.139248,14.705843,-2.836876,58.641243,-45.268124,2.378229,-5.284815,-52.851181,45.638798,17.13373,...,16.641171,38.699951,-41.015625,15.234818,-3.316744,-72.264549,2.748202,-68.977898,-45.93325,4.678903
716816328,3.140223,15.963882,-9.070687,47.742367,-41.611755,-10.543368,2.995939,-58.924545,63.105946,30.748421,...,4.946953,33.525127,-37.237831,22.710173,-6.841942,-45.907303,-16.709261,-52.530914,-16.527824,13.478872
716817009,0.165652,-0.015591,0.128503,-0.182143,-0.102472,-0.019821,-0.15138,0.138392,0.061157,-0.07742,...,-0.006594,0.066137,0.177061,-0.143382,-0.050867,-0.093446,-0.005401,-0.432284,-0.129643,-0.027799


For each action, perform leave-one-label-out cross-validation using the domain names as labels.

In [147]:
# Leave-one-domain-out cross-validation, run independently for each action:
# every domain is held out in turn, the classifier is trained on the other
# domains of the same action and scored on the held-out one. Per-domain
# metrics are printed as a table, followed by metrics pooled over all folds.
RATING_THRESHOLD = 0.5  # a page is a positive example when its mean rating exceeds this
METRIC_COLUMNS = ['precision', 'recall', 'accuracy', 'true_rate']

clf = LogisticRegression(C=0.6)
gb = agg_df.groupby('action')

# Iterating the groupby itself (instead of gb.groups.keys()) yields the actions
# in sorted order, so the report is laid out the same way on every run.
# All print calls take a single pre-formatted string so they produce the same
# text under both the Python 2 print statement and the Python 3 print function.
for action, action_df in gb:
    print(action)

    # Features, binary targets and domains for this action's pages; all three
    # share the same index. .loc is purely label-based, unlike the deprecated .ix.
    action_index = action_df.index
    X_df = vec_df.loc[action_index]
    Y_ser = action_df.actionrating > RATING_THRESHOLD
    domain_ser = action_df.domain

    rows = []
    all_Y_true = []
    all_Y_pred = []

    for domain in np.unique(domain_ser):
        # Derive the split from the per-action domains so the masks are already
        # aligned with X_df / Y_ser and no implicit boolean reindexing is needed.
        train_index = domain_ser[domain_ser != domain].index
        test_index = domain_ser[domain_ser == domain].index

        X_train = X_df.loc[train_index].values
        Y_train = Y_ser.loc[train_index].values

        X_test = X_df.loc[test_index].values
        Y_true = Y_ser.loc[test_index].values

        # fit() retrains from scratch, so reusing one estimator across folds is safe.
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        # Only precision and recall are reported; f-score and support are dropped.
        prec, recall = precision_recall_fscore_support(Y_true, Y_pred, average='binary')[:2]
        accuracy = (Y_true == Y_pred).mean()
        rows.append({'domain': domain, 'precision': prec, 'recall': recall,
                     'accuracy': accuracy, 'true_rate': np.mean(Y_true)})

        all_Y_true.extend(Y_true)
        all_Y_pred.extend(Y_pred)

    # Build the per-domain table once, after all folds have been collected.
    ac_df = pd.DataFrame(rows).set_index('domain')
    print('%s \n' % ac_df[METRIC_COLUMNS])

    # Pooled metrics over the predictions of every held-out fold of this action.
    prec, recall = precision_recall_fscore_support(all_Y_true, all_Y_pred, average='binary')[:2]
    accuracy = (np.array(all_Y_true) == np.array(all_Y_pred)).mean()
    print('overall precision: %f' % prec)
    print('overall recall: %f' % recall)
    print('overall accuracy: %f \n' % accuracy)
    

TicketsAction
                  precision    recall  accuracy  true_rate
domain                                                    
bandsintown.com    0.050000  0.500000      0.80       0.02
livenation.com     0.654545  0.782609      0.71       0.46
razorgator.com     0.938272  0.835165      0.80       0.91
seatgeek.com       0.770000  1.000000      0.77       0.77
songkick.com       1.000000  0.355556      0.71       0.45
stubhub.com        0.780000  1.000000      0.78       0.78
thrillcall.com     0.741379  0.796296      0.74       0.54
ticketmaster.com   0.711538  0.513889      0.50       0.72 

overall precision: 0.755187
overall recall: 0.782796
overall accuracy: 0.726250 

ListenAction
                precision    recall  accuracy  true_rate
domain                                                  
pandora.com      0.500000  0.250000  0.440000   0.560000
rdio.com         0.000000  0.000000  0.168317   0.801980
rhapsody.com     0.846154  0.916667  0.820000   0.720000
soundcloud.com

These results differ somewhat from the results given in the slides, owing perhaps to the webpages changing over time.