In [11]:
import pandas
import numpy as np
from metrics import Metrics
from sklearn import metrics
import glob
import collections
import os
import pandas
import json

In [53]:
subreddits = ['AskMen', 'askscience', 'AskWomen', 'atheism', 'changemyview',
              'Fitness', 'politics', 'worldnews']


def MakePredictions(filename):
    df = pandas.read_csv(filename, index_col=0)
    df[subreddits] = df[subreddits].multiply(rawdata['length'], axis='index')
    df['idcode'] = rawdata['idcode']
    df['length'] = rawdata['length']
    grps = df.groupby('idcode')
    
    labels = grps.label.first()
    totals = grps.agg(sum)
    totals['label'] = labels
    
    totals[subreddits] = totals[subreddits].subtract(totals[subreddits].mean(1), axis=0)
    totals[subreddits] = totals[subreddits].divide(np.std(totals[subreddits], axis=1), axis=0)
    
    ss = np.array(subreddits)
    totals['pred'] = ss[np.argmin(totals[subreddits].values, axis=1)]

    return totals

In [54]:
rawdata = pandas.read_csv('/s0/ajaech/reddit05.tsv.bz2', sep='\t', header=None, compression='bz2',
                    nrows=20000)
rawdata.columns = ['subreddit', 'idcode', 'text']
rawdata['length'] = rawdata['text'].apply(lambda x: len(x.split()) + 1)
rawdata.head()

Unnamed: 0,subreddit,idcode,text,length
0,politics,cqug90i,are you really implying we return to those tim...,17
1,politics,cqug90i,"if so , you wo n't have much luck selling the ...",25
2,todayilearned,cqug912,get back to your pott harry .,8
3,atheism,cqug91m,he 's not claiming a god exists .,9
4,atheism,cqug91m,he 's positing a god exists and he 's positing...,16


In [55]:
def GetPPL(filename):
    dirname = os.path.dirname(filename)
    pplfile = os.path.join(dirname, 'ppl.txt')
    if os.path.exists(filename):
        with open(pplfile, 'r') as f:
          lines = f.readlines()
        if len(lines):
            ppl = lines[-1].split()[-1]
            return float(ppl)
    return None

In [56]:
def GetParams(filename):
    dirname = os.path.dirname(filename)
    paramsfile = os.path.join(dirname, 'params.json')
    with open(paramsfile, 'r') as g:
        params = json.load(g)
    return params

In [131]:
dataframes = {}
for filename in glob.glob('exps/newstrat*/classify.csv'):
    print filename
    dataframes[filename] = MakePredictions(filename)

exps/newstrat2/classify.csv
exps/newstrat4/classify.csv
exps/newstrat5/classify.csv
exps/newstrat9/classify.csv
exps/newstrat8/classify.csv
exps/newstrat6/classify.csv
exps/newstrat10/classify.csv
exps/newstrat13/classify.csv
exps/newstrat11/classify.csv
exps/newstrat3/classify.csv
exps/newstrat7/classify.csv
exps/newstrat12/classify.csv
exps/newstrat1/classify.csv
exps/newstrat15/classify.csv
exps/newstrat14/classify.csv


In [132]:
def GetAuc(sub, df):
    fpr, tpr, thresholds = metrics.roc_curve(df.label == sub, -df[sub])
    return metrics.auc(fpr, tpr)

def PlotAuc(sub, df):
    fpr, tpr, thresholds = metrics.roc_curve(df.label == sub, -df[sub])
    pyplot.plot(fpr, tpr)

In [136]:
results = []
for filename in dataframes:
    row = {'model': filename}
    df = dataframes[filename]
    for sub in subreddits:
        row[sub] = GetAuc(sub, df)
    f1, acc = Metrics(list(df[df.label.isin(subreddits)].pred.values), 
                      list(df[df.label.isin(subreddits)].label.values), show=False)
    row['f1'] = f1
    row['acc'] = acc
    row['ppl'] = GetPPL(filename)
    
    params = GetParams(filename)
    row['hash'] = int(params['use_hash_table'])
    row['hyper'] = int(params['use_hyper_adaptation'])
    row['mikolov'] = int(params['use_mikolov_adaptation'])
    row['softmax'] = int(params['use_softmax_adaptation'])
    
    results.append(row)
results = pandas.DataFrame(results)
results['avg'] = results[subreddits].mean(axis=1)

results['delta_ppl'] = (71.75 - results.ppl) / results.ppl

z = results.sort_values('ppl', ascending=False)[['acc', 'f1', 'avg', 'ppl', 'delta_ppl', 'model',
                          'hyper', 'mikolov', 'softmax', 'hash']]
z

Unnamed: 0,acc,f1,avg,ppl,delta_ppl,model,hyper,mikolov,softmax,hash
14,0.416949,39.735424,0.764612,69.641,0.030284,exps/newstrat1/classify.csv,0,0,0,1
5,0.420339,40.645864,0.759036,69.439,0.033281,exps/newstrat2/classify.csv,1,0,0,0
11,0.417627,39.880366,0.767122,69.005,0.03978,exps/newstrat6/classify.csv,1,1,0,0
9,0.392542,38.513478,0.759267,68.783,0.043136,exps/newstrat10/classify.csv,1,0,1,0
2,0.383051,37.192194,0.761186,68.44,0.048364,exps/newstrat4/classify.csv,0,1,0,0
4,0.408136,39.393015,0.756887,68.342,0.049867,exps/newstrat14/classify.csv,1,1,1,0
6,0.395254,38.723171,0.755106,68.021,0.054821,exps/newstrat8/classify.csv,0,0,1,0
13,0.449492,43.612267,0.780663,67.998,0.055178,exps/newstrat3/classify.csv,1,0,0,1
7,0.386441,37.293691,0.75327,67.965,0.05569,exps/newstrat12/classify.csv,0,1,1,0
3,0.444746,43.512169,0.790765,67.508,0.062837,exps/newstrat7/classify.csv,1,1,0,1


In [137]:
df = dataframes['exps/newstrat11/classify.csv']
Metrics(list(df[df.label.isin(subreddits)].pred.values), list(df[df.label.isin(subreddits)].label.values));

accuracy = 0.463
 Lang     Prec.   Rec.   F1
------------------------------
  AskMen   30.87  31.72  31.29
  AskWomen   37.04  44.59  40.46
  Fitness   56.41  77.88  65.43
  askscience   32.10  81.25  46.02
  atheism   52.11  48.90  50.45
  changemyview   17.83  29.47  22.22
  politics   60.69  52.54  56.32
  worldnews   57.50  37.20  45.17
------------------------------
  Total:   43.07  50.44  44.67


In [138]:
def GetYesNo(x):
    if x == 1:
        return 'Y'
    return 'N'

baseline = 75.163

for i in range(len(z)):
    row = z.iloc[i]
    delta = (baseline - row.ppl) / baseline
    s = (GetYesNo(row.hyper), GetYesNo(row.mikolov), GetYesNo(row.softmax), GetYesNo(row.hash),
         '{0:.1f}'.format(row.ppl), '{0:.1f}\\%'.format(100 * delta), '{0:.1f}'.format(100 * row.avg))
    print ' & '.join(s) + ' \\\\'

N & N & N & Y & 69.6 & 7.3\% & 76.5 \\
Y & N & N & N & 69.4 & 7.6\% & 75.9 \\
Y & Y & N & N & 69.0 & 8.2\% & 76.7 \\
Y & N & Y & N & 68.8 & 8.5\% & 75.9 \\
N & Y & N & N & 68.4 & 8.9\% & 76.1 \\
Y & Y & Y & N & 68.3 & 9.1\% & 75.7 \\
N & N & Y & N & 68.0 & 9.5\% & 75.5 \\
Y & N & N & Y & 68.0 & 9.5\% & 78.1 \\
N & Y & Y & N & 68.0 & 9.6\% & 75.3 \\
Y & Y & N & Y & 67.5 & 10.2\% & 79.1 \\
Y & N & Y & Y & 67.2 & 10.6\% & 78.9 \\
Y & Y & Y & Y & 67.1 & 10.7\% & 79.2 \\
N & Y & N & Y & 66.9 & 11.0\% & 78.9 \\
N & N & Y & Y & 66.9 & 11.0\% & 78.4 \\
N & Y & Y & Y & 66.5 & 11.5\% & 78.4 \\


In [107]:
df = dataframes['exps/strat14/classify.csv']
Metrics(list(df[df.label.isin(subreddits)].pred.values), list(df[df.label.isin(subreddits)].label.values));

KeyError: 'exps/strat14/classify.csv'

In [10]:
for sub in subreddits:
    PlotAuc(sub, df)
pyplot.legend(subreddits, loc=4)
pyplot.show()

NameError: global name 'pyplot' is not defined

In [None]:
collections.Counter(zip(df.label, df.pred))

In [None]:
len(df)

In [None]:
df.head()

In [None]:
df[subreddits].plot()
pyplot.show()

In [None]:
pyplot.plot(np.convolve(df.hockey, [1.0]*80))
pyplot.show()