# Read in results

In [1]:
from sklearn.metrics import roc_auc_score
import sys
sys.path.append('text_nn')
import pandas as pd
import rationale_net
from rationale_net.datasets.new_york_times_dataset import NYTimesDataset
import pickle
results = pickle.load(open("exploratory_analysis/logs/results__model-form_cnn__num-layers_1__word-cutoff_800__epochs_30", 'rb'))

In [2]:
pd.Series(results)

train                                                              True
test                                                               True
cuda                                                               True
num_gpus                                                              1
debug_mode                                                        False
class_balance                                                     False
objective                                                 cross_entropy
aspect                                                          overall
init_lr                                                          0.0001
epochs                                                               30
batch_size                                                           64
patience                                                              5
tuning_metric                                                      loss
save_dir                                                       s

In [63]:
results['train_data'][3]['text']

'lead : paul vario , convicted gangster said leader lucchese crime family three decades , died lung failure tuesday fort worth federal prison , serving 10-year sentence extorting payoffs air - freight companies kennedy international airport . lead : paul vario , convicted gangster said leader lucchese crime family three decades , died lung failure tuesday fort worth federal prison , serving 10-year sentence extorting payoffs air - freight companies kennedy international airport . paul vario , convicted gangster said leader lucchese crime family three decades , died lung failure tuesday fort worth federal prison , serving 10-year sentence extorting payoffs air - freight companies kennedy international airport . 73 years old . prison officer , alice d. davis , said mr. vario , ailing several years , found late afternoon floor cell minimum - security penitentiary . taken st. joseph hospital pronounced dead 5:04 . autopsy attributed death respiratory arrest resulting chronic obstructive lu

In [4]:
test_stats = pd.Series(results['test_stats'])

True ratio

In [5]:
pd.Series(results['test_stats']['golds']).value_counts()

0    246089
1     11379
dtype: int64

Predicted ratio

In [6]:
pd.Series(results['test_stats']['preds']).value_counts()

0    188186
1     69282
dtype: int64

In [7]:
y_proba = list(map(lambda x: x[1] if x[0] == 1 else 1-x[1], zip(test_stats['preds'], test_stats['probas'])))
y_true = results['test_stats']['golds']

ROC

In [8]:
roc_auc_score(y_true, y_proba)

0.77327062773741

# Rationals

In [9]:
rationales = results['test_stats']['rationales']

In [30]:
def get_rationale_words(rationale):
    rational_words = rationale.split('_')
    rational_words = list(map(lambda x: x.strip(), rational_words))
    in_rationale = False
    ## 
    all_rationals = []
    rational_output = []
    for word in rational_words:
        if in_rationale:
            if word != '':
                rational_output.append(word)
            else:
                all_rationals.append(' '.join(rational_output))
                rational_output = []
                in_rationale = False
        else:
            if word != '':
                rational_output.append(word)
                in_rationale = True
    return all_rationals

In [32]:
rationale_words = pd.Series(rationales).apply(get_rationale_words)

In [40]:
(rationale_words
 .apply(lambda x: list(map(lambda y: str(len(y.split(' '))), x)))
 .str.join(' ').str.split(' ', expand=True)
 .unstack().dropna()
 .value_counts()
)

1    252597
       4979
2         3
dtype: int64

In [41]:
rational_df = pd.concat([
    rationale_words.to_frame('word_list'),
    pd.Series(y_true).to_frame('y_true'),
    pd.Series(results['test_stats']['preds']).to_frame('y_pred')
], axis=1)

Overall most popular rationales

In [56]:
(rational_df['word_list']
 .str.join(':::')
 .str.split(':::', expand=True)
 .unstack().dropna()
.value_counts().head(10)
)

(                 35870
:                 21409
)                 20459
,                 16027
old               12019
--                 9434
bush               7525
administration     7495
;                  6779
-                  5641
dtype: int64

Popular two-word rationales

In [57]:
(rational_df['word_list']
 .str.join(':::')
 .str.split(':::', expand=True)
 .unstack().dropna()
 .loc[lambda s: s.str.split().str.len()==2]
 .value_counts()
 .head(10)
)

( nyt    2
( )      1
dtype: int64

Popular rationales for y_pred = 0 (article != front page)

In [61]:
(rational_df
 .loc[lambda df: df['y_pred'] == 0]['word_list']
 .str.join(':::')
 .str.split(':::', expand=True)
 .unstack().dropna()
 .value_counts().head(50)
)

(              35870
:              21409
)              20459
,              16027
old            12019
--              9434
;               6779
-               5641
                4834
.               4542
nyt             3665
$               2562
editor          1564
/               1409
million         1276
bloomberg       1101
street           751
cents            694
billion          672
born             596
chief            523
john             441
robert           403
's               382
former           354
analysts         336
died             315
new              311
department       301
corporation      299
us               290
9                285
c.               283
william          279
city             277
county           276
&                275
state            268
percent          256
inc.             237
law              214
george           208
agency           207
s.               199
civil            191
reported         184
johnson          183
corzine      

Popular rationals for y_pred = 1 (article = front page)

In [59]:
(rational_df
 .loc[lambda df: df['y_pred'] == 1]['word_list']
 .str.join(':::')
 .str.split(':::', expand=True)
 .unstack().dropna()
 .value_counts().head(10)
)

bush              7525
administration    7495
government        5034
federal           4650
officials         3590
say               2256
people            1413
workers           1365
experts            849
lawmakers          795
dtype: int64