# Read in results

In [1]:
from sklearn.metrics import roc_auc_score
import sys
sys.path.append('text_nn')
import pandas as pd
import rationale_net
from rationale_net.datasets.new_york_times_dataset import NYTimesDataset
import pickle
# Load the pickled results dictionary written by the training run.
# NOTE: pickle.load can execute arbitrary code while deserialising; only point
# this at result files produced by a trusted run.
# The context manager closes the file handle once loading is done.
with open("exploratory_analysis/logs/results__model-form_cnn__num-layers_1__word-cutoff_400__epochs_30", 'rb') as results_file:
    results = pickle.load(results_file)

In [2]:
# Display every entry of the results dictionary as a labelled Series.
results_overview = pd.Series(results)
results_overview

train                                                              True
test                                                               True
cuda                                                               True
num_gpus                                                              1
debug_mode                                                        False
class_balance                                                     False
objective                                                 cross_entropy
aspect                                                          overall
init_lr                                                          0.0001
epochs                                                               30
batch_size                                                           64
patience                                                              5
tuning_metric                                                      loss
save_dir                                                       s

In [3]:
# Peek at the text of one training example (index 3) stored in the results.
sample_example = results['train_data'][3]
sample_example['text']

'lead : paul vario , convicted gangster said leader lucchese crime family three decades , died lung failure tuesday fort worth federal prison , serving 10-year sentence extorting payoffs air - freight companies kennedy international airport . lead : paul vario , convicted gangster said leader lucchese crime family three decades , died lung failure tuesday fort worth federal prison , serving 10-year sentence extorting payoffs air - freight companies kennedy international airport . paul vario , convicted gangster said leader lucchese crime family three decades , died lung failure tuesday fort worth federal prison , serving 10-year sentence extorting payoffs air - freight companies kennedy international airport . 73 years old . prison officer , alice d. davis , said mr. vario , ailing several years , found late afternoon floor cell minimum - security penitentiary . taken st. joseph hospital pronounced dead 5:04 . autopsy attributed death respiratory arrest resulting chronic obstructive lu

In [4]:
# Wrap the test-set statistics in a Series so entries can be looked up by label.
raw_test_stats = results['test_stats']
test_stats = pd.Series(raw_test_stats)

True ratio

In [5]:
# Count how many test examples carry each gold label.
gold_labels = pd.Series(results['test_stats']['golds'])
gold_labels.value_counts()

0    246089
1     11379
dtype: int64

Predicted ratio

In [6]:
# Count how many test examples were assigned each predicted label.
predicted_labels = pd.Series(results['test_stats']['preds'])
predicted_labels.value_counts()

0    171564
1     85904
dtype: int64

In [7]:
# Turn each (prediction, probability) pair into a score for label 1: keep the
# probability when the prediction is 1, otherwise use its complement.
# NOTE(review): this assumes 'probas' holds the probability of the *predicted*
# label -- confirm against the code that writes test_stats.
y_proba = [
    proba if pred == 1 else 1 - proba
    for pred, proba in zip(test_stats['preds'], test_stats['probas'])
]
# Gold labels for the same test examples.
y_true = results['test_stats']['golds']

ROC AUC

In [8]:
# Area under the ROC curve, scoring each example by its label-1 score.
roc_auc_score(y_true=y_true, y_score=y_proba)

0.7650085622044425

# Rationales

In [9]:
# Pull the per-example rationale strings out of the test statistics.
test_statistics = results['test_stats']
rationales = test_statistics['rationales']

In [10]:
def get_rationale_words(rationale):
    """Extract the selected phrases from a masked rationale string.

    The string is split on ``'_'`` and every piece is stripped of surrounding
    whitespace. Consecutive non-empty pieces are accumulated and joined with a
    single space into one phrase; an empty piece ends the current phrase.

    Args:
        rationale: String in which ``'_'`` marks positions that are not part
            of the rationale (presumably one ``'_'`` per masked token --
            confirm against the model's output format).

    Returns:
        List of phrase strings in order of appearance. The list is empty when
        the string contains no non-empty piece.
    """
    pieces = [piece.strip() for piece in rationale.split('_')]

    all_rationals = []
    rational_output = []  # pieces of the phrase currently being built
    for word in pieces:
        if word != '':
            rational_output.append(word)
        elif rational_output:
            # An empty piece closes the phrase that was being accumulated.
            all_rationals.append(' '.join(rational_output))
            rational_output = []

    # Flush a phrase that runs to the very end of the string; without this a
    # rationale ending in a selected word (no trailing '_') would be dropped.
    if rational_output:
        all_rationals.append(' '.join(rational_output))
    return all_rationals

In [11]:
# Parse every test example's rationale string into its list of phrases.
rationale_series = pd.Series(rationales)
rationale_words = rationale_series.apply(get_rationale_words)

In [12]:
# Distribution of phrase lengths (in words) over all extracted phrases.
# Each phrase is replaced by its word count as a string; the per-example lists
# are then flattened by joining on ':::' and splitting back into columns.
# An example with an empty phrase list contributes a single empty string.
phrase_word_counts = rationale_words.apply(
    lambda phrases: [str(len(phrase.split(' '))) for phrase in phrases]
)
count_columns = phrase_word_counts.str.join(':::').str.split(':::', expand=True)
flattened_counts = count_columns.unstack().dropna()
flattened_counts.value_counts()

1    247818
       9736
2         3
dtype: int64

In [13]:
# Assemble one frame with the extracted phrases, the gold label and the
# predicted label of every test example, aligned on the positional index.
word_list_frame = rationale_words.to_frame('word_list')
y_true_frame = pd.Series(y_true).to_frame('y_true')
y_pred_frame = pd.Series(results['test_stats']['preds']).to_frame('y_pred')
rational_df = pd.concat([word_list_frame, y_true_frame, y_pred_frame], axis=1)

Overall most popular rationales

In [14]:
# Flatten the per-example phrase lists into one Series (join on ':::' and
# split back into columns), then show the ten most frequent phrases.
joined_phrases = rational_df['word_list'].str.join(':::')
phrase_columns = joined_phrases.str.split(':::', expand=True)
all_phrases = phrase_columns.unstack().dropna()
all_phrases.value_counts().head(10)

)            39508
(            21854
,            16901
officials    12339
-             9757
              9737
:             8861
--            6782
.             5728
bush          4251
dtype: int64

Popular two-word rationales

In [15]:
# Flatten the per-example phrase lists into one Series, keep only the phrases
# made of exactly two whitespace-separated words, and show the most frequent.
joined_phrases = rational_df['word_list'].str.join(':::')
flattened_phrases = joined_phrases.str.split(':::', expand=True).unstack().dropna()
is_two_words = flattened_phrases.str.split().str.len() == 2
flattened_phrases.loc[is_two_words].value_counts().head(10)

( )      2
( nyt    1
dtype: int64

Popular rationales for y_pred = 0 (article != front page)

In [16]:
# Restrict to examples predicted as label 0, flatten their phrase lists into
# one Series, and show the fifty most frequent phrases.
predicted_zero = rational_df['y_pred'] == 0
phrases_pred_zero = rational_df.loc[predicted_zero, 'word_list']
flattened_pred_zero = (
    phrases_pred_zero.str.join(':::').str.split(':::', expand=True).unstack().dropna()
)
flattened_pred_zero.value_counts().head(50)

)                39508
(                21854
,                16901
-                 9757
:                 8861
--                6778
                  6700
.                 5728
;                 3517
11                2412
/                 2179
bloomberg         1526
18                1224
13                1034
died               995
17                 980
international      949
23                 847
14                 837
editor             829
26                 784
22                 772
9                  754
12                 748
19                 744
chief              738
15                 735
pm                 658
16                 634
street             632
10                 597
reported           595
21                 581
20                 575
may                569
a.m.               556
27                 555
28                 552
29                 529
8                  524
24                 510
25                 496
7                  496
30         

Popular rationales for y_pred = 1 (article = front page)

In [17]:
# Restrict to examples predicted as label 1, flatten their phrase lists into
# one Series, and show the ten most frequent phrases.
predicted_one = rational_df['y_pred'] == 1
phrases_pred_one = rational_df.loc[predicted_one, 'word_list']
flattened_pred_one = (
    phrases_pred_one.str.join(':::').str.split(':::', expand=True).unstack().dropna()
)
flattened_pred_one.value_counts().head(10)

officials         12339
bush               4251
administration     4174
experts            3169
workers            3110
                   3037
government         2873
leaders            1751
killed             1744
say                1522
dtype: int64