In [35]:
import pandas as pd 
import re
import ast

In [8]:
lines = []
with open('../modeling/output/edits__docs-add-sota.txt') as f:
    for line in f:
        lines.append(line.strip())

In [11]:
data_lines = list(filter(lambda x: 'num_added' in x, lines))

In [20]:
t = data_lines[0]

In [49]:
def parse_line(t):
    output = {}
    output['pred'] = int(t.split(']')[0].replace('[', ''))
    output['doc_id'] = ast.literal_eval(re.search('\(.*\)', t)[0])
    output['y_true'] = int(re.search(':(\d+)}', t)[1])
    return output

In [50]:
parsed_lines = list(map(parse_line, data_lines))

In [54]:
output_df = (
    pd.DataFrame(parsed_lines)
    .assign(y_true_bin=lambda df: pd.cut(df['y_true'], [0, 1, 5, 100], right=False))
)

In [63]:
output_df['pred'].value_counts()

1    26954
0    11509
2     1503
Name: pred, dtype: int64

In [95]:
mapper = {v:k for k,v in enumerate(output_df['y_true_bin'].value_counts().index)}

In [105]:
output_df = output_df.assign(y_true_bin_idx=lambda df: df['y_true_bin'].map(mapper))

In [106]:
output_df.pipe(lambda df: df['pred'] == df['y_true_bin_idx']).value_counts()

False    20537
True     19429
dtype: int64

In [107]:
output_df.pipe(lambda df: df['pred'] == df['y_true_bin_idx']).value_counts()

False    20537
True     19429
dtype: int64

In [109]:
from sklearn.metrics import f1_score

In [113]:
output_df.pipe(lambda df: f1_score(df['y_true_bin_idx'], df['pred'], average='macro'))

0.3508151185862447

In [114]:
output_df.pipe(lambda df: f1_score(df['y_true_bin_idx'], df['pred'], average='weighted'))

0.4703383870020178

In [115]:
output_df

Unnamed: 0,pred,doc_id,y_true,y_true_bin,y_true_bin_idx
0,1,"(1094629, 0)",5,"[5, 100)",2
1,1,"(1374375, 0)",0,"[0, 1)",0
2,1,"(4439, 0)",0,"[0, 1)",0
3,1,"(798964, 0)",0,"[0, 1)",0
4,0,"(593948, 2)",0,"[0, 1)",0
...,...,...,...,...,...
39961,1,"(339164, 0)",9,"[5, 100)",2
39962,1,"(340687, 0)",0,"[0, 1)",0
39963,1,"(546632, 1)",0,"[0, 1)",0
39964,1,"(336586, 2)",1,"[1, 5)",1


# See which categories score especially high

In [128]:
text_data = pd.read_csv('../modeling/data/doc-eval__add-balanced-large.csv')
text_data['sentence_split'] = text_data['sentences'].str.split('<SENT>')
text_data['num_sents'] = text_data['sentence_split'].str.len()

In [126]:
text_data['sentences']

0        D.C. police are investigating the death of an ...
1        President Donald Trump will visit Israel, the ...
2        WASHINGTON (AP) -- A U.S. counterterrorism air...
3        The Queen has officially opened the new PS2.5b...
4        Elephants at the Berlin zoo have finally been ...
                               ...                        
39995    Retailer JD Sports Fashion has reported a shar...
39996    Rachel Reeves will replace Liam Byrne as shado...
39997    A father who killed his six-year-old son by ju...
39998    Plans to increase free pre-school childcare ar...
39999    Asian markets were little changed as President...
Name: sentences, Length: 40000, dtype: object

In [130]:
text_data['num_sents']

0        6
1        8
2        8
3        6
4        7
        ..
39995    8
39996    9
39997    7
39998    7
39999    6
Name: num_sents, Length: 40000, dtype: int64

In [137]:
text_data

Unnamed: 0,entry_id,version,num_deleted,num_added,num_edited,num_refactored,sentences,sentence_split,num_sents
0,1094629,0,2,5,2,0,D.C. police are investigating the death of an ...,[D.C. police are investigating the death of an...,6
1,1374375,0,1,0,2,0,"President Donald Trump will visit Israel, the ...","[President Donald Trump will visit Israel, the...",8
2,4439,0,0,0,1,0,WASHINGTON (AP) -- A U.S. counterterrorism air...,[WASHINGTON (AP) -- A U.S. counterterrorism ai...,8
3,798964,0,0,0,1,0,The Queen has officially opened the new PS2.5b...,[The Queen has officially opened the new PS2.5...,6
4,593948,2,1,0,0,0,Elephants at the Berlin zoo have finally been ...,[Elephants at the Berlin zoo have finally been...,7
...,...,...,...,...,...,...,...,...,...
39995,385889,0,1,4,3,0,Retailer JD Sports Fashion has reported a shar...,[Retailer JD Sports Fashion has reported a sha...,8
39996,691830,2,0,2,0,0,Rachel Reeves will replace Liam Byrne as shado...,[Rachel Reeves will replace Liam Byrne as shad...,9
39997,223422,0,0,1,1,0,A father who killed his six-year-old son by ju...,[A father who killed his six-year-old son by j...,7
39998,631907,0,3,3,4,0,Plans to increase free pre-school childcare ar...,[Plans to increase free pre-school childcare a...,7


In [146]:
output_df = (output_df
 .assign(entry_id=lambda df: df['doc_id'].str.get(0))
 .assign(version=lambda df: df['doc_id'].str.get(1))
 .assign(correct=lambda df: df['pred'] == df['y_true_bin_idx'])
)

In [320]:
full_data_df = (text_data[['entry_id', 'version', 'sentences', 'num_sents']]
 .merge(
     output_df[['entry_id', 'version', 'correct', 'pred', 'y_true_bin_idx']],
     left_on=['entry_id', 'version'],
     right_on=['entry_id', 'version']
   )
)

In [365]:
print((output_df
 .groupby('y_true_bin')
 [['pred', 'y_true_bin_idx']]
 .aggregate(list)
 .apply(lambda x: f1_score(x['pred'], x['y_true_bin_idx'], average='weighted'), axis=1)
 .pipe(lambda s: s * 100).round(1)
 .to_latex()
))

\begin{tabular}{lr}
\toprule
{} &     0 \\
y\_true\_bin &       \\
\midrule
[0, 1)     &  16.2 \\
[1, 5)     &  59.7 \\
[5, 100)   &   0.9 \\
\bottomrule
\end{tabular}



In [322]:
full_data_df[['num_sents', 'correct']].corr()

Unnamed: 0,num_sents,correct
num_sents,1.0,0.082865
correct,0.082865,1.0


In [323]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [324]:
lda = LatentDirichletAllocation(n_components=20)

In [325]:
full_data_df['sentences_train'] = full_data_df['sentences'].str.replace('<SENT>', ' ')
cv = CountVectorizer(stop_words='english', max_df=.5, min_df=.005)
counts = cv.fit_transform(full_data_df['sentences_train'])

In [None]:
theta_vecs = lda.fit_transform(counts)

In [328]:
v = pd.Series(cv.vocabulary_).sort_values().index
beta_mat = pd.DataFrame(lda.components_, columns=v)
topics = {}
for t in beta_mat.T.columns:
    topics['topic %s' % t] = beta_mat.T[t].sort_values(ascending=False).iloc[:10].index.tolist()

In [329]:
top_words_df = pd.DataFrame(topics)

In [330]:
full_data_df['topic'] = pd.Series(theta_vecs.argmax(axis=1)).to_frame('topic')

In [331]:
top_topics = full_data_df['topic'].value_counts().loc[lambda s: s > 2000].drop(4)

In [332]:
top_topics

13    4111
5     3507
9     3051
2     2747
6     2606
7     2383
12    2355
18    2308
0     2038
Name: topic, dtype: int64

In [333]:
top_topics_idx = top_topics.index

In [334]:
top_topics_idx.shape 

(9,)

In [335]:
top_topics_str = list(map(lambda s: 'topic %s' % s, top_topics_idx.sort_values()))

In [366]:
top_words_df[top_topics_str].iloc[:10]

Unnamed: 0,topic 0,topic 2,topic 5,topic 6,topic 7,topic 9,topic 12,topic 13,topic 18
0,care,complaint,seats,away,radical,better,highest,instead,terrorism
1,iraqi,built,investigating,users,august,criticised,euros,names,airport
2,seen,58,2018,weapon,nuclear,badly,click,character,apartment
3,elected,suspected,established,produced,student,sheffield,firms,congress,tough
4,complete,lake,think,meetings,reporting,soldiers,vulnerable,interior,command
5,radio,search,soon,killed,profile,pair,brigade,tribute,claimed
6,area,appeared,spoke,individuals,missed,questioned,isis,path,urged
7,visiting,grounds,unlikely,fall,safe,seat,argued,corporate,ftse
8,hurt,error,gone,modern,resort,supermarket,plant,belgium,appropriate
9,firefighters,shots,egyptian,ali,regions,individual,accounts,art,declared


In [336]:
print(top_words_df[top_topics_str].iloc[:10].to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
     topic 0 &   topic 2 &       topic 5 &     topic 6 &   topic 7 &     topic 9 &   topic 12 &  topic 13 &    topic 18 \\
\midrule
        care & complaint &         seats &        away &   radical &      better &    highest &   instead &   terrorism \\
       iraqi &     built & investigating &       users &    august &  criticised &      euros &     names &     airport \\
        seen &        58 &          2018 &      weapon &   nuclear &       badly &      click & character &   apartment \\
     elected & suspected &   established &    produced &   student &   sheffield &      firms &  congress &       tough \\
    complete &      lake &         think &    meetings & reporting &    soldiers & vulnerable &  interior &     command \\
       radio &    search &          soon &      killed &   profile &        pair &    brigade &   tribute &     claimed \\
        area &  appeared &         spoke & individuals &    missed &  questioned &       isis 

In [341]:
full_data_df.head(2)

Unnamed: 0,entry_id,version,sentences,num_sents,correct,pred,y_true_bin_idx,sentences_train,topic
0,1094629,0,D.C. police are investigating the death of an ...,6,False,1,2,D.C. police are investigating the death of an ...,18
1,1374375,0,"President Donald Trump will visit Israel, the ...",8,False,1,0,"President Donald Trump will visit Israel, the ...",0


In [358]:
(full_data_df
     .groupby('topic')[['pred', 'y_true_bin_idx']]
     .aggregate(list)
     .apply(lambda x: f1_score(x['pred'], x['y_true_bin_idx'], average='weighted'), axis=1)
     .loc[top_topics.index]
     .sort_values(ascending=False)
     .pipe(lambda s: s* 100).round(1)
)

13    66.8
18    61.8
9     58.3
6     56.8
5     54.0
7     52.6
2     50.4
12    48.4
0     38.1
dtype: float64

In [237]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [None]:
train_test_split()

In [238]:
X_train, X_test, y_train, y_test = train_test_split(counts, full_data_df['correct'], test_size=0.1)

In [239]:
lr = LogisticRegressionCV(max_iter=10000)

In [240]:
lr.fit(X_train, y_train)

LogisticRegressionCV(max_iter=10000)

In [242]:
y_pred = lr.predict(X_test)

In [243]:
f1_score(y_test, y_pred)

0.5032661175802328

In [245]:
(y_test == y_pred).mean()

0.5624218163622717

In [256]:
pd.Series(lr.coef_[0], index=v).sort_values().iloc[:30]

investigation   -0.074995
president       -0.071079
statement       -0.057633
economy         -0.051985
thursday        -0.049940
editing         -0.048194
modified        -0.047253
monday          -0.046354
charges         -0.045667
mon             -0.044446
trump           -0.044306
court           -0.044103
nuclear         -0.043478
letters         -0.043239
price           -0.040826
warned          -0.040133
washington      -0.039905
step            -0.039183
months          -0.039172
bst             -0.038866
general         -0.038538
crimes          -0.038289
iran            -0.037819
reporting       -0.037671
nations         -0.037233
nov             -0.035905
support         -0.035694
2017            -0.035518
dead            -0.035299
sharing         -0.035195
dtype: float64

In [257]:
pd.Series(lr.coef_[0], index=v).sort_values().iloc[-30:]

story         0.035643
animal        0.035701
research      0.036971
set           0.037442
follows       0.039378
say           0.039549
video         0.039750
20            0.039914
closed        0.040688
address       0.040777
really        0.040954
war           0.041208
subscribe     0.041267
world         0.041975
sign          0.042221
brown         0.043239
photo         0.043668
twitter       0.047085
confirmed     0.048771
just          0.049882
enter         0.052065
scene         0.055275
press         0.057614
inbox         0.060993
film          0.063221
morning       0.070107
delivered     0.086384
editors       0.086730
commentary    0.087037
picks         0.089149
dtype: float64