In [1]:
import sys
# Add the parent directory to the import path so the project-local `util`
# package (imported in the next cell) can be resolved from this notebook.
sys.path.append('../')

In [73]:
import util.util_newssniffer_parsing as unp
import pandas as pd
import sqlite3  # NOTE(review): not used in any cell visible here — confirm before removing
from importlib import reload
from IPython.display import display, HTML
# Re-execute the helper module so that edits made to it on disk are picked up
# by this kernel session.
reload(unp)

<module 'util.util_newssniffer_parsing' from '../util/util_newssniffer_parsing.py'>

In [135]:
# Columns that identify one (article, old version, new version) comparison.
# These keys are used to de-duplicate `matched_versions_df` and to join the
# result against the per-pair summary, which is grouped by exactly these three
# columns. A sentence-level key such as `sent_idx_x` does not exist in that
# summary, so using it here makes the join fail on a fresh Restart & Run All.
join_keys_matched = ['entry_id', 'version_x', 'version_y']

# Read in Data

In [4]:
# Load the three cached inputs. The reads are independent of one another.
# NOTE(review): unpickling can execute arbitrary code, so only load cache files
# that were produced by this project.
headline_df = pd.read_pickle('df_nyt__headlines.pkl')
matched_versions_df = pd.read_pickle('2021-05-03__nyt-spark-matched-data-cache.pkl')
sent_df = pd.read_pickle('df_nyt__start_0__end_20000__num_1.pkl', compression='gzip')

# One row per distinct combination of the join keys found in the matched data.
matched_keys = matched_versions_df.loc[:, join_keys_matched].drop_duplicates()

In [116]:
## Average the sentence distances over every sentence of each version pair, so
## that pairs with no changes at all (mean of exactly 0) can be found later.
## Added and removed sentences have no distance (NaN); they are filled with 1
## so that they count as a high change value instead of being skipped.
matched_means = (
    matched_versions_df
    .fillna({'avg_sentence_distance_x': 1, 'avg_sentence_distance_y': 1})
    .groupby(['entry_id', 'version_x', 'version_y'])
    [['avg_sentence_distance_x', 'avg_sentence_distance_y']]
    .mean()          # per-pair mean of each distance column
    .mean(axis=1)    # then the mean of the two column means
    .rename('mean_score')
    .reset_index()
)

In [117]:
# Attach each key's mean score, keep only the keys whose score is non-zero,
# then drop the helper column again.
# TODO: find out why version pairs with no changes at all exist in the data.
matched_keys = (
    matched_keys
    .merge(matched_means, on=join_keys_matched)
    .loc[lambda merged: merged['mean_score'].ne(0)]
    .drop(columns='mean_score')
)

In [5]:
# Preview the sentence table (first three rows).
sent_df.head(3)

Unnamed: 0,entry_id,version,sent_idx,sentence
0,777889,0,0,"PARIS — He sat in the car, frozen with fear, a..."
1,777889,0,1,They were rushing to the king’s birthday party...
2,777889,0,2,"It was the summer of 1971, and the Moroccan Ar..."


In [37]:
# Preview the headline table (first three rows).
headline_df.head(3)

Unnamed: 0,entry_id,version,title
0,547988,0,Activist Challenges a Sweeping Revision in Pat...
1,547988,1,Inventor Challenges a Sweeping Revision in Pat...
2,547989,0,U.S. Foreign Arms Sales Reach $66.3 Billion in...


In [8]:
# Preview the matched-sentence table (first two rows).
matched_versions_df.head(2)

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
0,704716,4,5,8.0,8.0,0.136447,0.136447
1,694571,6,7,,8.0,,


In [11]:
# Preview the de-duplicated key table (first three rows).
matched_keys.head(3)

Unnamed: 0,entry_id,version_x,version_y
0,704716,4,5
1,694571,6,7
2,703260,0,1


# Find a sample article with several version changes and a headline change

In [140]:
# Look up the headline on both sides of every version pair. The first join
# matches on `version_x` and the second on `version_y`; because both joins bring
# in a `title` column, the second merge suffixes them to `title_x` (headline of
# `version_x`) and `title_y` (headline of `version_y`). The lookup's own
# `version` column is redundant after each join and is dropped.
matched_headlines = (
    matched_keys
    .pipe(
        pd.merge,
        headline_df,
        left_on=['entry_id', 'version_x'],
        right_on=['entry_id', 'version'],
    )
    .drop(columns='version')
    .pipe(
        pd.merge,
        headline_df,
        left_on=['entry_id', 'version_y'],
        right_on=['entry_id', 'version'],
    )
    .drop(columns='version')
)

In [146]:
# entry_id of every version pair whose two headlines differ, as an array
# (an entry_id is repeated if several of its pairs changed the headline).
entry_ids_with_changed_heads = (
    matched_headlines
    .loc[lambda heads: heads['title_x'].ne(heads['title_y']), 'entry_id']
    .values
)

In [166]:
## Pick one sample article: among the entry_ids that occur exactly 3 times in
## `matched_keys` and have at least one headline change, take the second
## candidate (position 1 of the filtered index).

sample_entry_id = (
    matched_keys['entry_id']
    .value_counts()
    .loc[lambda counts: (counts == 3) & counts.index.isin(entry_ids_with_changed_heads)]
    .index[1]
)

# View One Article through multiple versions 

In [167]:
# compare sentences: for the sample article, attach the sentence text of both
# sides of every matched row. Left joins keep rows where one side has no
# sentence index. Both joins bring in a `sentence` column, so the second merge
# suffixes them to `sentence_x` / `sentence_y`.
comp_sents = (
    matched_versions_df
    .loc[lambda pairs: pairs['entry_id'].eq(sample_entry_id)]
    .pipe(
        pd.merge,
        sent_df,
        how='left',
        left_on=['entry_id', 'version_x', 'sent_idx_x'],
        right_on=['entry_id', 'version', 'sent_idx'],
    )
    .drop(columns=['version', 'sent_idx'])
    .pipe(
        pd.merge,
        sent_df,
        how='left',
        left_on=['entry_id', 'version_y', 'sent_idx_y'],
        right_on=['entry_id', 'version', 'sent_idx'],
    )
    .drop(columns=['version', 'sent_idx'])
    .sort_values(by=['sent_idx_x', 'sent_idx_y'])
)

# compare headlines: pair every headline version of the sample article with
# every other one (a self-join on `entry_id`), producing `version_x`/`title_x`
# and `version_y`/`title_y` columns.
comp_heads = (
    headline_df
    .loc[lambda heads: heads['entry_id'].eq(sample_entry_id)]
    .pipe(lambda sample_heads: sample_heads.merge(sample_heads, on=['entry_id']))
)

In [168]:
# Render the side-by-side comparison with the project helper.
# NOTE(review): the helper's source is not visible here; the cells below index
# the result with [0], [1] and [2], so it presumably returns one HTML string
# per version pair — confirm against util_newssniffer_parsing.
html_outputs = unp.html_compare_sentences_new(
    comp_sents, 
    head_comp_df=comp_heads,
    sort_by='old',
    include_headline=True, 
)

In [169]:
# Show the first rendered comparison table.
display(HTML(html_outputs[0]))

SIdx Old,Old Version,New Version,SIdx New,d(X),d(Y)
HEAD,Houthis ’ Absence Delays Yemen Cease - Fire Talks,"U.N. Leader Calls Yemen a ‘ Ticking Bomb , ’ Amid Delay in Cease - Fire Talks",HEAD,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,"GENEVA — United Nations efforts to start talks aimed at achieving at least a cease - fire in Yemen were further delayed on Monday because members of the Houthi rebel group battling the Saudi - backed government of the president in exile , Abdu Rabbu Mansour Hadi , had not arrived in Geneva , where the talks were due to be held .",,,,
1.0,"In Geneva on Monday , Secretary General Ban Ki - moon met representatives of Yemen ’s ousted government and ambassadors from the so - called Group of 16 countries that is closely monitoring developments in Yemen , Ahmad Fawzi , a spokesman for the United Nations in Geneva , said .",,,,
2.0,"Mr. Ban “ is very concerned about the humanitarian situation in Yemen in particular and is calling for a humanitarian pause , ” Mr. Fawzi said , but he added that a formal start to what the United Nations is calling “ preliminary inclusive consultations ” would have to await the arrival of the Houthi delegation .",,,,
3.0,The consultations had been due to start on Sunday but were delayed when the Houthi delegation declined to board the aircraft that was supposed to take them to Geneva .,,,,
4.0,"The delegation , reported by The Associated Press to include Houthi leaders and representatives of former President Ali Abdullah Saleh , eventually left the Yemeni capital , Sana , on Sunday and flew to Djibouti , but the onward journey to Geneva was delayed by what Mr. Fawzi said was “ a technical glitch ” linked to refueling and flight crew schedules .",,,,
5.0,"As Mr. Ban started his meetings in Geneva , Mr. Fawzi said it was unclear whether the Houthi delegation had left Djibouti .",,,,
6.0,"The United Nations is hoping the consultations , which are due to start with “ proximity talks ” in which the delegations remain in separate rooms , will lead to a humanitarian pause to allow aid deliveries to the population and a withdrawal of fighting forces from cities .",,,,
7.0,"But as delegates and diplomats assembled in Geneva at the weekend , fierce fighting continued in many areas of Yemen , an impoverished nation of 26 million .","As delegates and diplomats assembled in Geneva over the weekend , fierce fighting continued in many areas of Yemen , a nation of 26 million .",12.0,0.26,0.26
8.0,"Airstrikes by the Saudi - led coalition continued at the weekend against rebel positions , including in the northern province of Saada , the central city of Dhamar and in the province of Jawf in the north , where the Houthis took control of the provincial capital on Sunday .","Airstrikes by the Saudi - led coalition continued against rebel positions , including in the northern province of Saada , the central city of Dhamar and in the province of Jawf in the northwest , where the Houthis took control of the provincial capital on Sunday .",13.0,0.3,0.3
9.0,"The United Nations high commissioner for human rights , Zeid Ra’ad al - Hussein , underscored the urgent need for a cease - fire to ease the plight of civilians facing what aid agencies have warned is a “ catastrophic ” humanitarian situation .","The United Nations High Commissioner for Human Rights , Zeid Ra’ad al - Hussein , highlighted the urgent need for a cease - fire to ease the plight of civilians facing what aid agencies have warned is a “ catastrophic ” humanitarian situation .",14.0,0.2,0.2


In [170]:
# Show the second rendered comparison table.
display(HTML(html_outputs[1]))

SIdx Old,Old Version,New Version,SIdx New,d(X),d(Y)
HEAD,"U.N. Leader Calls Yemen a ‘ Ticking Bomb , ’ Amid Delay in Cease - Fire Talks","U.N. Leader Calls Yemen a ‘ Ticking Bomb , ’ Amid Delay in Cease - Fire Talks",HEAD,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,"GENEVA — The United Nations secretary general , Ban Ki - moon , warned that Yemen was a “ ticking bomb , ” as efforts to start talks aimed at achieving at least a cease - fire in the country were further delayed on Monday .","GENEVA — The United Nations secretary general , Ban Ki - moon , warned that Yemen was a “ ticking bomb , ” as efforts to start talks aimed at achieving at least a cease - fire in the country were further delayed on Monday .",0,0.0,0.0
1.0,"Mr. Ban had stopped off in Geneva to attend the start of what the United Nations has called “ preliminary inclusive consultations , ” originally intended to start on Sunday , between the Houthi rebel group and the Saudi - backed government of President Abdu Rabbu Mansour Hadi , who is in exile .","Mr. Ban had stopped off in Geneva to attend the start of what the United Nations has called “ preliminary inclusive consultations , ” originally intended to start on Sunday , between the Houthi rebel group and the Saudi - backed government of President Abdu Rabbu Mansour Hadi , who is in exile .",1,0.0,0.0
2.0,"Mr. Ban met separately on Monday with representatives of the ousted government and ambassadors of the so - called Group of 16 countries that are closely monitoring developments in Yemen , but he had to leave for New York without seeing the Houthi delegation after delays on their journey to Geneva .","Mr. Ban met separately on Monday with representatives of the ousted government and ambassadors of the so - called Group of 16 countries that are closely monitoring developments in Yemen , but he had to leave for New York without seeing the Houthi delegation after delays on their journey to Geneva .",2,0.0,0.0
3.0,"The 23-member Houthi - led delegation , which was understood to also include representatives of former President Ali Abdullah Saleh , initially refused to board the aircraft that was supposed to bring them to Geneva .","The 23-member Houthi - led delegation , which was understood to also include representatives of former President Ali Abdullah Saleh , initially refused to board the aircraft that was supposed to bring them to Geneva .",3,0.0,0.0
4.0,"When they eventually left Sana on Sunday , the flight was delayed in Djibouti for what Mr. Ban said were logistical problems .","When they eventually left Yemen ’s capital , Sana , on Sunday , the flight was delayed in Djibouti because of what Mr. Ban said were logistical problems .",4,0.26,0.26
5.0,"Mr. Ban told reporters before leaving that the Houthi delegation was scheduled to reach Geneva later in the day , and he warned the parties that quick work toward an agreement was imperative .","Mr. Ban told reporters before leaving that the Houthi delegation was scheduled to reach Geneva later in the day , and he warned the parties that quick work toward an agreement was imperative .",5,0.0,0.0
6.0,“ Yemen ’s very existence hangs in the balance .,“ Yemen ’s very existence hangs in the balance .,6,0.0,0.0
7.0,"While parties bicker , Yemen burns , ” he told reporters .","While parties bicker , Yemen burns , ” he told reporters .",7,0.0,0.0
8.0,“ We do n’t have a minute to lose .,“ We do n’t have a minute to lose .,8,0.0,0.0
9.0,The ticking clock is not a timepiece it ’s a ticking bomb .,The ticking clock is not a timepiece it ’s a ticking bomb .,9,0.0,0.0


In [171]:
# Show the third rendered comparison table.
display(HTML(html_outputs[2]))

SIdx Old,Old Version,New Version,SIdx New,d(X),d(Y)
HEAD,"U.N. Leader Calls Yemen a ‘ Ticking Bomb , ’ Amid Delay in Cease - Fire Talks","U.N. Leader Calls Yemen a ‘ Ticking Bomb , ’ Amid Delay in Cease - Fire Talks",HEAD,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,"GENEVA — The United Nations secretary general , Ban Ki - moon , warned that Yemen was a “ ticking bomb , ” as efforts to start talks aimed at achieving at least a cease - fire in the country were further delayed on Monday .","GENEVA — The United Nations secretary general , Ban Ki - moon , warned that Yemen was a “ ticking bomb , ” as efforts to start talks aimed at achieving at least a cease - fire in the country were further delayed on Monday .",0,0.0,0.0
1.0,"Mr. Ban had stopped off in Geneva to attend the start of what the United Nations has called “ preliminary inclusive consultations , ” originally intended to start on Sunday , between the Houthi rebel group and the Saudi - backed government of President Abdu Rabbu Mansour Hadi , who is in exile .","Mr. Ban had stopped off in Geneva to attend the start of what the United Nations has called “ preliminary inclusive consultations , ” originally intended to start on Sunday , between the Houthi rebel group and the Saudi - backed government of President Abdu Rabbu Mansour Hadi , who is in exile .",1,0.0,0.0
2.0,"Mr. Ban met separately on Monday with representatives of the ousted government and ambassadors of the so - called Group of 16 countries that are closely monitoring developments in Yemen , but he had to leave for New York without seeing the Houthi delegation after delays on their journey to Geneva .","Mr. Ban met separately on Monday with representatives of the ousted government and ambassadors of the so - called Group of 16 countries that are closely monitoring developments in Yemen , but he had to leave for New York without seeing the Houthi delegation after delays on their journey to Geneva .",2,0.0,0.0
3.0,"The 23-member Houthi - led delegation , which was understood to also include representatives of former President Ali Abdullah Saleh , initially refused to board the aircraft that was supposed to bring them to Geneva .","The 23-member Houthi - led delegation , which was understood to also include representatives of former President Ali Abdullah Saleh , initially refused to board the aircraft that was supposed to bring them to Geneva .",3,0.0,0.0
4.0,"When they eventually left Yemen ’s capital , Sana , on Sunday , the flight was delayed in Djibouti because of what Mr. Ban said were logistical problems .","When they eventually left Yemen ’s capital , Sana , on Sunday , the flight was delayed in Djibouti because of what Mr. Ban said were logistical problems .",4,0.0,0.0
5.0,"Mr. Ban told reporters before leaving that the Houthi delegation was scheduled to reach Geneva later in the day , and he warned the parties that quick work toward an agreement was imperative .","Mr. Ban told reporters before leaving that the Houthi delegation was scheduled to reach Geneva later in the day , and he warned the parties that quick work toward an agreement was imperative .",5,0.0,0.0
6.0,“ Yemen ’s very existence hangs in the balance .,“ Yemen ’s very existence hangs in the balance .,6,0.0,0.0
7.0,"While parties bicker , Yemen burns , ” he told reporters .","While parties bicker , Yemen burns , ” he told reporters .",7,0.0,0.0
8.0,“ We do n’t have a minute to lose .,“ We do n’t have a minute to lose .,8,0.0,0.0
9.0,The ticking clock is not a timepiece it ’s a ticking bomb .,The ticking clock is not a timepiece it ’s a ticking bomb .,9,0.0,0.0
