In [21]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np
import os

import matplotlib.patches as patches


def add_time_to_date(date, hour_diff):
    return datetime.datetime.combine(date, datetime.datetime.min.time())+datetime.timedelta(hours=hour_diff)


### ! need to change how displaying post window works
def plot_sim_timeseries(p_t, delta, pre, post, show_pre_post, show_means, show_delta, 
                            n_dp=0, include_day_before_in_post=False, alpha=0.3):
        # fig, ax = plt.subplots(figsize=(10,5))
        fig, ax = plt.subplots(figsize=(10,5))

        # basic plot of points
        ax.plot(p_t.index, p_t.values, c='k', marker='o')

        ax.set_xlabel('date')
        ax.set_ylabel('no. similar tweets')

        # pre post regions
        if show_pre_post:
                # shade all post region
                ax.axvspan(add_time_to_date(post.index[0], -12),
                        add_time_to_date(post.index[-1],  12),
                        alpha=alpha, label='post', color='tab:blue')

                if include_day_before_in_post:
                        release_day = post.index[1]
                else:
                        release_day = post.index[0]
                
                print(release_day)

                ax.axvspan(add_time_to_date(release_day, -12),
                        add_time_to_date(release_day,  12),
                        alpha=alpha, color='tab:blue')
                
                # if include_day_before_in_post: # if we're including the day before the release date in the post window
                #         if len(post) > 1:
                #                 ax.axvspan(add_time_to_date(post.index[1], -12),
                #                         add_time_to_date(post.index[-1],  12),
                #                         alpha=0.4, label='post', color='tab:blue')

                #         ax.axvspan(add_time_to_date(post.index[0], -12),
                #                 add_time_to_date(post.index[0],  12),
                #                 alpha=0.6, label='post (release date)', color='tab:blue')

                # else: # if we're not (i.e. day before release date is in the pre window)
                #         if len(post) > 1:
                #                 ax.axvspan(add_time_to_date(post.index[1], -12),
                #                         add_time_to_date(post.index[-1],  12),
                #                         alpha=0.4, label='post', color='tab:blue')

                #         ax.axvspan(add_time_to_date(post.index[0], -12),
                #                 add_time_to_date(post.index[0],  12),
                #                 alpha=0.6, label='post (release date)', color='tab:blue')

                # whether the day before the release date is in the pre window doesn't effect how we display pre window
                # only matters for the post window (different colour)
                ax.axvspan(add_time_to_date(pre.index[0], 0),
                        add_time_to_date(pre.index[-1],  12),
                        alpha=alpha, label='pre', color='tab:orange')

        if show_means:
                ax.plot([add_time_to_date(post.index[0], -12), add_time_to_date(post.index[-1], 12)],
                        [post.mean(), post.mean()],
                        c='tab:blue', linestyle='--', label='post mean')

                ax.plot([add_time_to_date(pre.index[0], 0), add_time_to_date(pre.index[-1], 12)],
                        [pre.mean(), pre.mean()],
                        c='tab:orange', linestyle='--', label='pre mean')

        # doesn't make sense to show delta without means
        if show_means and show_delta:
                arrow = patches.FancyArrowPatch((add_time_to_date(post.index[0],-12), pre.mean()), (add_time_to_date(post.index[0],-12), post.mean()), 
                                                arrowstyle='<->', mutation_scale=20, color='r')
                ax.add_artist(arrow)
                # ax.annotate('test', xy=(.46, .3), xycoords='figure fraction', color='r')
                ax.annotate(rf'$\Delta$={np.around(delta, n_dp)}', xy=(add_time_to_date(post.index[0],-10), (pre.mean()+post.mean())/2 ), color='r', fontsize=12)

        ax.set_ylim(0,)

        plt.legend()
        return fig, ax

In [3]:
metadata_fname = 'press_releases_embedded.pkl'
metadata = pd.read_pickle(metadata_fname)

In [4]:
fnames = [f for f in os.listdir() if f.endswith('pkl') and f != metadata_fname]

pr_id_to_fname = {int(f.split('.')[0].split('_')[1]) : f for f in fnames} # string id ('137') : fname ('pr_137.pkl')
title_to_pr_id = {metadata.loc[i, 'title'] : i for i in pr_id_to_fname.keys()}

In [25]:
pre_n=7
post_n=3
include_day_before_in_post=False

t = 0.7
# data_t = data.loc[data.sim > t]
# n_t = data_t.date.value_counts(dropna=False).reindex(pd.Index(data.date.unique()), fill_value=0)

# delta, pre, post = get_delta(n_t, release_date=pd.to_datetime('2020-12-12').date(), 
#                             pre_n=pre_n, post_n=post_n, include_day_before_in_post=include_day_before_in_post)

# fig, ax = plot_sim_timeseries(n_t, delta, pre, post, show_pre_post=True, show_means=True, show_delta=True, 
#                             n_dp=0, include_day_before_in_post=True, alpha=0.2)

In [22]:
def get_delta(s, # pandas series of relevant values (p_t / mean_sim / n_t)
    release_date, #datetime.datetime.date() of release
    pre_n, post_n, # n days in pre/post regions (release date included in post region)
    include_day_before_in_post=False
    ):
    
    text_date_ind = s.index.get_loc(release_date)

    if include_day_before_in_post:
        pre = s.iloc[text_date_ind-1-pre_n : text_date_ind-1]
        post = s.iloc[text_date_ind-1 : text_date_ind-1+post_n]
    else:
        pre = s.iloc[text_date_ind-pre_n : text_date_ind]
        post = s.iloc[text_date_ind : text_date_ind+post_n]

    # print(len(pre), len(post))

    delta = post.mean() - pre.mean()
    return delta, pre, post

In [28]:
deltas = pd.Series([], name='deltas')

for i, fname in enumerate(fnames):
    data = pd.read_pickle(fname).set_index('tweet_id')
    data_t = data.loc[data.sim > t]
    timeseries = data_t.date.value_counts().reindex(pd.Index(data.date.unique()), fill_value=0)

    # print(timeseries.index[0])

    pr_id = int(fname.split('.')[0].split('_')[1])

    # st.write(fname)
    # st.write(timeseries)
    # st.write(pr_id)
    # st.write(str(metadata.loc[pr_id]))

    delta, pre, post = get_delta(timeseries, release_date=metadata.loc[int(pr_id), 'date'].date(), 
                            pre_n=pre_n, post_n=post_n, include_day_before_in_post=include_day_before_in_post)

    # print(delta, pre.mean(), post.mean())

    deltas[pr_id] = delta

  deltas = pd.Series([], name='deltas')


In [31]:
metadata['delta'] = deltas

In [32]:
metadata

Unnamed: 0,title,org,date,url,text,sents,sentence_embeddings,embedding,delta
0,mystery sun magnetic waves,uoe,2019-12-02,https://www.exeter.ac.uk/news/archive/2019/dec...,Scientists crack 60-year-old mystery of Sun’s ...,[Scientists crack 60-year-old mystery of Sun’s...,"[[-0.10568396, 0.059343025, 0.05864158, 0.0884...","[-0.033898428, 0.026834546, 0.030490704, 0.062...",
1,killer whale grandmothers,uoe,2019-12-09,https://www.exeter.ac.uk/news/archive/2019/dec...,Killer whale grandmothers boost survival of ca...,[Killer whale grandmothers boost survival of c...,"[[-0.0056566903, 0.004876952, 0.056675207, 0.0...","[0.008576699, 0.020933418, 0.038221776, 0.0601...",
2,cop25 politics polluted,gp,2019-12-15,https://www.greenpeace.org/international/press...,COP25: The politics are polluted Greenpeace In...,[COP25: The politics are polluted Greenpeace I...,"[[-0.0134424325, 0.020380098, 0.055334758, 0.0...","[0.011437814, 0.027482465, 0.03378036, 0.03082...",395.809524
3,surface clean tech,uoe,2020-08-04,https://www.exeter.ac.uk/news/archive/2020/aug...,Surface clean-up technology won't solve ocean ...,[Surface clean-up technology won't solve ocean...,"[[-0.096479684, 0.067017205, 0.16345368, -0.07...","[-0.044458717, 0.064701036, 0.08490751, -0.004...",
4,teletext court action,uk_gov,2021-04-30,https://www.gov.uk/government/news/teletext-fa...,Teletext faces court action unless it pays ove...,[Teletext faces court action unless it pays ov...,"[[-0.0065774512, 0.006286746, 0.08180707, -0.0...","[-0.014733358, 0.020859087, 0.048258767, 0.010...",
...,...,...,...,...,...,...,...,...,...
171,sharks rays extinction,uoe,2021-01-27,https://www.exeter.ac.uk/news/archive/2021/jan...,Ocean sharks and rays threatened with extincti...,[Ocean sharks and rays threatened with extinct...,"[[-0.013585791, 0.031706974, 0.00849287, 0.062...","[-0.0058359383, 0.025509706, 0.029808464, 0.03...",
172,verdict against shell,gp,2021-05-26,https://www.greenpeace.org/international/press...,Historic verdict in climate case against Shell...,[Historic verdict in climate case against Shel...,"[[0.025918443, 0.1768673, 0.100390956, 0.01325...","[0.005138757, 0.0810865, 0.048456483, 0.014504...",2210.285714
173,ebola guinea,uk_gov,2021-02-26,https://www.gov.uk/government/news/uk-statemen...,UK statement on the re-emergence of Ebola in G...,[UK statement on the re-emergence of Ebola in ...,"[[0.046359424, 0.03916658, 0.029954178, -0.029...","[0.034800135, 0.010851145, 0.014420311, -0.023...",
174,election manifesto lengths,uoe,2019-11-21,https://www.exeter.ac.uk/news/archive/2019/nov...,Growing length of manifestos casts new light o...,[Growing length of manifestos casts new light ...,"[[-0.0018704333, 0.044737127, 0.051276054, -0....","[0.033707112, 0.023435311, -0.004913721, -0.03...",


In [29]:
deltas

137     7595.428571
172     2210.285714
2        395.809524
33       185.428571
77     20233.761905
8        581.238095
Name: deltas, dtype: float64