In [42]:
# Path bootstrap for the notebook; runs before any project-local import below.
# NOTE(review): presumably makes packages in the parent directory (e.g. `utils`)
# importable from here -- confirm against notebook_utils.
from notebook_utils import resolve_paths_from_parent_directory
resolve_paths_from_parent_directory()
# auto reload notebook deps
%reload_ext autoreload
%autoreload 2

In [43]:
import json
import pandas as pd

# Registry of the datasets this notebook can label. Each entry holds:
#   input_location  - JSON file with the raw tweet objects to read
#   output_location - JSON file the labeled tweets are exported to
#   spike_start / spike_end - bounds of the spike window; both strings are
#       parsed below and interpreted as UTC
DATASETS = {
    "CentralParkNYC": {
        "input_location": "../data/tweet_objects/CentralParkNYC/2021-01-27-2021-02-06.json",
        "output_location": "../data/labeled_datasets/CentralParkNYC-2021-01-27-2021-02-06.json",
        "spike_start": "Feb 1, 2021, 17:00",
        "spike_end": "Feb 3, 2021, 04:00",
    },
    "united": {
        "input_location": "../data/tweet_objects/united/2020-12-05-2020-12-15.json",
        "output_location": "../data/labeled_datasets/united-2020-12-05-2020-12-15.json",
        "spike_start": "Dec 12, 2020, 05:00",
        "spike_end": "Dec 13, 2020, 06:00",
    },
    "nationalGridUS": {
        "input_location": "../data/tweet_objects/nationalGridUS/2020-10-01-2020-10-12.json",
        "output_location": "../data/labeled_datasets/nationalGridUS-2020-10-01-2020-10-12.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    # NOTE(review): the four entries below carry exactly the nationalGridUS spike
    # window (Oct 7-9, 2020), while their input file names say 2021-01-01 to
    # 2021-04-01. If the tweets really fall in that range, none of them lies
    # inside the window. These look like copy-pasted placeholders -- confirm and
    # replace with the real spike times.
    "soccer_gareth_bale": {
        "input_location": "../data/tweet_objects/soccer_gareth_bale/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/soccer_gareth_bale.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "soccer_raphael_varane": {
        "input_location": "../data/tweet_objects/soccer_raphael_varane/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/soccer_raphael_varane.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "celebrity_martin_garrix": {
        "input_location": "../data/tweet_objects/celebrity_martin_garrix/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/celebrity_martin_garrix.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "celebrity_jk_rowling": {
        "input_location": "../data/tweet_objects/celebrity_jk_rowling/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/celebrity_jk_rowling.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    }
}

# Dataset processed by the rest of the notebook; change the key to switch.
selected_dataset = DATASETS["celebrity_jk_rowling"]
# The spike strings carry no timezone: they are parsed as naive timestamps and
# then declared to be UTC (tz_localize attaches the zone without shifting).
labeled_spike_start = pd.to_datetime(selected_dataset["spike_start"]).tz_localize("UTC")
labeled_spike_end = pd.to_datetime(selected_dataset["spike_end"]).tz_localize("UTC")

In [44]:
from utils.dataset import create_tweet_df

import warnings

# Read the raw tweet dump. JSON is UTF-8 by specification, so the encoding is
# pinned instead of relying on the platform default.
with open(selected_dataset["input_location"], "r", encoding="utf-8") as f:
    raw_dataset = json.load(f)

df_tweets = create_tweet_df(raw_dataset["tweets"])

# Lower-case entities so that counting is case-insensitive
df_tweets["hashtags"] = df_tweets.hashtags.apply(lambda xs: [x.lower() for x in xs])
df_tweets["mentions"] = df_tweets.mentions.apply(lambda xs: [x.lower() for x in xs])

# Set retweet count 0 for retweets.
# A row is a retweet when `retweeted` is populated. isna() treats both None and
# NaN as "not populated"; an `is None` test would classify NaN rows as retweets
# and zero their counts too.
is_original_tweet = df_tweets["retweeted"].isna()
df_tweets["retweet_count"] = df_tweets["retweet_count"].where(is_original_tweet, 0)

# Label tweets: 1 inside the labeled spike window (bounds inclusive), else 0
df_tweets["is_anomaly"] = df_tweets.created_at.between(
    labeled_spike_start, labeled_spike_end
).astype(int)

# A window that matches nothing almost always means the dataset's spike_start /
# spike_end do not belong to this data; surface that instead of exporting an
# all-zero label column silently.
if not df_tweets["is_anomaly"].any():
    warnings.warn(
        "No tweet falls inside the labeled spike window "
        f"[{labeled_spike_start}, {labeled_spike_end}]; "
        "check spike_start / spike_end of the selected dataset."
    )

df_tweets.head()

Unnamed: 0,id,text,created_at,hashtags,mentions,in_reply_to_user_id,user_id,retweet_count,quote_count,reply_count,like_count,retweeted,replied_to,quoted,is_anomaly
36348,1344795952796860417,En 2021 le real a encaissé 0 buts merci @rapha...,2021-01-01 00:01:54+00:00,[],[raphaelvarane],,1099366337435049984,0,0,0,1,,,,0
36347,1344841599470108674,"@realmadridindo1 Lawan Man City lah, gila aja....",2021-01-01 03:03:17+00:00,[],"[realmadridindo1, raphaelvarane]",548467026.0,993459134560284673,0,0,0,0,,1.3448226231427277e+18,,0
36346,1344844899045109760,RT @fajarkharisma__: @realmadridindo1 Lawan Ma...,2021-01-01 03:16:24+00:00,[],"[fajarkharisma__, realmadridindo1, raphaelvarane]",,548467026,0,0,0,0,1.3448415994701087e+18,,,0
36345,1344974007368617984,ojalá este 2021 nos reencontremos @raphaelvara...,2021-01-01 11:49:26+00:00,[],[raphaelvarane],,1550567730,0,0,0,0,,,,0
36344,1345007997421047808,@Rishayen1 @Jonaluga @josemanuelguir @As_Tomas...,2021-01-01 14:04:30+00:00,[],"[jonaluga, as_tomasroncero, marcelom12, cristi...",1.0994168634177454e+18,1315978022990815233,0,0,0,0,,1.3449184501798666e+18,,0


In [45]:
## Count hashtag and mention occurrences (the top entries become time-series features in the next cell)
from utils.dataset import count_array_column

# One frequency table per entity column, built by the same helper.
# NOTE(review): presumably ordered most-frequent first -- later cells treat
# rows 0..2 as the "top 3"; confirm in utils.dataset.count_array_column.
df_top_hashtags, df_top_mentions = (
    count_array_column(df_tweets[entity_column])
    for entity_column in ("hashtags", "mentions")
)

# Show the first five hashtag rows
df_top_hashtags.head(5)

Unnamed: 0,value,count,pct
2,halamadrid,8864,0.243858
166,huescarealmadrid,4313,0.118655
198,emirates,2225,0.061212
4,rmliga,2004,0.055132
19,fifa21,1564,0.043027


In [46]:
def count_col_occurrence(df_col, value):
    """Count the entries of ``df_col`` that contain ``value``.

    Args:
        df_col: pandas Series whose entries are containers (e.g. lists of
            hashtags); each entry is tested with ``value in entry``.
        value: the item to look for.

    Returns:
        The number of entries for which the membership test is true.
    """
    def has_value(entries):
        return value in entries

    membership = df_col.apply(has_value)
    return membership.sum()

# Width of one time bucket (300 minutes = 5 hours); also read by detect_anomalies.
time_bucket_size = "300Min"

# Aggregate tweets into fixed-width buckets. created_at is rounded UP to the
# next multiple of the bucket size, so every bucket is labeled by its end time.
# The top-N lambdas index rows 0..2 of df_top_hashtags / df_top_mentions, so
# both tables need at least three rows or .iloc raises IndexError.
df_timeseries = df_tweets.groupby(df_tweets.created_at.dt.ceil(time_bucket_size)).agg(
    # number of tweets in the bucket
    total_count=('id', 'count'), 
    # True when at least one tweet in the bucket is labeled anomalous
    is_anomaly=('is_anomaly', lambda x: x.any()),
    # rows whose retweeted / quoted / replied_to field is populated
    retweet_count=('retweeted', lambda x: pd.notna(x).sum()),
    quote_count=('quoted', lambda x: pd.notna(x).sum()),
    replied_to_count=('replied_to', lambda x: pd.notna(x).sum()),
    # tweets in the bucket that contain the 1st / 2nd / 3rd row's hashtag
    top1_hashtag_count=(
        'hashtags', 
        lambda x: count_col_occurrence(x, df_top_hashtags.iloc[0].value)
    ),
    top2_hashtag_count=(
        'hashtags', 
        lambda x: count_col_occurrence(x, df_top_hashtags.iloc[1].value)
    ),
    top3_hashtag_count=(
        'hashtags', 
        lambda x: count_col_occurrence(x, df_top_hashtags.iloc[2].value)
    ),
    # tweets in the bucket that contain the 1st / 2nd / 3rd row's mention
    top1_mention_count=(
        'mentions', 
        lambda x: count_col_occurrence(x, df_top_mentions.iloc[0].value)
    ),
    top2_mention_count=(
        'mentions', 
        lambda x: count_col_occurrence(x, df_top_mentions.iloc[1].value)
    ),
    top3_mention_count=(
        'mentions', 
        lambda x: count_col_occurrence(x, df_top_mentions.iloc[2].value)
    )
)
df_timeseries.head()

Unnamed: 0_level_0,total_count,is_anomaly,retweet_count,quote_count,replied_to_count,top1_hashtag_count,top2_hashtag_count,top3_hashtag_count,top1_mention_count,top2_mention_count,top3_mention_count
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01 03:00:00+00:00,1,False,0,0,0,0,0,0,1,0,0
2021-01-01 08:00:00+00:00,2,False,1,0,1,0,0,0,2,0,0
2021-01-01 13:00:00+00:00,1,False,0,0,0,0,0,0,1,0,0
2021-01-01 18:00:00+00:00,117,False,87,0,27,0,0,0,116,1,0
2021-01-01 23:00:00+00:00,66,False,47,1,17,4,0,0,66,2,0


In [47]:
from merlion.utils import TimeSeries
from merlion.models.anomaly.forecast_based.prophet import ProphetDetector, ProphetDetectorConfig
from merlion.plot import plot_anoms_plotly
from merlion.post_process.threshold import Threshold

def convert_to_merlion(df, column):
    """Turn one column of a tz-aware, time-indexed frame into a Merlion TimeSeries.

    The caller's frame is left untouched: the selected column is copied before
    its index is rewritten.

    Args:
        df: DataFrame with a timezone-aware DatetimeIndex.
        column: label of the column to convert.

    Returns:
        The result of ``TimeSeries.from_pd`` on the column, with the index
        converted to UTC and made timezone-naive.
    """
    series = df[column].copy()
    # tz_convert(None) converts to UTC and drops the timezone information
    series.index = series.index.tz_convert(None)
    return TimeSeries.from_pd(series)

def detect_anomalies(
    df_timeseries,
    column,
    model=None,
    plot=True
):
    """Train an anomaly detector on one time-series column and label the tweets.

    Side effects: adds (or overwrites) the column ``"merlion_anomaly_" + column``
    on the module-level ``df_tweets`` frame, and reads the module-level
    ``time_bucket_size`` to map each tweet to its bucket.

    Args:
        df_timeseries: bucketed frame with a timezone-aware DatetimeIndex.
        column: name of the ``df_timeseries`` column to model.
        model: detector to train. When omitted, a fresh ``ProphetDetector`` with
            all seasonalities disabled is created for this call; building it in
            the signature would share one already-trained instance between calls.
        plot: when true, show the Plotly figure with forecast, uncertainty band
            and detected anomalies.

    Returns:
        DataFrame of anomaly scores for the training data.
    """
    if model is None:
        model = ProphetDetector(ProphetDetectorConfig(
            threshold=Threshold(alm_threshold=0.5, abs_score=False),
            yearly_seasonality=False,
            weekly_seasonality=False,
            daily_seasonality=False,
            add_seasonality=False,
            uncertainty_samples=500,
        ))

    train_data = convert_to_merlion(df_timeseries, column)
    model.train(train_data=train_data, anomaly_labels=None)
    df_scores = model.get_anomaly_score(train_data).to_pd()
    labels_train = model.get_anomaly_label(train_data)

    if plot:
        fig = model.plot_anomaly_plotly(
            time_series=train_data,
            plot_forecast=True,
            plot_forecast_uncertainty=True
        )
        plot_anoms_plotly(fig, anomaly_labels=labels_train)
        fig.show()

    # Labels are looked up per bucket exactly as the model emitted them; a
    # bucket that is missing from the label index falls back to 0 below.
    df_labels = labels_train.to_pd()

    def set_anom_score(created_at):
        # Same bucketing as the time series: UTC, tz-naive, rounded up
        lookup = created_at.tz_convert(None).ceil(time_bucket_size)
        if lookup in df_labels.index:
            return df_labels.loc[lookup].anom_score
        return 0

    ## Label df_tweets with anomaly
    df_tweets["merlion_anomaly_" + column] = df_tweets.created_at.apply(
        set_anom_score
    )
    return df_scores

#anomaly_scores = detect_anomalies(
#    df_timeseries, 
#    "total_count"
#)

In [48]:
import plotly.express as px


# Line chart of tweets per time bucket
px.line(df_timeseries["total_count"])

In [49]:
from merlion.transform.moving_average import MovingAverage
from merlion.post_process.threshold import Threshold

# Prophet detector configuration used by the detection runs below:
# every seasonality switch is off and 500 uncertainty samples are drawn.
prophet_config = ProphetDetectorConfig(
    threshold=Threshold(abs_score=False, alm_threshold=0.5),
    # transform=MovingAverage(n_steps=3),
    uncertainty_samples=500,
    **dict.fromkeys(
        (
            "yearly_seasonality",
            "weekly_seasonality",
            "daily_seasonality",
            "add_seasonality",
        ),
        False,
    ),
)

#anomaly_scores = detect_anomalies(
#    df_timeseries, 
#    "total_count",
#    model=ProphetDetector(prophet_config)
#)

Initial log joint probability = -2.97498
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       915.772   2.51054e-05        100.39      0.4951      0.4951      122   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     123       915.774   4.56812e-09       102.352      0.1672      0.1672      153   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


In [50]:
# Run detection once per count feature, each time with a new detector built
# from the shared configuration. Every run adds a "merlion_anomaly_<feature>"
# column to df_tweets.
count_features = (
    "total_count",
    "top1_hashtag_count",
    "top2_hashtag_count",
    "top3_hashtag_count",
)
for feat in count_features:
    detector = ProphetDetector(prophet_config)
    detect_anomalies(df_timeseries, feat, model=detector)

Initial log joint probability = -2.97498
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       915.772   2.51054e-05        100.39      0.4951      0.4951      122   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     123       915.774   4.56812e-09       102.352      0.1672      0.1672      153   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


Initial log joint probability = -4.01978
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      48       762.924   2.23954e-08       98.5796      0.1877           1       75   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance


Initial log joint probability = -2.865
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      22       940.594   5.16233e-07        100.18      0.2362      0.7064       46   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance


Initial log joint probability = -2.83679
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      32       947.541    2.0869e-07       99.8463      0.2164           1       56   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance


In [51]:
# Export 
# Write the labeled tweets as a JSON array with one object per tweet, then
# print the frame's column summary.
export_path = selected_dataset["output_location"]
df_tweets.to_json(export_path, orient="records")
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36349 entries, 36348 to 0
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   id                                  36349 non-null  object             
 1   text                                36349 non-null  object             
 2   created_at                          36349 non-null  datetime64[ns, UTC]
 3   hashtags                            36349 non-null  object             
 4   mentions                            36349 non-null  object             
 5   in_reply_to_user_id                 9782 non-null   object             
 6   user_id                             36349 non-null  object             
 7   retweet_count                       36349 non-null  int64              
 8   quote_count                         36349 non-null  int64              
 9   reply_count                         363