In [130]:
from notebook_utils import resolve_paths_from_parent_directory
resolve_paths_from_parent_directory()
# auto reload notebook deps
%reload_ext autoreload
%autoreload 2

In [131]:
import json
import pandas as pd

def _utc_timestamp(label):
    """Parse a timezone-naive date string and localize it to UTC."""
    return pd.to_datetime(label).tz_localize("UTC")


# Registry of the hand-labelled datasets, keyed by Twitter account name.
# Each entry gives the raw tweet file to read, the file the labelled tweets
# are written to, and the start/end of the manually identified spike window.
DATASETS = {
    "CentralParkNYC": {
        "input_location": "../data/tweet_objects/CentralParkNYC/2021-01-27-2021-02-06.json",
        "output_location": "../data/labeled_datasets/CentralParkNYC-2021-01-27-2021-02-06.json",
        "spike_start": "Feb 1, 2021, 17:00",
        "spike_end": "Feb 3, 2021, 04:00",
    },
    "united": {
        "input_location": "../data/tweet_objects/united/2020-12-05-2020-12-15.json",
        "output_location": "../data/labeled_datasets/united-2020-12-05-2020-12-15.json",
        "spike_start": "Dec 12, 2020, 05:00",
        "spike_end": "Dec 13, 2020, 06:00",
    },
    "nationalGridUS": {
        "input_location": "../data/tweet_objects/nationalGridUS/2020-10-01-2020-10-12.json",
        "output_location": "../data/labeled_datasets/nationalGridUS-2020-10-01-2020-10-12.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
}

# The dataset every later cell operates on; change the key to switch datasets.
selected_dataset = DATASETS["nationalGridUS"]

# Spike window boundaries as UTC timestamps, comparable with `created_at`.
labeled_spike_start = _utc_timestamp(selected_dataset["spike_start"])
labeled_spike_end = _utc_timestamp(selected_dataset["spike_end"])

In [132]:
from utils.dataset import create_tweet_df

# Load the raw tweet objects of the selected dataset. JSON is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform default.
with open(selected_dataset["input_location"], "r", encoding="utf-8") as f:
    raw_dataset = json.load(f)

df_tweets = create_tweet_df(raw_dataset["tweets"])

# Lower-case the entities so hashtag/mention counts are case-insensitive.
df_tweets["hashtags"] = df_tweets["hashtags"].apply(
    lambda tags: [tag.lower() for tag in tags]
)
df_tweets["mentions"] = df_tweets["mentions"].apply(
    lambda handles: [handle.lower() for handle in handles]
)

# Set retweet count 0 for retweets: keep the count only where the tweet does
# not reference a retweeted tweet. `isna()` recognises both None and NaN as
# "no reference"; the previous `is None` test treated NaN as a retweet and
# therefore zeroed the count of every tweet whenever the column held NaN.
df_tweets["retweet_count"] = df_tweets["retweet_count"].where(
    df_tweets["retweeted"].isna(), 0
)

# Label tweets: 1 when created inside the labelled spike window (both
# boundaries inclusive), 0 otherwise.
df_tweets["is_anomaly"] = (
    df_tweets["created_at"]
    .between(labeled_spike_start, labeled_spike_end)
    .astype(int)
)

df_tweets.head()

Unnamed: 0,id,text,created_at,hashtags,mentions,in_reply_to_user_id,user_id,retweet_count,quote_count,reply_count,like_count,retweeted,replied_to,quoted,is_anomaly
2125,1311464085071036416,@nationalgridus YESS!!! It’s back on,2020-10-01 00:32:58+00:00,[],[nationalgridus],109524559.0,812872943134982144,0,0,0,0,,1.3114163367870915e+18,,0
2124,1311464351459672065,@ToSaveEnergy @SCE @LimeEnergy @NYPAenergy @na...,2020-10-01 00:34:01+00:00,"[maintenance, cooling, energyefficiency, refri...","[tosaveenergy, sce, limeenergy, nypaenergy, na...",17881345.0,1591198093,0,0,0,0,,1.3113695498376763e+18,,0
2123,1311464409559166976,RT @ToSaveEnergy: Another insightful #ActiveEf...,2020-10-01 00:34:15+00:00,[activeefficiency],[tosaveenergy],,1591198093,0,0,0,0,1.3113695498376763e+18,,,0
2122,1311469381868814337,@DeanFelicetti Hi Dean. We understand not havi...,2020-10-01 00:54:01+00:00,[],[deanfelicetti],3225669874.0,109524559,0,0,0,0,,1.3114044246617375e+18,,0
2121,1311471757967134722,@DeanFelicetti @nationalgridus Didn’t the wind...,2020-10-01 01:03:27+00:00,[],"[deanfelicetti, nationalgridus]",3225669874.0,170846420,0,0,0,0,,1.3114044246617375e+18,,0


In [133]:
## Count hashtag and mention occurrences across all tweets
from utils.dataset import count_array_column

# NOTE(review): count_array_column is defined outside this notebook. Judging
# by the output below it returns one row per distinct value with `value`,
# `count` and `pct` columns, presumably ordered by descending count — confirm,
# because the time-series cell treats rows 0-2 as the top three entries.
df_top_hashtags = count_array_column(df_tweets["hashtags"])
df_top_mentions = count_array_column(df_tweets["mentions"])

# Show the first five rows of the hashtag ranking.
df_top_hashtags[:5]

Unnamed: 0,value,count,pct
66,troyny,12,0.005644
76,poweroutage,11,0.005174
19,frackedgas,9,0.004233
31,netzero,9,0.004233
53,renewable,9,0.004233


In [134]:
def count_col_occurrence(df_col, value):
    """Count the entries of ``df_col`` that contain ``value``.

    Every element of ``df_col`` is tested with ``value in element``; the
    return value is the number of elements for which the test is true.
    """
    def contains_value(values):
        return value in values

    membership = df_col.apply(contains_value)
    return membership.sum()

# Width of one time bucket as a pandas offset alias. detect_anomalies reads
# this global as well when it maps tweets back onto buckets.
time_bucket_size = "60Min"


def _top_entity_counter(df_top, rank):
    """Return an aggregator counting the tweets that contain the value in row ``rank`` of ``df_top``.

    The aggregator is deliberately a lambda so pandas treats it exactly like
    the inline lambdas it replaces; the ranking row is looked up each time the
    aggregator is called.
    """
    return lambda entities: count_col_occurrence(entities, df_top.iloc[rank].value)


# One row per bucket; every tweet is assigned to the bucket ending at the
# ceiling of its creation time.
df_timeseries = df_tweets.groupby(df_tweets.created_at.dt.ceil(time_bucket_size)).agg(
    # Number of tweets in the bucket.
    total_count=("id", "count"),
    # True when at least one tweet of the bucket lies in the labelled spike.
    is_anomaly=("is_anomaly", lambda labels: labels.any()),
    # Tweets that reference another tweet, by reference type.
    retweet_count=("retweeted", lambda refs: pd.notna(refs).sum()),
    quote_count=("quoted", lambda refs: pd.notna(refs).sum()),
    replied_to_count=("replied_to", lambda refs: pd.notna(refs).sum()),
    # Tweets containing each of the first three hashtags of the ranking.
    top1_hashtag_count=("hashtags", _top_entity_counter(df_top_hashtags, 0)),
    top2_hashtag_count=("hashtags", _top_entity_counter(df_top_hashtags, 1)),
    top3_hashtag_count=("hashtags", _top_entity_counter(df_top_hashtags, 2)),
    # Tweets containing each of the first three mentions of the ranking.
    top1_mention_count=("mentions", _top_entity_counter(df_top_mentions, 0)),
    top2_mention_count=("mentions", _top_entity_counter(df_top_mentions, 1)),
    top3_mention_count=("mentions", _top_entity_counter(df_top_mentions, 2)),
)
df_timeseries.head()

Unnamed: 0_level_0,total_count,is_anomaly,retweet_count,quote_count,replied_to_count,top1_hashtag_count,top2_hashtag_count,top3_hashtag_count,top1_mention_count,top2_mention_count,top3_mention_count
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-10-01 01:00:00+00:00,4,False,1,0,3,0,0,0,2,0,0
2020-10-01 02:00:00+00:00,3,False,1,0,1,0,0,0,3,0,0
2020-10-01 03:00:00+00:00,1,False,0,0,1,0,0,0,1,0,0
2020-10-01 05:00:00+00:00,2,False,0,0,0,0,0,0,2,0,0
2020-10-01 11:00:00+00:00,1,False,0,0,1,0,0,0,1,0,0


In [126]:
from merlion.utils import TimeSeries
from merlion.models.anomaly.forecast_based.prophet import ProphetDetector, ProphetDetectorConfig
from merlion.plot import plot_anoms_plotly
from merlion.post_process.threshold import Threshold

def convert_to_merlion(df, column):
    """Return ``df[column]`` as a Merlion ``TimeSeries`` with a timezone-naive index.

    The column is taken from a copy of ``df`` and its index is replaced by
    ``index.tz_convert(None)`` before it is handed to ``TimeSeries.from_pd``,
    so the caller's frame is left untouched.
    """
    series = df.copy()[column]
    series.index = series.index.tz_convert(None)
    return TimeSeries.from_pd(series)

def detect_anomalies(
    df_timeseries,
    column,
    model=None,
    plot=True
):
    """Train an anomaly detector on one time-series column and label the tweets.

    Args:
        df_timeseries: Bucketed frame with a timezone-aware datetime index.
        column: Name of the ``df_timeseries`` column to train and score on.
        model: Detector exposing ``train``, ``get_anomaly_score``,
            ``get_anomaly_label`` and ``plot_anomaly_plotly``. When omitted, a
            fresh ``ProphetDetector`` (no seasonality, alarm threshold 0.5) is
            built for this call. A default instance created in the signature
            would be constructed once and shared, so a detector trained by one
            call would be reused by the next.
        plot: When true, show the plotly figure of the series, the forecast
            with its uncertainty, and the detected anomalies.

    Returns:
        The anomaly scores of the training series as a pandas object.

    Side effects:
        Adds (or overwrites) the column ``"merlion_anomaly_" + column`` on the
        global ``df_tweets``; reads the global ``time_bucket_size``.
    """
    if model is None:
        model = ProphetDetector(ProphetDetectorConfig(
            threshold=Threshold(alm_threshold=0.5),
            yearly_seasonality=False,
            weekly_seasonality=False,
            daily_seasonality=False,
            add_seasonality=False,
            uncertainty_samples=1000
        ))

    train_data = convert_to_merlion(df_timeseries, column)
    # The value returned by train() is not needed; scores are queried below.
    model.train(train_data=train_data, anomaly_labels=None)
    df_scores = model.get_anomaly_score(train_data).to_pd()
    labels_train = model.get_anomaly_label(train_data)

    if plot:
        fig = model.plot_anomaly_plotly(
            time_series=train_data,
            plot_forecast=True,
            plot_forecast_uncertainty=True
        )
        plot_anoms_plotly(fig, anomaly_labels=labels_train)
        fig.show()

    # The former `df_labels.resample(...).fillna("bfill")` statement discarded
    # its result and therefore never changed `df_labels`; it has been removed.
    df_labels = labels_train.to_pd()

    def set_anom_score(created_at):
        # Map the tweet onto its bucket using the same ceil() rule and
        # timezone-naive index that the time series was built with.
        lookup = created_at.tz_convert(None).ceil(time_bucket_size)
        if lookup in df_labels.index:
            return df_labels.loc[lookup].anom_score
        # Buckets without a label are treated as "no anomaly".
        return 0

    ## Label df_tweets with anomaly
    df_tweets["merlion_anomaly_" + column] = df_tweets.created_at.apply(
        set_anom_score
    )
    return df_scores

# Score the per-bucket tweet volume with the function's default detector.
anomaly_scores = detect_anomalies(df_timeseries=df_timeseries, column="total_count")

Initial log joint probability = -7.51234
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       206.015   2.81469e-07       98.5857      0.7247      0.7247      128   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     196       206.391   6.83875e-09       91.1417      0.2957           1      249   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


In [127]:
from merlion.transform.moving_average import MovingAverage
from merlion.post_process.threshold import Threshold

# Same Prophet settings as the default detector of detect_anomalies, with a
# 3-step MovingAverage transform added. This config is reused by the
# per-feature loop in the next cell.
prophet_config = ProphetDetectorConfig(
    # Model settings: all seasonality components switched off.
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
    add_seasonality=False,
    uncertainty_samples=1000,
    # Pre- and post-processing.
    transform=MovingAverage(n_steps=3),
    threshold=Threshold(alm_threshold=0.5),
)

# Re-run the tweet-volume detection with the moving-average configuration.
anomaly_scores = detect_anomalies(
    df_timeseries=df_timeseries,
    column="total_count",
    model=ProphetDetector(prophet_config),
)

Initial log joint probability = -5.79112
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       240.942    1.1693e-05       73.8893      0.5456      0.5456      123   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199        241.06   5.15566e-08       77.4327      0.2683      0.8701      263   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     203        241.06   8.31105e-09       72.3467       0.219       0.219      268   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


In [128]:
# Run a fresh detector (shared config) on the tweet volume and on each of the
# three top-hashtag series; every call adds a "merlion_anomaly_<feature>"
# column to df_tweets.
for feat in ("total_count", "top1_hashtag_count", "top2_hashtag_count", "top3_hashtag_count"):
    detect_anomalies(
        df_timeseries=df_timeseries,
        column=feat,
        model=ProphetDetector(prophet_config),
    )

Initial log joint probability = -5.79112
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       240.942    1.1693e-05       73.8893      0.5456      0.5456      123   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199        241.06   5.15566e-08       77.4327      0.2683      0.8701      263   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     203        241.06   8.31105e-09       72.3467       0.219       0.219      268   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


Initial log joint probability = -4.5625
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       266.894   2.13183e-05       101.442           1           1      120   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     107       266.912   0.000108705       82.4613   1.015e-06       0.001      161  LS failed, Hessian reset 
     173       266.919   3.39712e-07       92.2758   3.189e-09       0.001      284  LS failed, Hessian reset 
     183       266.919   6.29181e-09       89.6834      0.3757      0.3757      297   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


Initial log joint probability = -4.15625
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      89       284.486   1.49371e-07       85.1398    1.85e-09       0.001      153  LS failed, Hessian reset 
      99       284.486   1.80863e-08       80.0315      0.2593      0.8434      165   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     100       284.486    4.5694e-09        78.889      0.2547      0.2547      166   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


Initial log joint probability = -6.875
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      87       205.549    0.00075685       89.2884   8.097e-06       0.001      143  LS failed, Hessian reset 
      99       205.591   8.26345e-06       67.7701      0.3676      0.3676      158   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     150       205.644   0.000678812       99.7859    6.62e-06       0.001      259  LS failed, Hessian reset 
     199       205.698   2.22557e-05       89.0028           1           1      319   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     210       205.713   0.000104759        95.659   1.098e-06       0.001      368  LS failed, Hessian reset 
     296       205.726   6.94426e-07       104.687   8.112e-09       0.001      521  LS failed, Hessian reset 
     299       205.726   9.58604e-08       62.2856      0.3544     

In [129]:
# Export the labelled tweets as a JSON array of records to the dataset's
# output location, then print a column summary of what was written.
df_tweets.to_json(path_or_buf=selected_dataset["output_location"], orient="records")
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2126 entries, 2125 to 0
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   id                                  2126 non-null   object             
 1   text                                2126 non-null   object             
 2   created_at                          2126 non-null   datetime64[ns, UTC]
 3   hashtags                            2126 non-null   object             
 4   mentions                            2126 non-null   object             
 5   in_reply_to_user_id                 1302 non-null   object             
 6   user_id                             2126 non-null   object             
 7   retweet_count                       2126 non-null   int64              
 8   quote_count                         2126 non-null   int64              
 9   reply_count                         2126 