In [1]:
from notebook_utils import resolve_paths_from_parent_directory
# NOTE(review): presumably makes modules in the parent directory importable
# (the notebook later imports `utils.dataset`) — confirm in notebook_utils.
resolve_paths_from_parent_directory()
# Load the autoreload extension and reload imported modules before each cell
# executes, so edits to local helper modules take effect without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import json
import pandas as pd

# Registry of dataset definitions, keyed by a short dataset name.
# Each entry holds:
#   input_location  - path of the tweet JSON file that gets loaded
#   output_location - path the labeled tweets are exported to
#   spike_start /   - bounds of the labeled spike window; they are parsed as UTC
#   spike_end         and tweets created inside the window get is_anomaly = 1
DATASETS = {
    "CentralParkNYC": {
        "input_location": "../data/tweet_objects/CentralParkNYC/2021-01-27-2021-02-06.json",
        "output_location": "../data/labeled_datasets/CentralParkNYC-2021-01-27-2021-02-06.json",
        "spike_start": "Feb 1, 2021, 17:00",
        "spike_end": "Feb 3, 2021, 04:00",
    },
    "united": {
        "input_location": "../data/tweet_objects/united/2020-12-05-2020-12-15.json",
        "output_location": "../data/labeled_datasets/united-2020-12-05-2020-12-15.json",
        "spike_start": "Dec 12, 2020, 05:00",
        "spike_end": "Dec 13, 2020, 06:00",
    },
    "nationalGridUS": {
        "input_location": "../data/tweet_objects/nationalGridUS/2020-10-01-2020-10-12.json",
        "output_location": "../data/labeled_datasets/nationalGridUS-2020-10-01-2020-10-12.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    # NOTE(review): every entry from here on repeats the nationalGridUS spike window
    # (Oct 7-9, 2020), while most of their input file names carry 2021 date ranges.
    # These look like placeholder values — confirm before relying on the
    # is_anomaly labels produced for these datasets.
    "soccer_gareth_bale": {
        "input_location": "../data/tweet_objects/soccer_gareth_bale/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/soccer_gareth_bale.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "soccer_raphael_varane": {
        "input_location": "../data/tweet_objects/soccer_raphael_varane/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/soccer_raphael_varane.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "celebrity_martin_garrix": {
        "input_location": "../data/tweet_objects/celebrity_martin_garrix/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/celebrity_martin_garrix.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "celebrity_jk_rowling": {
        "input_location": "../data/tweet_objects/celebrity_jk_rowling/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/celebrity_jk_rowling.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "voterfraud2020": {
        "input_location": "../data/voterfraud2020/processed/df_sample.json",
        "output_location": "../data/labeled_datasets/voterfraud2020_sample.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "celebrity_jerry_seinfeld": {
        "input_location": "../data/tweet_objects/celebrity_jerry_seinfeld/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/celebrity_jerry_seinfeld.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "celebrity_akon": {
        "input_location": "../data/tweet_objects/celebrity_akon/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/celebrity_akon.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "soccer_alan_shearer": {
        "input_location": "../data/tweet_objects/soccer_alan_shearer/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/soccer_alan_shearer.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    # NOTE(review): the input directory is "celebrity_soccer_rioferdy5", unlike the
    # dataset key and output file name — verify the path.
    "soccer_rioferdy5": {
        "input_location": "../data/tweet_objects/celebrity_soccer_rioferdy5/2021-01-01-2021-02-15.json",
        "output_location": "../data/labeled_datasets/soccer_rioferdy5.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    },
    "parishilton": {
        "input_location": "../data/tweet_objects/parishilton/2021-01-01-2021-04-01.json",
        "output_location": "../data/labeled_datasets/parishilton.json",
        "spike_start": "Oct 7, 2020, 17:00",
        "spike_end": "Oct 9, 2020, 04:00",
    }
}

# Pick the DATASETS entry that the rest of the notebook operates on.
selected_dataset = DATASETS["parishilton"]

# Parse both bounds of the labeled spike window. The strings carry no timezone
# information and are interpreted as UTC.
labeled_spike_start, labeled_spike_end = (
    pd.to_datetime(selected_dataset[bound], utc=True)
    for bound in ("spike_start", "spike_end")
)

In [3]:
from utils.dataset import create_tweet_df

# Load the selected dataset into `df_tweets`.
if "voterfraud2020" in selected_dataset["input_location"]:
    # This input is read straight into a DataFrame; its timestamps are
    # localized to UTC and the rows are ordered by creation time.
    df_tweets = pd.read_json(selected_dataset["input_location"])
    df_tweets["created_at"] = pd.to_datetime(df_tweets["created_at"]).dt.tz_localize("UTC")
    df_tweets = df_tweets.sort_values("created_at")
else:
    # Explicit UTF-8: tweet text contains emoji, and the platform default
    # encoding is not UTF-8 everywhere.
    with open(selected_dataset["input_location"], "r", encoding="utf-8") as f:
        raw_dataset = json.load(f)

    df_tweets = create_tweet_df(raw_dataset["tweets"])

# Lower-case every hashtag and mention so counting is case-insensitive.
for entity_column in ("hashtags", "mentions"):
    df_tweets[entity_column] = df_tweets[entity_column].apply(
        lambda xs: [x.lower() for x in xs]
    )

# Set retweet count 0 for retweets: keep the count only where `retweeted` is
# missing. `isna()` treats both None and NaN as missing, whereas an `is None`
# check would classify NaN rows as retweets and zero their counts.
df_tweets["retweet_count"] = df_tweets["retweet_count"].where(
    df_tweets["retweeted"].isna(), 0
)

# Label tweets: 1 when created inside the labeled spike window (both bounds
# inclusive), 0 otherwise.
df_tweets["is_anomaly"] = (
    df_tweets["created_at"]
    .between(labeled_spike_start, labeled_spike_end)
    .astype("int64")
)

df_tweets.head()

Unnamed: 0,id,text,created_at,hashtags,mentions,in_reply_to_user_id,user_id,retweet_count,quote_count,reply_count,like_count,replied_to,retweeted,quoted,is_anomaly
144254,1345285636090163202,@EdHaytam @rioferdy5 @ManUtd No that's your ma...,2021-01-02 08:27:44+00:00,[],"[rioferdy5, manutd]",1.2721998118995395e+18,945311776425611264,0,0,0,0,1.3451275891412375e+18,,,0
144253,1345285645191835648,@rioferdy5 @ManUtd This will be gold in a coup...,2021-01-02 08:27:46+00:00,[],"[rioferdy5, manutd]",155927976.0,92084834,0,0,0,0,1.3451273222786212e+18,,,0
144252,1345285698761486336,RT @rioferdy5: Welcome to 2️⃣0️⃣2️⃣1️⃣\nHappy ...,2021-01-02 08:27:59+00:00,[],[rioferdy5],,1344173235970191361,0,0,0,0,,1.3451273222786212e+18,,0
144251,1345285706361540608,RT @rioferdy5: Welcome to 2️⃣0️⃣2️⃣1️⃣\nHappy ...,2021-01-02 08:28:01+00:00,[],[rioferdy5],,442390383,0,0,0,0,,1.3451273222786212e+18,,0
144250,1345285806752231424,"@Footie888 @rioferdy5 @ManUtd Also, breaking n...",2021-01-02 08:28:25+00:00,[],"[footie888, rioferdy5, manutd]",2304928413.0,22637245,0,0,0,0,1.345221197722755e+18,,,0


In [4]:
## Count hashtag and mention occurrences (used to pick the top-3 time-series features)
from utils.dataset import count_array_column

# Build one count table per entity column via count_array_column.
df_top_hashtags, df_top_mentions = (
    count_array_column(df_tweets[entity_column])
    for entity_column in ("hashtags", "mentions")
)

# Display the first 20 rows of the hashtag table.
df_top_hashtags.head(20)

Unnamed: 0,value,count,pct
4,mufc,5195,0.036013
1210,flowersofmanchester,3780,0.026204
68,lfc,2165,0.015008
97,arsenal,1986,0.013767
721,munshu,1103,0.007646
60,facts,823,0.005705
482,munliv,762,0.005282
368,emiratesfacup,755,0.005234
341,livmun,382,0.002648
559,mulive,365,0.00253


In [5]:
def count_col_occurrence(df_col, value):
    """Count the entries of `df_col` that contain `value`.

    Every entry of `df_col` must support the `in` operator (for example a list
    of hashtags). Returns the number of entries for which `value in entry`
    holds.
    """
    def holds_value(entry):
        return value in entry

    membership = df_col.apply(holds_value)
    return membership.sum()

# Width of one time bucket, as a pandas frequency string.
time_bucket_size = "300Min"


def _top_entity_counters(column, df_top, label, n_top=3):
    """Build named-aggregation specs counting tweets that contain a top entity.

    For each of the first `n_top` rows of `df_top`, produces an entry
    `top{rank}_{label}_count` that counts, per group, the rows of `column`
    whose list contains that row's `value`.

    Raises IndexError when `df_top` has fewer than `n_top` rows.
    """
    # Look the values up once instead of once per group inside the aggregator.
    top_values = [df_top["value"].iloc[position] for position in range(n_top)]
    return {
        f"top{rank}_{label}_count": (
            column,
            # Bind the value as a default argument; a plain closure would see
            # only the last value of the comprehension variable.
            lambda x, top_value=top_value: count_col_occurrence(x, top_value),
        )
        for rank, top_value in enumerate(top_values, start=1)
    }


# Bucket tweets by creation time rounded UP to the next bucket boundary, then
# compute one row of features per bucket.
df_timeseries = df_tweets.groupby(df_tweets.created_at.dt.ceil(time_bucket_size)).agg(
    total_count=("id", "count"),
    # True when at least one tweet in the bucket is labeled as anomalous.
    is_anomaly=("is_anomaly", "any"),
    # "count" tallies non-missing values: tweets that are retweets / quotes.
    retweet_count=("retweeted", "count"),
    quote_count=("quoted", "count"),
    # replied_to_count=('replied_to', 'count'),
    **_top_entity_counters("hashtags", df_top_hashtags, "hashtag"),
    **_top_entity_counters("mentions", df_top_mentions, "mention"),
)
df_timeseries.head()

Unnamed: 0_level_0,total_count,is_anomaly,retweet_count,quote_count,top1_hashtag_count,top2_hashtag_count,top3_hashtag_count,top1_mention_count,top2_mention_count,top3_mention_count
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-01-02 09:00:00+00:00,142,False,107,0,0,0,0,142,0,0
2021-01-02 14:00:00+00:00,1634,False,886,11,3,0,0,1577,0,295
2021-01-02 19:00:00+00:00,902,False,497,11,0,0,0,865,0,126
2021-01-03 00:00:00+00:00,377,False,222,4,0,0,0,368,0,37
2021-01-03 05:00:00+00:00,75,False,52,1,0,0,0,73,0,10


In [6]:
from merlion.utils import TimeSeries
from merlion.models.anomaly.forecast_based.prophet import ProphetDetector, ProphetDetectorConfig
from merlion.plot import plot_anoms_plotly
from merlion.post_process.threshold import Threshold

def convert_to_merlion(df, column):
    """Wrap one column of a datetime-indexed frame as a Merlion `TimeSeries`.

    The column is copied and the copy's index is made timezone-naive with
    `tz_convert(None)` before it is handed to `TimeSeries.from_pd`; `df` itself
    is not modified. The index must be timezone-aware.
    """
    series = df[column].copy()
    naive_index = series.index.tz_convert(None)
    series.index = naive_index
    return TimeSeries.from_pd(series)

def detect_anomalies(
    df_timeseries,
    column,
    model=None,
    plot=True
):
    """Train an anomaly detector on one time-series column and label the tweets.

    Parameters
    ----------
    df_timeseries : DataFrame indexed by timezone-aware bucket timestamps.
    column : name of the `df_timeseries` column to train on.
    model : detector to train. When None, a fresh ProphetDetector with all
        seasonality disabled is built for this call, so no trained state is
        shared between calls.
    plot : when true, show the Plotly figure with the forecast, its
        uncertainty band and the anomaly labels.

    Returns
    -------
    DataFrame of the anomaly scores the trained model assigns to the
    training series.

    Side effects
    ------------
    Writes the column "merlion_anomaly_<column>" on the notebook-level
    `df_tweets`: each tweet receives the label of the bucket its `created_at`
    is rounded up to (using the notebook-level `time_bucket_size`), or 0 when
    that bucket has no label.
    """
    if model is None:
        model = ProphetDetector(ProphetDetectorConfig(
            threshold=Threshold(alm_threshold=0.5, abs_score=False),
            yearly_seasonality=False,
            weekly_seasonality=False,
            daily_seasonality=False,
            add_seasonality=False,
            uncertainty_samples=500,
        ))

    train_data = convert_to_merlion(df_timeseries, column)
    model.train(train_data=train_data, anomaly_labels=None)
    df_scores = model.get_anomaly_score(train_data).to_pd()
    labels_train = model.get_anomaly_label(train_data)

    if plot:
        fig = model.plot_anomaly_plotly(
            time_series=train_data,
            plot_forecast=True,
            plot_forecast_uncertainty=True
        )
        plot_anoms_plotly(fig, anomaly_labels=labels_train)
        fig.show()

    df_labels = labels_train.to_pd()

    ## Label df_tweets with anomaly
    # Round each tweet up to its (timezone-naive) bucket and look the bucket up
    # in the label table; buckets without a label fall back to 0.
    tweet_buckets = df_tweets.created_at.dt.tz_convert(None).dt.ceil(time_bucket_size)
    df_tweets["merlion_anomaly_" + column] = (
        tweet_buckets.map(df_labels["anom_score"]).fillna(0)
    )
    return df_scores

#anomaly_scores = detect_anomalies(
#    df_timeseries, 
#    "total_count"
#)

In [7]:
import plotly.express as px


# Line chart of the per-bucket tweet totals.
px.line(data_frame=df_timeseries["total_count"])

In [8]:
from merlion.transform.moving_average import MovingAverage
from merlion.post_process.threshold import Threshold

# Shared detector configuration: every seasonality option is switched off and
# 500 uncertainty samples are requested.
prophet_config = ProphetDetectorConfig(
    # Optional smoothing step, currently disabled:
    # transform=MovingAverage(n_steps=3),
    threshold=Threshold(abs_score=False, alm_threshold=0.5),
    uncertainty_samples=500,
    add_seasonality=False,
    daily_seasonality=False,
    weekly_seasonality=False,
    yearly_seasonality=False,
)

#anomaly_scores = detect_anomalies(
#    df_timeseries, 
#    "total_count",
#    model=ProphetDetector(prophet_config)
#)

In [9]:
# Run detection once per feature column, each time with a newly constructed
# detector built from the shared configuration.
for feat in ("total_count", "top1_hashtag_count", "top2_hashtag_count", "top3_hashtag_count"):
    detect_anomalies(df_timeseries, column=feat, model=ProphetDetector(prophet_config))

Initial log joint probability = -7.05513
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       266.688   8.63992e-08       96.8364      0.3553           1      127   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     105       266.688   3.68214e-09       99.5536      0.3712      0.3712      135   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


Initial log joint probability = -3.25995
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      57       370.467   1.33969e-08       89.5294      0.7899      0.7899       81   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance


Initial log joint probability = -2.74529
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      45       426.919   1.56263e-09       99.4957      0.1152      0.1152       68   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


Initial log joint probability = -2.73167
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      39       428.901   2.70874e-09        97.857      0.1472      0.1472       64   
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance


In [10]:
# Export the labeled tweets as a JSON list of row records, then print a
# summary of the exported frame.
export_path = selected_dataset["output_location"]
df_tweets.to_json(export_path, orient="records")
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144255 entries, 144254 to 0
Data columns (total 19 columns):
 #   Column                              Non-Null Count   Dtype              
---  ------                              --------------   -----              
 0   id                                  144255 non-null  object             
 1   text                                144255 non-null  object             
 2   created_at                          144255 non-null  datetime64[ns, UTC]
 3   hashtags                            144255 non-null  object             
 4   mentions                            144255 non-null  object             
 5   in_reply_to_user_id                 41267 non-null   object             
 6   user_id                             144255 non-null  object             
 7   retweet_count                       144255 non-null  int64              
 8   quote_count                         144255 non-null  int64              
 9   reply_count               