In [1]:
import pandas as pd
from pathlib import Path
import json

# Root directory that every artefact read by this notebook lives under.
base_path = Path('/data/blockchain-interoperability/blockchain-social-media/twitter-data/')

# load snapshots
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only point this notebook at files from a trusted source.
# The timestamp snapshot is interpreted as epoch milliseconds (unit='ms').
timestamp = pd.to_datetime(pd.read_pickle(base_path / 'snapshots/timestamp_ms.pkl'),unit='ms')
text = pd.read_pickle(base_path / 'snapshots/whole_text.pkl')

# load cluster info
# A context manager guarantees the JSON file handle is closed once parsing is
# done, instead of leaving it open until garbage collection.
with open(base_path/'kmeans_clusters/kmeans_init_clusters.json') as cluster_file:
    cluster_ids = json.load(cluster_file)

# Invert {cluster id -> [member indices]} into {member index -> int cluster id}.
# JSON object keys are always strings, hence the int() conversion.
index_to_cluster = {
    idx:int(c_id)
    for c_id, idxs in cluster_ids.items()
    for idx in idxs
}

# load sentiment
sentiment = pd.read_pickle(base_path / 'sentiment/transformer/sentiment.pkl')
sentiment_score = pd.read_pickle(base_path / 'sentiment/transformer/sentiment_score.pkl')

In [2]:
# Sanity check: the sentiment and text snapshots should have the same length.
tuple(len(series) for series in (sentiment, text))

(14973497, 14973497)

In [3]:
# Place the individual snapshots side by side as the columns of one frame.
df = pd.concat(
    objs=[timestamp, text, sentiment, sentiment_score],
    axis='columns',
)

# Tag each row with a cluster id by looking its index label up in the
# member-index -> cluster-id mapping built from the k-means JSON.
# NOTE(review): labels absent from the mapping presumably end up as NaN -- confirm.
cluster_per_row = df.index.map(index_to_cluster)
df['cluster_id'] = cluster_per_row
del cluster_per_row


In [37]:
def get_sent_stat(timestamp,subgroup):
    """Summarise the sentiment labels and scores of one group of rows.

    Parameters
    ----------
    timestamp
        Label identifying the group; copied unchanged into the result.
    subgroup : pandas.DataFrame
        Rows to summarise. Must provide a 'sentiment' column (compared
        against -1, 0 and 1) and a 'sentiment_score' column.

    Returns
    -------
    dict
        'timestamp', 'tweet_count', the per-label counts
        ('neg_count', 'neu_count', 'pos_count'), 'average' (the mean label
        shifted by +1 and halved), and the mean 'sentiment_score' per label
        ('neg_score_avg', 'neu_score_avg', 'pos_score_avg'). Means over an
        empty selection are NaN.
    """
    labels = subgroup['sentiment']
    scores = subgroup['sentiment_score']

    # One boolean mask per label, reused for both the count and the score mean.
    is_negative = labels == -1
    is_neutral = labels == 0
    is_positive = labels == 1

    stats = {'timestamp': timestamp, 'tweet_count': len(subgroup)}
    stats['neg_count'] = is_negative.sum()
    stats['neu_count'] = is_neutral.sum()
    stats['pos_count'] = is_positive.sum()
    # Shift-and-halve: a mean label in [-1, 1] lands in [0, 1].
    stats['average'] = (labels.mean() + 1)/2
    stats['neg_score_avg'] = scores[is_negative].mean()
    stats['neu_score_avg'] = scores[is_neutral].mean()
    stats['pos_score_avg'] = scores[is_positive].mean()
    return stats

def get_timeseries(df):
    """Build a table of sentiment statistics per 30-minute bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like 'timestamp_ms' column used for binning,
        plus whatever columns `get_sent_stat` reads.

    Returns
    -------
    pandas.DataFrame
        One row per bin yielded by the resampler, with the keys returned by
        `get_sent_stat` as columns.
    """
    records = []
    # Iterating a resampler yields (bin label, rows falling in that bin) pairs.
    for bin_label, bin_rows in df.resample('30min', on='timestamp_ms'):
        records.append(get_sent_stat(bin_label, bin_rows))
    return pd.DataFrame(records)

# df.resample('30min', on='timestamp_ms').agg(get_sent_stat)

In [38]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def draw_sentiment_timeseries(cluster_idx=None):
    """Plot stacked sentiment ratios and the average sentiment over time.

    Reads the notebook-level `df`, aggregates it with `get_timeseries`, and
    draws one stacked bar trace per sentiment label (its share of the bin's
    tweet count) plus a line trace of the 'average' column.

    Parameters
    ----------
    cluster_idx : int or None, optional
        When given, only rows whose 'cluster_id' equals this value are
        plotted and the figure is titled 'Cluster <cluster_idx + 1>'.
        When None (default), every row is used and the title is
        'All Clusters'.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, which is also displayed via `fig.show()` before returning.
    """
    if cluster_idx is not None:
        per_delta = get_timeseries(df[df['cluster_id'] == cluster_idx])
        title = f'Cluster {cluster_idx+1}'
    else:
        per_delta = get_timeseries(df)
        title = 'All Clusters'

    # (count column, legend name, bar color) for each stacked ratio trace,
    # listed in stacking order.
    ratio_specs = (
        ('neg_count', 'Negative Sentiment Ratio', '#e8776b'),
        ('neu_count', 'Neutral Sentiment Ratio', '#e6e6e6'),
        ('pos_count', 'Positive Sentiment Ratio', '#6b95e8'),
    )
    # NOTE(review): bins with tweet_count == 0 presumably yield NaN ratios
    # (0/0) and show up as gaps -- confirm this is acceptable.
    ratio_traces = [
        go.Bar(
            x = per_delta['timestamp'],
            y = per_delta[count_col] / per_delta['tweet_count'],
            name = trace_name,
            marker_color = color,
        )
        for count_col, trace_name, color in ratio_specs
    ]
    average_trace = go.Scatter(
        name = 'Average Sentiment',
        x = per_delta['timestamp'],
        y = per_delta['average'],
        line_color = '#44c767',
    )

    fig = go.Figure(
        data = ratio_traces + [average_trace],
        layout = {
            'barmode':'stack',
            # Previously `title` was computed but never attached, leaving
            # every figure untitled and indistinguishable from the others.
            'title': {'text': title},
        }
    )

    fig.show()
    return fig



# Draw the overall figure first (None selects every row), then one figure for
# each of the cluster indices 0 through 4, keeping the figures in that order.
all_figs = list(map(draw_sentiment_timeseries, (None, *range(5))))
