In [2]:
import sys
import os

import plotly.express as px
import plotly.offline as pyo

import numpy as np
import pandas as pd
pd.options.plotting.backend = "plotly"

import os.path

# Put the local Twitter_Scraping checkout on the module search path, at
# position 1 (directly after the first existing entry).
sys.path.insert(1, '/u/aedinge/Twitter_Scraping')

# Load the pre-processed tweet sample and list the columns it carries.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
sample_pickle_path = "/l/mesur/aedinge/monkeypox_twitter/010sample_091222.pkl"
sampled = pd.read_pickle(sample_pickle_path)
print(sampled.columns)



Index(['index', 'created_at', 'id', 'id_str', 'full_text', 'truncated',
       'display_text_range', 'entities', 'source', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
       'coordinates', 'place', 'contributors', 'is_quote_status',
       'retweet_count', 'favorite_count', 'favorited', 'retweeted',
       'possibly_sensitive', 'lang', 'extended_entities', 'quoted_status_id',
       'quoted_status_id_str', 'quoted_status_permalink', 'quoted_status',
       'withheld_in_countries', 'scopes', 'withheld_scope',
       'withheld_copyright', 'date', 'text_proc', 'text_proc_length',
       'hover_text', 'size', 'wk', 'y', 'x', 'k_means_category', 'time_color'],
      dtype='object')


In [3]:
# Base-10 log of each tweet's retweet count, computed in one vectorized call
# instead of a per-row Python lambda.
# A tweet with zero retweets has no finite log10: evaluating it yields -inf
# and a divide-by-zero RuntimeWarning. Non-positive counts are therefore
# masked to NaN *before* taking the log, so the column holds only finite
# values or explicit missing markers.
retweets = sampled['retweet_count']
sampled['log_rt'] = np.log10(retweets.where(retweets > 0))

# Human-readable topic name for each k-means cluster id.
cluster_names = {
    0: 'Noise - Outside',
    1: 'Noise - Middle',
    2: 'Case reports',
    3: 'Transmissibility, MSM communities',
    4: 'WHO Emergency',
    5: 'Vaccines',
    6: 'Monkeypox vs. Covid',
}

# Attach the topic name to every tweet. Series.map leaves NaN for ids that
# have no entry in cluster_names, so those are checked explicitly and still
# reported as a KeyError; the column is only written once every id resolved.
labels = sampled['k_means_category'].map(cluster_names)
unnamed = labels.isna()
if unnamed.any():
    unknown_ids = sampled.loc[unnamed, 'k_means_category'].unique().tolist()
    raise KeyError(f"k_means_category values without a cluster name: {unknown_ids}")
sampled['label'] = labels

In [4]:
# Colour map keyed by k-means cluster id, so that every graphic draws a given
# cluster in the same colour.
# The ids are sorted before being paired with the palette: a bare set has no
# guaranteed iteration order, so zipping it directly would make the
# id -> colour assignment depend on an implementation detail.
# Note: zip stops at the shorter input, so any ids beyond the palette length
# would receive no colour.
categories = sorted(set(sampled['k_means_category']))
col = px.colors.qualitative.Plotly[:len(categories)]
gray = '#BAB0AC'

color_map = dict(zip(categories, col))
print(color_map)

{0: '#636EFA', 1: '#EF553B', 2: '#00CC96', 3: '#AB63FA', 4: '#FFA15A', 5: '#19D3F3', 6: '#FF6692'}


In [5]:
# Colour map keyed by topic label instead of cluster id, so that plots which
# colour by the 'label' column use one fixed colour per topic.
# cluster_names keeps its insertion order, so the i-th label is paired with
# the i-th palette colour.
categories = cluster_names.values()
gray = '#BAB0AC'
col = px.colors.qualitative.Plotly[:len(categories)]

color_map_l = {label: shade for label, shade in zip(categories, col)}
print(color_map_l)

{'Noise - Outside': '#636EFA', 'Noise - Middle': '#EF553B', 'Case reports': '#00CC96', 'Transmissibility, MSM communities': '#AB63FA', 'WHO Emergency': '#FFA15A', 'Vaccines': '#19D3F3', 'Monkeypox vs. Covid': '#FF6692'}


In [12]:
# Histogram of log10 retweet counts, one facet panel per topic label
# (two panels per row), with a log-scaled count axis.
fig = px.histogram(
    sampled,
    x="log_rt",
    color="label",
    facet_col="label",
    facet_col_wrap=2,
    log_y=True,
    opacity=0.9,
    color_discrete_map=color_map_l,
    title="Topic Retweet Distributions",
)

# Fixed 900x900 canvas, fully transparent plot/paper backgrounds, no legend.
fig.update_layout(
    autosize=False,
    width=900,
    height=900,
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    showlegend=False,
)

# Keep only the part of each facet annotation after its last "=".
fig.for_each_annotation(lambda a: a.update(text=a.text.rpartition("=")[2]))
# Turn tick labels on for every x axis in the figure.
fig.update_xaxes(showticklabels=True)

fig.show()

In [7]:
# All topics in a single histogram of log10 retweet counts (log-scaled count
# axis), with a violin marginal drawn alongside it.
fig = px.histogram(
    sampled,
    x="log_rt",
    color="label",
    marginal="violin",
    log_y=True,
    opacity=0.6,
    color_discrete_map=color_map_l,
)

# Fixed 900x900 canvas, fully transparent plot/paper backgrounds, and bars
# overlaid on one another rather than stacked.
fig.update_layout(
    autosize=False,
    width=900,
    height=900,
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    barmode='overlay',
)

fig.show()

In [11]:
# One violin per topic label showing its log10 retweet counts, with every
# individual point drawn and no inner box plot.
fig = px.violin(
    sampled,
    x="label",
    y="log_rt",
    color="label",
    points="all",
    box=False,
    color_discrete_map=color_map_l,
    title="Topic Retweet Distributions",
)

# Fully transparent plot/paper backgrounds; the legend is switched off.
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    showlegend=False,
)

# Replace the default y-axis title with a readable one.
fig.update_yaxes(title='Log Number of Retweets')

fig.show()

In [None]:
# Number of sampled tweets whose retweet count is not zero.
int(sampled['retweet_count'].ne(0).sum())