# Stats

In [None]:
import pandas as pd
from datetime import datetime
import time
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import ruptures as rpt

In [None]:
import kaleido

In [None]:
from twitter_id_mapper import get_handle

## Read Data

In [None]:
# Load the cleaned tweet table from its parquet dump.
tweets_parquet = 'data/tweets/_all_cleaned/tweets.parquet'
df_tweets = pd.read_parquet(tweets_parquet)

In [None]:
# Parse the timestamps once, then derive the coarser keys the grouping cells use.
df_tweets['created_at'] = pd.to_datetime(df_tweets['created_at'])
# 'YYYY_MM' key with a zero-padded month. strftime formats the timestamp
# directly, so a missing timestamp stays a missing key instead of turning the
# year/month parts into stringified floats.
df_tweets['created_year_month'] = df_tweets['created_at'].dt.strftime('%Y_%m')
# Calendar date (time of day dropped) for the per-day counts.
df_tweets['created_date'] = df_tweets['created_at'].dt.date

In [None]:
# Time span covered by the data: earliest, then latest tweet timestamp.
created = df_tweets['created_at']
for boundary in (created.min(), created.max()):
    print(boundary)

In [None]:
# Bare expression: the notebook renders the tweet table as this cell's output.
df_tweets

In [None]:
# Load the cleaned user table from its parquet dump.
users_parquet = 'data/tweets/_all_cleaned/users.parquet'
df_users = pd.read_parquet(users_parquet)

In [None]:
# Bare expression: the notebook renders the user table as this cell's output.
df_users

## Tweets Per User

In [None]:
# Number of tweets per author, as a flat frame with columns 'author_id' and 'count'.
tweets_by_author = df_tweets.groupby('author_id')['id'].count()
auth_count = tweets_by_author.to_frame(name='count').reset_index()
auth_count

In [None]:
# Histogram of tweets-per-author with a log-scaled y axis.
# x is named explicitly so only the 'count' column is histogrammed; without it
# the whole frame (including 'author_id') is handed to seaborn as wide-form data.
sns.displot(auth_count, x='count', log_scale=(False, 10), height=4, aspect=1.5, bins=100, stat='count')
title = plt.title('Distribution of Tweet Counts')
plt.xlabel('Tweet Counts')
plt.ylabel('Number of Users')
# The output file is named after the plot title.
plt.savefig('plots/stats/' + title.get_text() + '.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Interactive version of the tweets-per-author histogram (log-scaled y axis).
hist_title = 'Number of users per tweet count'.title()
fig = px.histogram(auth_count, x='count', nbins=200, log_y=True, height=700, title=hist_title)
# Kept as the last expression so the notebook renders the returned figure.
fig.update_layout(title_x=0.5, xaxis_title="Tweet Count", yaxis_title="Author Count")

In [None]:
# User records of authors with more than 1000 tweets, ordered by tweet count.
heavy_authors = auth_count[auth_count['count'] > 1000]
df_users.merge(heavy_authors, how='inner', left_on='id', right_on='author_id').sort_values(by='count')

## Language Stats

In [None]:
# Number of tweets per language code, as a flat frame with columns 'lang' and 'count'.
tweets_by_lang = df_tweets.groupby('lang')['id'].count()
lang_count = tweets_by_lang.to_frame(name='count').reset_index()

In [None]:
# Bare expression: the notebook renders the per-language counts (still unsorted here).
lang_count

In [None]:
# Order languages from most to least tweets; the original row labels are kept.
lang_count = lang_count.sort_values(by='count', ascending=False)

In [None]:
# Bare expression: the notebook renders the per-language counts after sorting.
lang_count

In [None]:
# Bar chart of tweet counts per language on a log-scaled y axis, saved as SVG.
g = sns.catplot(
    data=lang_count,
    kind='bar',
    x='lang',
    y='count',
    color='cadetblue',
    height=4,
    aspect=4,
)
# Optional bar labels (disabled):
#     for ax in g.axes.flat:
#         ax.bar_label(ax.containers[0])
plt.xlabel('Language Code')
plt.ylabel('Tweet Count')
plt.yscale('log')
title = plt.title('Tweet Count Per Language')
# The output file is named after the plot title.
plt.savefig(f'plots/stats/{title.get_text()}.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Interactive version of the per-language bar chart (log-scaled y axis).
bar_title = 'tweet count per language'.title()
fig = px.bar(lang_count, x='lang', y='count', title=bar_title, log_y=True, height=700)
fig.update_layout(title_x=0.5, xaxis_title="Language", yaxis_title="Tweet Count")
# Kept as the last expression so the notebook renders the returned figure.
fig.update_xaxes(tickangle=0)

In [None]:
# Share of tweets per language as a pie chart, exported as SVG.
# The counts are aggregated first so the figure (and the exported file) carries
# one row per language instead of one label per tweet.
lang_share = df_tweets.groupby('lang').size().reset_index(name='count')
fig = px.pie(lang_share, names='lang', values='count', title='Language proportions'.title(), height=500)
fig.update_traces(textposition='inside', textinfo='percent+label')
# Labels that would need a font smaller than 12 are hidden rather than shrunk.
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', title_x=0.49,)
# The output file is named after the figure title.
fig.write_image(f"plots/stats/{fig.layout.title.text}.svg")
fig.show()

In [None]:
# One language-share pie chart per 'YYYY_MM' month.
# A single grouped count replaces a full-table filter per month and hands each
# figure one row per language instead of one label per tweet.
monthly_lang_share = (
    df_tweets.groupby(['created_year_month', 'lang'])
    .size()
    .reset_index(name='count')
)
# groupby iterates its keys in sorted order, i.e. months in chronological order.
for y_m, month_share in monthly_lang_share.groupby('created_year_month'):
    fig = px.pie(month_share, names='lang', values='count', title=f'Language proportions month {y_m}'.title(), height=700)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', title_x=0.5,)
    fig.show()

## Tweets Per Day

In [None]:
# Number of tweets per calendar day, as a flat frame with columns 'created_date' and 'count'.
daily_totals = df_tweets.groupby('created_date')['id'].count()
tweets_per_day = daily_totals.to_frame(name='count').reset_index()

In [None]:
# Line chart of the daily tweet volume, saved as SVG.
sns.relplot(
    data=tweets_per_day,
    kind='line',
    x="created_date",
    y="count",
    color='cadetblue',
    height=4,
    aspect=1.5,
)
plt.xlabel('Date')
plt.ylabel('Tweet Count')
plt.xticks(rotation=15)
# Optional log axis (disabled): plt.yscale('log')
title = plt.title('Tweet Counts per Day')
# The output file is named after the plot title.
plt.savefig(f'plots/stats/{title.get_text()}.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Interactive line chart of the daily tweet volume.
fig = px.line(tweets_per_day, x="created_date", y="count", title='Tweet Counts per Day'.title(), height=700)
# Disabled annotation, kept for reference as a real comment rather than a bare
# string statement: a dashed vertical marker at a fixed date.
# fig.add_vline(
#     x=time.mktime(datetime.strptime('2022-02-23', '%Y-%m-%d').timetuple())*10**3,
#     line_width=1,
#     line_dash='dash',
#     line_color='grey',
#     annotation_text='Ukrainian Invasion Start',
#     annotation_font_color='grey',
#     annotation_position='top left',
#     annotation_textangle=-90,
# )
fig.update_layout(title_x=0.5,)
fig.show()

In [None]:
# Number of tweets per (day, language), as a flat frame with columns
# 'created_date', 'lang' and 'count'.
daily_lang_totals = df_tweets.groupby(['created_date', 'lang'])['id'].count()
tweets_per_day_lang = daily_lang_totals.to_frame(name='count').reset_index()

In [None]:
# Line chart of the daily volume of tweets tagged 'en', saved as SVG.
english_daily = tweets_per_day_lang[tweets_per_day_lang['lang'] == 'en']
sns.relplot(
    data=english_daily,
    kind='line',
    x="created_date",
    y="count",
    color='cadetblue',
    height=4,
    aspect=1.5,
)
plt.xlabel('Date')
plt.ylabel('Tweet Count')
plt.xticks(rotation=15)
# Optional log axis (disabled): plt.yscale('log')
title = plt.title('Tweets in English Counts per Day')
# The output file is named after the plot title.
plt.savefig(f'plots/stats/{title.get_text()}.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Line chart of the daily volume of tweets tagged 'und', saved as SVG.
undefined_daily = tweets_per_day_lang[tweets_per_day_lang['lang'] == 'und']
sns.relplot(
    data=undefined_daily,
    kind='line',
    x="created_date",
    y="count",
    color='cadetblue',
    height=4,
    aspect=1.5,
)
plt.xlabel('Date')
plt.ylabel('Tweet Count')
plt.xticks(rotation=15)
# Optional log axis (disabled): plt.yscale('log')
title = plt.title('Tweets in Undefined Language Counts per Day')
# The output file is named after the plot title.
plt.savefig(f'plots/stats/{title.get_text()}.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Interactive line chart of the daily tweet volume, one line per language.
# Rows are ordered by language and then date before plotting.
fig = px.line(tweets_per_day_lang.sort_values(['lang', 'created_date']), x="created_date", y="count", color='lang', title='Tweet Counts per Day and per Language'.title(), height=700)
# Disabled annotation, kept for reference as a real comment rather than a bare
# string statement: a dashed vertical marker at a fixed date.
# fig.add_vline(
#     x=time.mktime(datetime.strptime('2022-02-23', '%Y-%m-%d').timetuple())*10**3,
#     line_width=1,
#     line_dash='dash',
#     line_color='grey',
#     annotation_text='2022-02-23<br>Ukrainian Invasion Start',
#     annotation_font_color='grey',
#     annotation_position='top left',
#     annotation_textangle=-90,
# )
fig.update_layout(title_x=0.5,)
# The HTML export is named after the figure title.
fig.write_html(f'interactive/{fig.layout.title.text}.html')
fig.show()

In [None]:
# Bare expression: the notebook renders the per-day, per-language counts.
tweets_per_day_lang

In [None]:
# Wide table: one row per day, one column per language; days on which a
# language has no tweets are filled with 0.
daily_by_lang = tweets_per_day_lang.pivot(index='created_date', columns='lang', values='count')
tweets_per_day_lang_pvt = daily_by_lang.fillna(0)
# '_all' holds the row-wise total across every language column.
tweets_per_day_lang_pvt['_all'] = tweets_per_day_lang_pvt.sum(axis=1)
# Reorder the columns by their sorted labels.
ordered_columns = tweets_per_day_lang_pvt.columns.sort_values()
tweets_per_day_lang_pvt = tweets_per_day_lang_pvt[ordered_columns]
tweets_per_day_lang_pvt

In [None]:
# Change-point detection per column of the daily pivot table: a fresh PELT
# detector (rbf model) is fitted on each series and queried with penalty 2.
change_locations = {}
for col in tweets_per_day_lang_pvt.columns:
    detector = rpt.Pelt(model="rbf").fit(tweets_per_day_lang_pvt[col].values)
    change_locations[col] = detector.predict(pen=2)

In [None]:
# Hand-picked subset of pivot-table columns.
# NOTE(review): nothing in the visible cells reads this list — confirm it is still needed.
languages = [
    '_all',  # row-wise total column added to the pivot table
    'en',
    'es',
    'fr',
    'uk',
    'ru',
    'ro',
    'pl',
    'tr',
]

In [None]:
# Interactive daily-count chart: a line trace plus change-point markers, with a
# dropdown that swaps the data of both traces to another pivot-table column.
default_col = tweets_per_day_lang_pvt.columns[0]
# NOTE(review): the last entry of every change_locations list is dropped before
# it is used as a position — presumably it marks the end of the series rather
# than a change; confirm against the ruptures documentation.
default_breaks = tweets_per_day_lang_pvt.index[change_locations[default_col][:-1]]

fig = go.Figure()

counts_trace = go.Scatter(
    x=tweets_per_day_lang_pvt.index,
    y=tweets_per_day_lang_pvt[default_col],
    visible=True,
    name='Tweet Counts',
)
fig.add_trace(counts_trace)

marker_style = dict(symbol='x', opacity=0.8, size=15)
change_trace = go.Scatter(
    x=default_breaks,
    y=tweets_per_day_lang_pvt[default_col].loc[default_breaks],
    visible=True,
    mode='markers',
    marker=marker_style,
    name='Change Points',
    hoverinfo='skip',
)
fig.add_trace(change_trace)

# One dropdown entry per column. Each 'x'/'y' entry holds two arrays, in the
# same order as the two traces added above (counts first, change points second).
column_buttons = []
for col in tweets_per_day_lang_pvt.columns:
    col_breaks = tweets_per_day_lang_pvt.index[change_locations[col][:-1]]
    restyle_args = {
        'y': [tweets_per_day_lang_pvt[col].values, tweets_per_day_lang_pvt[col].loc[col_breaks].values],
        'x': [tweets_per_day_lang_pvt.index.values, col_breaks.values],
        'type': 'scatter',
    }
    column_buttons.append(dict(method='restyle', label=col, visible=True, args=[restyle_args]))

column_menu = dict(
    buttons=column_buttons,
    direction="down",
    showactive=True,
    x=0,
    xanchor="left",
    y=1.1,
    yanchor="top",
)
fig.update_layout(
    updatemenus=[column_menu],
    height=700,
    title={'text': 'Tweet Counts per Day'.title(), 'x': 0.5, 'xanchor': 'center'},
)

# The HTML export is named after the figure title.
fig.write_html(f'interactive/{fig.layout.title.text}.html')

fig.show()

## Withheld Tweets

In [None]:
# Pull the 'country_codes' entry out of each 'withheld' dict; any value that is
# not a dict (e.g. a missing entry) is passed through unchanged.
# isinstance is used so dict subclasses are unpacked as well.
df_tweets['withheld_countries'] = df_tweets.withheld.apply(lambda d: d['country_codes'] if isinstance(d, dict) else d)

In [None]:
# Show the extracted country codes for every tweet (missing entries included).
df_tweets['withheld_countries']

In [None]:
# Show only the tweets that actually carry country codes.
df_tweets['withheld_countries'].dropna()

In [None]:
# Distinct country-code combinations among withheld tweets. Each entry is
# converted to a tuple first so it is hashable for unique().
tweet_country_tuples = df_tweets['withheld_countries'].dropna().apply(tuple)
tweet_country_tuples.unique().tolist()

In [None]:
# One row per (tweet, withheld country): explode the country-code collections
# and drop the rows left without a value.
lang_and_countries = df_tweets[['lang', 'withheld_countries']]
withheld_lang = lang_and_countries.explode('withheld_countries').dropna()
withheld_lang

In [None]:
# Count the occurrences of every (lang, withheld country) pair and flatten the
# result into a frame with a 'count' column.
pair_counts = withheld_lang.value_counts()
withheld_lang = pair_counts.to_frame(name='count').reset_index()
withheld_lang

In [None]:
# Attach each country's total to its per-language rows, then order countries by
# that total (largest first) and, within a country, by the per-language count.
# Only 'count' is summed: summing the whole frame would also aggregate the
# string 'lang' column, which leaves the merge without a plain 'lang' column.
country_totals = withheld_lang.groupby('withheld_countries')['count'].sum().reset_index()
# After the merge 'count_x' is the per-(lang, country) count and 'count_y' the
# per-country total.
withheld_lang_sorted = withheld_lang.merge(country_totals, on='withheld_countries').sort_values(['count_y', 'count_x', 'withheld_countries'], ascending=[False, True, True])
withheld_lang_sorted

In [None]:
# Stacked bars: withheld tweets per country, split by tweet language. The
# pre-aggregated counts are supplied as histogram weights.
g = sns.displot(
    data=withheld_lang_sorted,
    x='withheld_countries',
    hue='lang',
    weights='count_x',
    multiple='stack',
    discrete=True,
    shrink=0.7,
    height=4,
    aspect=1.75,
)
# Optional bar labels (disabled):
#     for ax in g.axes.flat:
#         ax.bar_label(ax.containers[0])
plt.xlabel('Country Code')
plt.ylabel('Number of Tweets')
# Optional log axis (disabled): plt.yscale('log')
title = plt.title('Withheld Tweet Count Per Country')
# The output file is named after the plot title.
plt.savefig(f'plots/stats/{title.get_text()}.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Interactive version: withheld tweets per country, coloured by language, with
# the country axis ordered by total (largest first).
fig = px.bar(
    withheld_lang,
    x='withheld_countries',
    y='count',
    color='lang',
    height=700,
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Independent copy of the tweets that carry withheld-country information.
has_withheld_countries = df_tweets['withheld_countries'].notna()
withheld_tweets = df_tweets[has_withheld_countries].copy()

In [None]:
# Convert the country-code collections to tuples so they can serve as group keys.
country_tuples = withheld_tweets['withheld_countries'].apply(tuple)
withheld_tweets['withheld_countries'] = country_tuples

In [None]:
# Per (country combination, author): number of withheld tweets, the set of
# languages used, and the tweet texts with their timestamps.
per_author_summary = {
    'id': 'count',
    'lang': set,
    'text': list,
    'created_at': list,
}
withheld_by_author = withheld_tweets.groupby(by=['withheld_countries', 'author_id'])
grouped_withheld_tweets = withheld_by_author.agg(per_author_summary).reset_index()
grouped_withheld_tweets

In [None]:
# Look up a handle for every author id via get_handle, then show the table with
# the handle next to the id.
author_handles = grouped_withheld_tweets['author_id'].apply(get_handle)
grouped_withheld_tweets['author_handle'] = author_handles
display_columns = ['withheld_countries', 'author_id', 'author_handle', 'id', 'lang', 'text', 'created_at']
grouped_withheld_tweets[display_columns]

In [None]:
# Text dump of every withheld (country combination, author) group: a header
# with countries, author and languages, then each tweet with its timestamp.
for row_label, row in grouped_withheld_tweets.iterrows():
    print(row['withheld_countries'])
    print(row['author_id'], '--->', row['author_handle'])
    print(row['lang'])
    tweets_with_dates = zip(row['text'], row['created_at'])
    for tweet_text, posted_at in tweets_with_dates:
        print()
        print('  #', posted_at)
        print(tweet_text)
    # Separator between groups.
    print('\n\n*********\n')

## Withheld Users

In [None]:
# Peek at the first non-missing 'withheld' entry to see its structure.
df_users['withheld'].dropna().iloc[0]

In [None]:
# Pull the 'country_codes' entry out of each 'withheld' dict; any value that is
# not a dict (e.g. a missing entry) is passed through unchanged.
# isinstance is used so dict subclasses are unpacked as well.
df_users['withheld_countries'] = df_users.withheld.apply(lambda d: d['country_codes'] if isinstance(d, dict) else d)

In [None]:
# Show the extracted country codes for every user (missing entries included).
df_users['withheld_countries']

In [None]:
# Show only the users that actually carry country codes.
df_users['withheld_countries'].dropna()

In [None]:
# Distinct country-code combinations among withheld users. Each entry is
# converted to a tuple first so it is hashable for unique().
user_country_tuples = df_users['withheld_countries'].dropna().apply(tuple)
user_country_tuples.unique().tolist()

In [None]:
# Bare expression: the notebook renders the user table, which by now includes
# the 'withheld_countries' column.
df_users

In [None]:
# One row per (user, withheld country): explode the country-code collections
# and drop the rows left without a value.
user_identity = df_users[['id', 'username', 'name', 'withheld_countries']]
withheld_users_countries = user_identity.explode('withheld_countries').dropna()
withheld_users_countries

In [None]:
# Number of withheld users per country, as a flat frame with a 'count' column.
users_per_country = withheld_users_countries.groupby('withheld_countries')['id'].count()
withheld_users_countries_count = users_per_country.rename('count').reset_index()
withheld_users_countries_count

In [None]:
# Bars of withheld users per country, largest first (ties broken by country
# code); the pre-aggregated counts are supplied as histogram weights.
users_per_country_sorted = withheld_users_countries_count.sort_values(
    ['count', 'withheld_countries'],
    ascending=[False, True],
)
g = sns.displot(
    data=users_per_country_sorted,
    x='withheld_countries',
    weights='count',
    discrete=True,
    shrink=0.7,
    height=4,
    aspect=1.75,
)
# Optional bar labels (disabled):
#     for ax in g.axes.flat:
#         ax.bar_label(ax.containers[0])
plt.xlabel('Country Code')
plt.ylabel('Number of Users')
# Optional log axis (disabled): plt.yscale('log')
title = plt.title('Withheld Users Count Per Country')
# The output file is named after the plot title.
plt.savefig(f'plots/stats/{title.get_text()}.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
# Interactive version: withheld users per country, with the country axis
# ordered by total (largest first).
fig = px.bar(
    withheld_users_countries_count,
    x='withheld_countries',
    y='count',
    height=700,
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Independent copy of the users that carry withheld-country information.
user_has_countries = df_users['withheld_countries'].notna()
withheld_users = df_users[user_has_countries].copy()

In [None]:
# Convert the country-code collections to tuples so they can serve as group keys.
user_countries_as_tuples = withheld_users['withheld_countries'].apply(tuple)
withheld_users['withheld_countries'] = user_countries_as_tuples

In [None]:
# Number of withheld users (row count of the frame).
withheld_users.shape[0]

In [None]:
# Preview the first five withheld users.
withheld_users.head(5)

In [None]:
# Per country combination: parallel lists of the user ids, usernames and
# display names of the affected accounts.
per_combination_lists = {
    'id': list,
    'username': list,
    'name': list,
}
users_by_combination = withheld_users.groupby(by=['withheld_countries'])
grouped_withheld_users = users_by_combination.agg(per_combination_lists).reset_index()
grouped_withheld_users

In [None]:
# Text dump of the withheld users, grouped by country combination: the
# combination first, then one line per account as "username (id) -> name".
for row_label, row in grouped_withheld_users.iterrows():
    print(row['withheld_countries'])
    # The per-account variables have their own names so the inner loop does not
    # overwrite the outer loop's row label.
    for username, display_name, user_id in zip(row['username'], row['name'], row['id']):
        print('  #', username, f'({user_id})', '->', display_name)
    # Separator between groups.
    print('\n\n*********\n')

## Undefined Language Tweets 

In [None]:
# Tweets tagged 'und' whose year-month key is '2022_03'.
is_undefined_lang = df_tweets['lang'] == 'und'
in_march_2022 = df_tweets['created_year_month'] == '2022_03'
und_mar_2022_tweets = df_tweets[is_undefined_lang & in_march_2022]

In [None]:
# Bare expression: the notebook renders the second row (position 1) of the selection.
und_mar_2022_tweets.iloc[1]

In [None]:
# Print the text of that same row (position 1) in full.
sample_text = und_mar_2022_tweets['text'].iloc[1]
print(sample_text)

In [None]:
# Write the texts of the selected tweets (with their index labels) to a CSV file.
und_texts = und_mar_2022_tweets['text']
und_texts.to_csv('dump/und.csv')

In [None]:
import numpy as np

In [None]:
# Tweets that are not just hashtags, links or mentions: keep a tweet when at
# least one whitespace-separated token contains none of the markers below.
non_text_markers = ('@', 'http', '#')
# Missing texts are treated as empty, so they simply fail the filter.
und_tokens = und_mar_2022_tweets['text'].fillna('').str.split()
has_plain_token = und_tokens.apply(
    lambda tokens: any(
        all(marker not in token for marker in non_text_markers)
        for token in tokens
    )
).astype(bool)
# A boolean mask selects each matching row exactly once, even when index
# labels repeat (re-selecting by label would return every row with that label).
und_mar_2022_tweets_texts = und_mar_2022_tweets.loc[has_plain_token, 'text']

In [None]:
# Bare expression: the notebook renders the filtered tweet texts.
und_mar_2022_tweets_texts

In [None]:
# Export id and text of the filtered tweets as JSON, keyed by tweet id.
filtered_und = df_tweets.loc[und_mar_2022_tweets_texts.index, ['id', 'text']]
filtered_und.set_index('id').to_json('dump/filtered_und.json')