In [None]:
import pandas as pd
from datetime import datetime
import time
import plotly.express as px
import plotly.graph_objects as go
import ruptures as rpt

In [None]:
from twitter_id_mapper import get_handle

In [None]:
# Pastel colour palette. The previous `sns.color_palette('pastel')` raised a
# NameError on a fresh kernel because no cell visible here imports seaborn as
# `sns`; plotly (already imported as `px`) ships its own pastel sequence.
# NOTE(review): `colors` is not referenced by any cell visible here, and the
# plotly palette is a list of 'rgb(...)' strings rather than RGB tuples.
colors = px.colors.qualitative.Pastel

In [None]:
# Load the cleaned tweet dump into a single DataFrame.
tweets_path = 'data/dump/_all_cleaned/tweets.parquet'
df_tweets = pd.read_parquet(tweets_path)

In [None]:
# Parse the creation timestamps once, then derive two helper columns from them:
#   created_year_month -> 'YYYY_MM' string used to bucket tweets by month
#   created_date       -> calendar date used for the per-day counts
created_ts = pd.to_datetime(df_tweets['created_at'])
df_tweets['created_at'] = created_ts
df_tweets['created_year_month'] = (
    created_ts.dt.year.astype(str)
    + '_'
    + created_ts.dt.month.astype(str).str.zfill(2)
)
df_tweets['created_date'] = created_ts.dt.date

In [None]:
# Time span covered by the dataset: earliest timestamp first, latest second.
earliest, latest = df_tweets['created_at'].min(), df_tweets['created_at'].max()
print(earliest, latest, sep='\n')

In [None]:
# Display the tweets frame, now including the derived created_* columns.
df_tweets

In [None]:
# Tweets per author: one row per author_id with the number of tweet ids.
auth_count = (
    df_tweets
    .groupby('author_id')['id']
    .count()
    .reset_index(name='count')
)
auth_count

In [None]:
# Histogram of authors by tweet count; the y axis is log-scaled.
fig = px.histogram(
    auth_count,
    x='count',
    nbins=200,
    log_y=True,
    height=700,
    title='Number of users per tweet count'.title(),
)
# Last expression returns the figure, so the cell renders it.
fig.update_layout(title_x=0.5, xaxis_title="Tweet Count", yaxis_title="Author Count")

In [None]:
# Number of tweets per language code, as a two-column frame (lang, count).
lang_count = (
    df_tweets
    .groupby('lang')['id']
    .count()
    .reset_index(name='count')
)

In [None]:
# Display the per-language tweet counts.
lang_count

In [None]:
# Order languages from most to least tweets (this drives the bar order in the chart).
lang_count = lang_count.sort_values('count', ascending=False)

In [None]:
# Bar chart of tweet volume per language; the y axis is log-scaled.
fig = px.bar(
    lang_count,
    x='lang',
    y='count',
    log_y=True,
    height=700,
    title='tweet count per language'.title(),
)
# Both update calls return the figure, so chaining keeps it as the cell output.
fig.update_layout(
    title_x=0.5,
    xaxis_title="Language",
    yaxis_title="Tweet Count",
).update_xaxes(tickangle=0)

In [None]:
# Daily tweet volume: one row per calendar date with the number of tweet ids.
tweets_per_day = (
    df_tweets
    .groupby('created_date')['id']
    .count()
    .reset_index(name='count')
)

In [None]:
# Daily tweet counts with a dashed vertical marker on 2022-02-23.
fig = px.line(tweets_per_day, x="created_date", y="count", title='Tweet Counts per Day', height=700)

# The marker position is passed as milliseconds since the Unix epoch, which is
# how plotly date axes read numeric values. A naive pandas Timestamp converts
# to epoch seconds as UTC, so the line lands on the same instant on every
# machine; the previous `time.mktime(...)` used the local timezone and shifted
# the marker by the machine's UTC offset.
# NOTE(review): confirm 2022-02-23 is the intended day for this event marker.
invasion_marker_ms = pd.Timestamp('2022-02-23').timestamp() * 1000

fig.add_vline(
    x=invasion_marker_ms,
    line_width=1,
    line_dash='dash',
    line_color='grey',
    annotation_text='Ukrainian Invasion Start',
    annotation_font_color='grey',
    annotation_position='top left',
    annotation_textangle=-90,
)
fig.update_layout(title_x=0.5,)
fig.show()

In [None]:
# Daily tweet volume split by language: one row per (date, lang) pair.
tweets_per_day_lang = (
    df_tweets
    .groupby(['created_date', 'lang'])['id']
    .count()
    .reset_index(name='count')
)

In [None]:
# Daily tweet counts, one line per language, with a dashed marker on 2022-02-23.
fig = px.line(
    tweets_per_day_lang,
    x="created_date",
    y="count",
    color='lang',
    title='Tweet Counts per Day and per Language',
    height=700,
)

# The marker position is passed as milliseconds since the Unix epoch, which is
# how plotly date axes read numeric values. A naive pandas Timestamp converts
# to epoch seconds as UTC, so the result no longer depends on the machine's
# local timezone the way the previous `time.mktime(...)` call did.
# NOTE(review): confirm 2022-02-23 is the intended day for this event marker.
invasion_marker_ms = pd.Timestamp('2022-02-23').timestamp() * 1000

fig.add_vline(
    x=invasion_marker_ms,
    line_width=1,
    line_dash='dash',
    line_color='grey',
    annotation_text='2022-02-23<br>Ukrainian Invasion Start',
    annotation_font_color='grey',
    annotation_position='top left',
    annotation_textangle=-90,
)
fig.update_layout(title_x=0.5,)
fig.show()

In [None]:
# Display the long-format (created_date, lang, count) table.
tweets_per_day_lang

In [None]:
# Wide table: one row per day, one column per language; (day, lang) pairs with
# no tweets become 0.
lang_by_day = tweets_per_day_lang.pivot(
    index='created_date',
    columns='lang',
    values='count',
).fillna(0)

# Extra column holding the row-wise total over all languages.
lang_by_day['_all'] = lang_by_day.sum(axis=1)

# Columns in sorted label order.
tweets_per_day_lang_pvt = lang_by_day.sort_index(axis=1)
tweets_per_day_lang_pvt

In [None]:
# Change-point detection per column: PELT with an RBF cost and penalty 2.
# Maps each column name to the breakpoint list returned by `predict`.
change_locations = {}
for col in tweets_per_day_lang_pvt.columns:
    daily_counts = tweets_per_day_lang_pvt[col].values
    change_locations[col] = rpt.Pelt(model="rbf").fit(daily_counts).predict(pen=2)

In [None]:
# Subset of pivot columns ('_all' total plus selected language codes).
# NOTE(review): not referenced by any cell visible here — the dropdown in the
# change-point figure iterates over every pivot column instead; confirm intent.
languages = ['_all', 'en', 'es', 'fr', 'uk', 'ru', 'ro', 'pl', 'tr']

In [None]:
def _change_points(col):
    """Return (dates, counts) of the detected change points for column `col`.

    The final entry of `change_locations[col]` is dropped before it is used as
    a positional index (presumably because ruptures appends the series length
    as a closing breakpoint — TODO confirm).
    """
    positions = change_locations[col][:-1]
    dates = tweets_per_day_lang_pvt.index[positions]
    return dates, tweets_per_day_lang_pvt[col].loc[dates]


def _dropdown_button(col):
    """Build the dropdown entry that swaps both traces to column `col`."""
    cp_dates, cp_counts = _change_points(col)
    return dict(
        method='restyle',
        label=col,
        visible=True,
        # One list element per trace: [daily counts line, change-point markers].
        args=[{'y': [tweets_per_day_lang_pvt[col].values, cp_counts.values],
               'x': [tweets_per_day_lang_pvt.index.values, cp_dates.values],
               'type': 'scatter'},
              ],
    )


# The figure initially shows the first pivot column.
initial_col = tweets_per_day_lang_pvt.columns[0]
initial_cp_dates, initial_cp_counts = _change_points(initial_col)

fig = go.Figure()

# Trace 0: daily tweet counts.
fig.add_trace(go.Scatter(
    x=tweets_per_day_lang_pvt.index,
    y=tweets_per_day_lang_pvt[initial_col],
    visible=True,
    name='Tweet Counts',
))

# Trace 1: markers at the detected change points (hover disabled).
fig.add_trace(go.Scatter(
    x=initial_cp_dates,
    y=initial_cp_counts,
    visible=True,
    mode='markers',
    name='Change Points',
    hoverinfo='skip',
))

column_selector = dict(
    buttons=[_dropdown_button(col) for col in tweets_per_day_lang_pvt.columns],
    direction="down",
    showactive=True,
    x=0,
    xanchor="left",
    y=1.1,
    yanchor="top",
)

fig.update_layout(
    updatemenus=[column_selector],
    height=700,
    title={'text': 'Tweet Counts per Day', 'x': 0.5, 'xanchor': 'center'},
)

fig.show()

In [None]:
# Share of each language over the whole dataset. Labels are drawn inside the
# slices and hidden when they would need a font smaller than 12.
fig = (
    px.pie(df_tweets, names='lang', height=700, title='Language proportions')
    .update_traces(textinfo='percent+label', textposition='inside')
    .update_layout(title_x=0.5, uniformtext_minsize=12, uniformtext_mode='hide')
)
fig.show()

In [None]:
# One language-share pie per 'YYYY_MM' bucket, in ascending month order
# (groupby yields its keys sorted).
for y_m, month_tweets in df_tweets.groupby('created_year_month'):
    month_fig = px.pie(
        month_tweets,
        names='lang',
        height=700,
        title=f'Language proportions month {y_m}',
    )
    month_fig.update_traces(textinfo='percent+label', textposition='inside')
    month_fig.update_layout(title_x=0.5, uniformtext_minsize=12, uniformtext_mode='hide')
    month_fig.show()
    fig = month_fig  # keep `fig` bound to the most recent figure, as before

In [None]:
# Extract the 'country_codes' entry from each `withheld` dict; values that are
# not dicts (e.g. missing entries) are passed through unchanged so that a later
# dropna() can discard them.
# isinstance() replaces the exact `type(d) == dict` comparison, so dict
# subclasses are handled as well.
df_tweets['withheld_countries'] = df_tweets['withheld'].apply(
    lambda d: d['country_codes'] if isinstance(d, dict) else d
)

In [None]:
# Display the extracted country-code column (including missing entries).
df_tweets['withheld_countries']

In [None]:
# Only the tweets that carry a withheld country list.
df_tweets['withheld_countries'].dropna()

In [None]:
# Distinct combinations of withheld countries. Each list is converted to a
# tuple first so the values are hashable for unique().
codes_as_tuples = df_tweets['withheld_countries'].dropna().apply(tuple)
codes_as_tuples.unique().tolist()

In [None]:
# One row per (tweet, withheld country): explode the country lists and drop
# rows where either the language or the country is missing.
withheld_lang = (
    df_tweets.loc[:, ['lang', 'withheld_countries']]
    .explode('withheld_countries')
    .dropna()
)
withheld_lang

In [None]:
# Count tweets per (lang, withheld country) pair.
# The pairs are rebuilt from df_tweets instead of from the previous value of
# `withheld_lang`, so this cell yields the same result every time it is run
# (the self-referential form broke when executed a second time, because the
# input already contained a 'count' column).
withheld_pairs = (
    df_tweets[['lang', 'withheld_countries']]
    .explode('withheld_countries')
    .dropna()
)
withheld_lang = withheld_pairs.value_counts().to_frame(name='count').reset_index()
withheld_lang

In [None]:
# Stacked bars of withheld tweets per country, coloured by tweet language and
# ordered by each country's total.
fig = px.bar(
    withheld_lang,
    x='withheld_countries',
    y='count',
    color='lang',
    height=700,
).update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Independent copy of the tweets that have a withheld country list.
has_withheld = df_tweets['withheld_countries'].notna()
withheld_tweets = df_tweets[has_withheld].copy()

In [None]:
# Turn each country list into a tuple so it can serve as a groupby key.
withheld_tweets['withheld_countries'] = withheld_tweets['withheld_countries'].map(tuple)

In [None]:
# Per (country combination, author): number of withheld tweets, the set of
# languages used, and the tweet texts with their timestamps as parallel lists.
grouped_withheld_tweets = (
    withheld_tweets
    .groupby(['withheld_countries', 'author_id'])
    .agg(
        id=('id', 'count'),
        lang=('lang', set),
        text=('text', list),
        created_at=('created_at', list),
    )
    .reset_index()
)
grouped_withheld_tweets

In [None]:
# Resolve every author id through get_handle and show the handle next to the id.
grouped_withheld_tweets['author_handle'] = grouped_withheld_tweets['author_id'].apply(get_handle)

display_columns = [
    'withheld_countries',
    'author_id',
    'author_handle',
    'id',
    'lang',
    'text',
    'created_at',
]
grouped_withheld_tweets[display_columns]

In [None]:
# Ad-hoc lookup of a single hard-coded author id through get_handle.
# NOTE(review): looks like a leftover spot check — confirm it is still needed.
get_handle(100731315)

In [None]:
# Plain-text dump of every (country combination, author) group: a header with
# countries, author id/handle and languages, then each tweet preceded by its
# timestamp, then a separator.
for group in grouped_withheld_tweets.itertuples(index=False):
    print(group.withheld_countries)
    print(group.author_id, '--->', group.author_handle)
    print(group.lang)
    for created, body in zip(group.created_at, group.text):
        print()
        print('  #', created)
        print(body)
    print('\n\n*********\n')

In [None]:
# Tweets from March 2022 whose language code is 'und'.
is_und = df_tweets['lang'] == 'und'
is_march_2022 = df_tweets['created_year_month'] == '2022_03'
und_mar_2022_tweets = df_tweets[is_und & is_march_2022]

In [None]:
# Inspect one example row (the second by position) of the 'und' March-2022 subset.
und_mar_2022_tweets.iloc[1]

In [None]:
# Print the raw text of that same example row.
sample_text = und_mar_2022_tweets['text'].iloc[1]
print(sample_text)

In [None]:
# Write the texts of the 'und' March-2022 tweets to und.csv.
und_mar_2022_tweets['text'].to_csv('und.csv')

In [None]:
import numpy as np

In [None]:
def _has_plain_token(tokens):
    """True if at least one token contains none of '@', 'http' or '#'."""
    return any(
        '@' not in tok and 'http' not in tok and '#' not in tok
        for tok in tokens
    )


# Keep only the texts that contain something besides mentions, links and
# hashtags; selection is done through the index labels of the surviving rows.
has_plain_text = und_mar_2022_tweets.text.str.split().apply(_has_plain_token)
und_mar_2022_tweets_texts = und_mar_2022_tweets.loc[has_plain_text[has_plain_text].index, 'text']

In [None]:
# Display the filtered 'und' texts.
und_mar_2022_tweets_texts

In [None]:
# Write the filtered texts to filtered_und.json.
und_mar_2022_tweets_texts.to_json('filtered_und.json')