In [6]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from twitter_scraper import settings
# from twitter_scraper.text.tweets import detect_language
from twitter_scraper.text.tweets import clean_twitter_text
from twitter_scraper.clean.users import USER_DTYPE
from twitter_scraper.clean.tweets import TWEET_DTYPE


users_df = pd.read_csv(settings.USERS_CSV, dtype=USER_DTYPE)
tweets_df = pd.read_csv(settings.TWEETS_CSV, dtype=TWEET_DTYPE)
retweets_df = pd.read_csv(settings.EDGES_RETWEETS_CSV)
retweets_df = retweets_df.loc[retweets_df['source'] != retweets_df['target']]

user_idx = users_df[['user_id', 'screen_name']].set_index('user_id').to_dict()['screen_name']
retweets_df['source'] = retweets_df['source'].transform(lambda x: user_idx[x])
retweets_df['target'] = retweets_df['target'].transform(lambda x: user_idx[x])
retweets_df['full_text'] = retweets_df['full_text'].transform(clean_twitter_text)
# retweets_df['langid'] = retweets_df['full_text'].transform(detect_language)

[INFO] 2022-10-29 09:05:19 langid.langid - initializing identifier


In [39]:
hr_text = retweets_df[['full_text', 'langid']].loc[4]['full_text']
print(hr_text)
retweets_df.loc[4]['rt_tweet_id']
tweets_df.loc[tweets_df['id'] == retweets_df.loc[4]['og_tweet_id']]['full_text'].to_dict()
from twitter_scraper.utils import fileio
import os
# fileio.read_content(os.path.join(settings.USER_TWEETS_DIR, '790921536.json'), 'json')
tweets_df[['id', 'full_text']].set_index('id').to_dict()['full_text']

Čestitamo!  branit će hrvatske boje u grupnoj fazi , među najboljim europskim klubovima, nakon što j…


{1574472360173998098: '@AndrewRangeley @ChrisDeMuthJr What are in his view best risk/reward pockets of the market with the recent sell off',
 1572662647367122946: 'RT @CrossTheAges: 𝑻𝑾𝑰𝑳𝑰𝑮𝑯𝑻 𝑰𝑺 𝑺𝑳𝑶𝑾𝑳𝒀 𝑭𝑨𝑫𝑰𝑵𝑮 𝑨𝑾𝑨𝒀...\n\nEarly Access is coming.\n\n🚨 RETWEET and FOLLOW this account to get a chance to win a PR…',
 1568201057402032129: '@hkuppy AFU reached outskirts of the city. Progress itself is not known',
 1536689424884449283: 'Per shareholder letter Coinbase went from 1717 to 4948 employees last year. That would mean 890 people. $COIN https://t.co/nrrKV94sk0',
 1535332169035534336: '@AR_UKR_JW @theRealWattwurm @georgian_legion @UkraineAidOps @Harri_Est @UAarmy_animals Hi Alan, do you have any info on that fundraiser for radios? Since goal was achieved what is happening with it? Are they on the way? Thank you for all your effort!',
 1518262961688793090: '@shahh @CrossTheAges @UbisoftFR @TheSandboxGame @tomach_ @ninomihovilic @hrvojebrezak',
 1516849600925216772: '@Crosstheages and @Wond

In [None]:
# retweet_tweet_ids = retweets_df.og_tweet_id.value_counts().index
retweets_df.og_tweet_id.value_counts()

In [None]:
tweets_df.langid.value_counts()[:15].plot.bar(rot=0)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(24, 8))

source_retweets_df = pd.DataFrame(retweets_df.source.value_counts())[:15].join(users_df[['user_id', 'screen_name']].set_index('user_id')).sort_values('source', ascending=False)
source_retweets_df.plot.bar(x='screen_name', y='source', rot=30, title='Most retweeted sources', ax=axs[0])

target_retweets_df = pd.DataFrame(retweets_df.target.value_counts())[:15].join(users_df[['user_id', 'screen_name']].set_index('user_id')).sort_values('target', ascending=False)
target_retweets_df.plot.bar(x='screen_name', y='target', rot=30, title='Most retweeting targets', ax=axs[1])
plt.show()

In [None]:
tweets_df[tweets_df['id'] == str(1501227789437255680)]

In [None]:
retweets_df.loc[retweets_df['source'].isin(source_retweets_df.index) | retweets_df['target'].isin(target_retweets_df.index)]

In [None]:
fig, ax = plt.subplots(figsize=(18, 15))


G = nx.from_pandas_edgelist(retweets_df.loc[retweets_df['source'].isin(source_retweets_df.index) | retweets_df['target'].isin(target_retweets_df.index)], edge_attr='rt_time_elapsed_sec')
nx.draw_spring(G, node_size=20, ax=ax, labels=user_idx)

In [None]:
G = nx.from_pandas_edgelist(retweets_df, create_using=nx.DiGraph)
dg_centrality = nx.degree_centrality(G)
dg_centrality = [{'user_id': key, 'user_name': users_df.loc[users_df['user_id'] == key].screen_name.values[0], 'centrality': value} for key, value in dg_centrality.items()]
sorted_dg_centrailty = sorted(dg_centrality, key=lambda x: x['centrality'], reverse=True)
best_dg_centrality = sorted_dg_centrailty[:10]
best_dg_centrality

In [None]:
from twitter_scraper.utils import fileio
from twitter_scraper.text.tweets import get_stemmed_text

best_dg_centrality_user_ids = [item['user_id'] for item in best_dg_centrality]

def get_central_user_tweets():
    user_tweet_records = tweets_df[tweets_df['user_id'].isin(best_dg_centrality_user_ids)][['user_id', 'full_text']].to_dict(orient='records')
    central_user_tweets = {}
    for item in user_tweet_records:
        user_id = item['user_id']
        if user_id in central_user_tweets:
            central_user_tweets[user_id].append(item['full_text'])
        else:
            central_user_tweets[user_id] = [item['full_text']]
    return central_user_tweets


central_user_tweets = get_central_user_tweets()
most_occuring_keywords = {}
for user_id in best_dg_centrality_user_ids:
    word_count = most_occuring_keywords.get(user_idx[user_id], {})
    for text in central_user_tweets[user_id]:
        text = get_stemmed_text(text)
        for word in text:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    most_occuring_keywords[user_idx[user_id]] = word_count
# fileio.write_content('most_occuring_keywords.json', most_occuring_keywords, 'json')
print(most_occuring_keywords[user_idx[user_id]])

In [None]:
tweets_df[tweets_df['user_id'].isin(best_dg_centrality_user_ids)][['user_id', 'full_text']].to_dict(orient='records')

In [None]:
import tweepy

conn_details = settings.connections['andhrelja']
auth = tweepy.OAuthHandler(conn_details['consumer_key'], conn_details['consumer_secret'])
auth.set_access_token(conn_details['access_key'], conn_details['access_secret'])
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
retweets = api.get_retweets(1555473887336472576)

In [None]:
len(retweets)

In [None]:
pd.DataFrame(api.supported_languages()).sort_values('code').reset_index(drop=True)