In [27]:
# Change workdir
%cd /home/naggar/repos/Optimus

import pandas as pd
import ipywidgets as widgets

import matplotlib.pyplot as plt
from wordcloud import WordCloud

from ipywidgets import interact
from IPython.display import display, HTML

from utils.processor import ArProcessor
from utils.timeless import TimeLess
from utils.logger import logger

display(HTML("<style>pre { white-space: pre !important; }</style>"))
pd.set_option('display.max_columns', None)

/home/naggar/repos/Optimus


In [2]:
dtypes = {'in_reply_to_userid':'str', 'retweet_userid':'str'}
eg_tweets = pd.read_csv(
    '/home/naggar/repos/twitter_accounts_analysis/data/egypt_tweets_2020/egypt_022020_tweets_csv_hashed.csv', dtype=dtypes,
    chunksize=500000, 
    encoding='utf-8'
)

In [None]:
user_groups = []

# Group tweets by user for further analysis
for chunk in eg_tweets:
    
    group = chunk.groupby(['userid'])
    user_groups.append(group)
    
    del group

print(user_groups)

In [None]:
# Create a series of counts of retweets per user
user_ids = [group['is_retweet'].sum() for group in user_groups]

In [None]:
retweet_per_user = pd.concat(user_ids)
retweet_per_user = retweet_per_user.groupby('userid').sum()
retweet_per_user = retweet_per_user.sort_values(ascending=False)

In [None]:
retweet_per_user.to_json("reweets_count_per_account.json")

In [None]:
def check_str(chunk):
    if 'CTky7SvC51cUfDgM9ljMTPhcc2HcH84VC5ivPh+w5hM=' in chunk['tweet_text'].values:
        print("here")

In [3]:
agg_freqs = pd.Series(dtype=int)

In [3]:
processor = ArProcessor()
timey = TimeLess()

## Text Processing: extracting word counts from tweets
### Serial Execution: 30 min.

In [None]:
timey.start()

for i, chunk in enumerate(eg_tweets, start=1):
    # Apply the processing code to the current chunk
    word_freq_chunk = processor.get_count(chunk, "tweet_text")

    # Update the overall word frequency with the current chunk's word frequency
    agg_freqs = agg_freqs.add(word_freq_chunk, fill_value=0)

    # Free up memory after processing the chunk
    del word_freq_chunk

    print(f"Processed Chunk {i}")

timey.end()

### Parallel execution: 7 mins. 48 sec.

In [4]:
from concurrent.futures import ProcessPoolExecutor

In [5]:
def exc_parallel(chunk, i):
    logger.info("Started processing chunk %s", i)
    word_freq_chunk = processor.get_count(chunk, "tweet_text")
    logger.info("Finished processing chunk %s", i)
    return word_freq_chunk

In [6]:
timey.start()
max_workers = 4
futures = []

with ProcessPoolExecutor(max_workers=max_workers) as exc:
    
    for i, chunk in enumerate(eg_tweets, start=1):
        
        if len(futures) == max_workers:
            # Don't feed the executor more than for processes
            while sum(1 for future in futures if future.done()) < max_workers:
                pass
        futures.append(exc.submit(exc_parallel, chunk, i))

timey.end()

2023-08-20 02:14:27,787 - INFO - Started recording
2023-08-20 02:14:31,492 - INFO - Started processing chunk 1
2023-08-20 02:14:34,615 - INFO - Started processing chunk 2
2023-08-20 02:14:37,707 - INFO - Started processing chunk 3
2023-08-20 02:14:40,792 - INFO - Started processing chunk 4
2023-08-20 02:16:25,012 - INFO - Finished processing chunk 1
2023-08-20 02:16:27,615 - INFO - Finished processing chunk 2
2023-08-20 02:16:29,571 - INFO - Finished processing chunk 3
2023-08-20 02:16:32,588 - INFO - Finished processing chunk 4
2023-08-20 02:16:36,500 - INFO - Started processing chunk 5
2023-08-20 02:16:39,555 - INFO - Started processing chunk 6
2023-08-20 02:16:42,624 - INFO - Started processing chunk 7
2023-08-20 02:16:45,722 - INFO - Started processing chunk 8
2023-08-20 02:18:27,555 - INFO - Finished processing chunk 5
2023-08-20 02:18:29,088 - INFO - Started processing chunk 9
2023-08-20 02:18:30,878 - INFO - Finished processing chunk 6
2023-08-20 02:18:32,346 - INFO - Started pr

472.11013650894165

In [None]:
agg_freqs = agg_freqs.sort_values(ascending=False)

In [None]:
agg_freqs.to_json('word_counts.json')

In [None]:
@interact(num_words=widgets.IntSlider(min=5, max=100, step=1, value=10, description='Num Words'))
def plot_wordcloud(num_words):
    top_words = agg_freqs.head(num_words)
    arabic_wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path='/home/naggar/repos/twitter_accounts_analysis/data/fonts/Cairo/static/Cairo-Medium.ttf'
    )
    print(top_words.index)
    arabic_text = ' '.join(top_words.index)
    arabic_wordcloud.generate(arabic_text)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(arabic_wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
import plotly.express as px

agg_freqs = pd.read_json('word_counts.json', typ='series')
agg_freqs = agg_freqs.drop('RT')

In [None]:
@interact(num_words=widgets.IntSlider(min=5, max=100, step=1, value=10, description='Num Words'))
def interactive_bar_chart(num_words):
    
    top_hundred_words = agg_freqs.head(num_words).reset_index()
    top_hundred_words.columns = ['Word', 'Count']
    fig = px.bar(top_hundred_words, x='Word', y='Count', color='Count', text='Word', labels={'Count': 'Counts'},
             title='Top 100 Words and Their Counts')
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(xaxis_title='Words', yaxis_title='Counts', xaxis_tickangle=-45,
                      xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12),
                      title_font=dict(size=16))
    fig.show()

In [None]:
retweets_freqs = pd.read_json('reweets_count_per_account.json', typ='series')
eg_accounts = pd.read_csv('data/hashed_2020_04_egypt_022020_egypt_022020_users_csv_hashed/egypt_022020_users_csv_hashed.csv')

In [None]:
retweets_df = retweets_freqs.reset_index()
retweets_df.columns =  ['userid', 'retweets_count']
retweets_df = retweets_df.merge(eg_accounts,on='userid', how='left')
retweets_df

In [None]:
@interact(num_accounts=widgets.IntSlider(min=5, max=100, step=1, value=10, description='Numeber of retweets per account.'))
def interactive_bar_chart(num_accounts):
    
    top_hundred_words = pd.DataFrame(
        {'Account': retweets_df['user_display_name'].head(num_accounts),
         'Retweets Count': retweets_df['retweets_count'].head(num_accounts)}
    )
                                      
    top_hundred_words.columns = ['Account', 'Retweets Count']
    fig = px.bar(top_hundred_words, x='Account', y='Retweets Count', color='Retweets Count', text='Account', labels={'Retweets Count': 'Retweets Count'},
             title=f'Top {num_accounts} most retweeting accounts.')
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(xaxis_title='Accounts', yaxis_title='Retweets Counts', xaxis_tickangle=-45,
                      xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12),
                      title_font=dict(size=16))
    fig.show()

In [None]:
import networkx as nx
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Initialize Plotly in Jupyter Notebook
init_notebook_mode(connected=True)

# Sample data: Edges represent interactions between accounts
edges = [
    ('user1', 'user2'),
    ('user2', 'user3'),
    ('user3', 'user1'),
    ('user4', 'user1'),
    ('user4', 'user2'),
]

# Create a directed graph from the edges
G = nx.DiGraph()
G.add_edges_from(edges)

# Define layout
layout = nx.spring_layout(G)

# Create nodes and edges for Plotly visualization
node_trace = go.Scatter(
    x=[layout[k][0] for k in G.nodes()],
    y=[layout[k][1] for k in G.nodes()],
    text=list(G.nodes()),
    mode='markers+text',
    hoverinfo='text',
    marker=dict(size=20),  # Increase the size of the nodes
)

edge_trace = go.Scatter(
    x=[layout[e[0]][0] for e in G.edges()],
    y=[layout[e[0]][1] for e in G.edges()],
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines',
)

# Create Plotly figure
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        showlegend=False,
        hovermode='closest',
        margin=dict(b=0, l=0, r=0, t=0),
    ),
)

# Display the interactive visualization in the Jupyter Notebook
iplot(fig)
