In [1]:
import string
import re
import nltk
import pandas as pd
import ipywidgets as widgets

from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from ipywidgets import interact
from IPython.display import display, HTML

# Notebook display tweaks: never truncate DataFrame columns, and inject a CSS
# rule so <pre> output keeps its whitespace instead of wrapping.
pd.set_option('display.max_columns', None)
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [49]:
# Open the hashed tweets CSV as a chunked reader (1,000,000 rows per chunk)
# instead of loading the whole file at once.
# The reader is a one-shot iterator: after a loop has consumed it, it has to be
# re-created before it can be iterated again.
tweets_csv_path = '/home/naggar/repos/twitter_accounts_analysis/data/egypt_tweets_2020/egypt_022020_tweets_csv_hashed.csv'
eg_tweets = pd.read_csv(tweets_csv_path, chunksize=1000000, encoding='utf-8')

In [None]:
# Collect one per-user GroupBy object for every chunk of the tweets reader.
# NOTE(review): each GroupBy presumably keeps a reference to its chunk, so all
# chunks stay in memory for as long as user_groups is alive -- confirm whether
# that is acceptable for the full dataset.
user_groups = []

for chunk in eg_tweets:
    user_groups.append(chunk.groupby(['userid']))

print(user_groups)

In [42]:
# For every chunk, total the 'is_retweet' column per user.
# Despite its name, `user_ids` holds those per-user sums (one result per chunk),
# not user ids.
user_ids = [per_user_group['is_retweet'].sum() for per_user_group in user_groups]

In [45]:
# Stack the per-chunk sums, add up the partial totals of users that appear in
# more than one chunk, and rank users from most to fewest retweets.
retweet_per_user = (
    pd.concat(user_ids)
    .groupby('userid')
    .sum()
    .sort_values(ascending=False)
)

In [51]:
# Persist the per-user retweet totals as JSON (path is relative to the working
# directory). The file name is spelled "reweets"; the later pd.read_json call
# uses the same spelling, so the two must be changed together.
retweet_per_user.to_json("reweets_count_per_account.json")

In [5]:
def check_str(chunk):
    """
    Print "here" when the hard-coded hash equals one of the values in
    chunk['tweet_text'] (whole-value match, not a substring search).
    """
    wanted = 'CTky7SvC51cUfDgM9ljMTPhcc2HcH84VC5ivPh+w5hM='
    tweet_values = chunk['tweet_text'].values
    if wanted in tweet_values:
        print("here")

In [None]:
# Empty integer Series used as the running word-frequency total; the chunk loop
# further down adds each chunk's counts into it.
agg_freqs = pd.Series(dtype=int)

In [3]:
# Translation table that deletes every character of string.punctuation.
# string.punctuation is ASCII-only, so Arabic marks such as "،" or "؟" are kept.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """
    Return `text` with all ASCII punctuation characters deleted.

    Nothing is inserted in place of the removed characters, so words that were
    separated only by punctuation end up joined.
    """
    return text.translate(_PUNCTUATION_TABLE)

def read_arabic_stop_words(file_path):
    """
    Load stop words from a UTF-8 text file, one entry per line.

    Each line is stripped of surrounding whitespace; the result is a set, so
    duplicates collapse (a blank line contributes the empty string).
    """
    stop_words = set()
    with open(file_path, 'r', encoding='utf-8') as handle:
        for line in handle:
            stop_words.add(line.strip())
    return stop_words

# Combined stop-word sets keyed by the Arabic stop-word file path. Filled lazily
# so the file and the NLTK list are loaded once instead of once per tweet.
_STOP_WORDS_CACHE = {}


def _normalize_arabic_letters(text):
    """Map alef variants to bare alef, alef maqsura to ya, and hamza carriers to hamza."""
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    return text


def _get_stop_words(arabic_path='data/stop_words_arabic.txt'):
    """Return the cached union of Arabic (file) and English (NLTK) stop words."""
    if arabic_path not in _STOP_WORDS_CACHE:
        arabic_stop_words = read_arabic_stop_words(arabic_path)
        english_stop_words = set(stopwords.words('english'))
        # Tokens are compared after letter normalisation, so also keep the
        # normalised spelling of every Arabic stop word; otherwise an entry
        # containing e.g. "إ" or "ى" could never match a normalised token.
        normalized_stop_words = {_normalize_arabic_letters(word) for word in arabic_stop_words}
        _STOP_WORDS_CACHE[arabic_path] = frozenset(
            arabic_stop_words | normalized_stop_words | english_stop_words
        )
    return _STOP_WORDS_CACHE[arabic_path]


def preprocess_arabic_text(text):
    """
    Preprocessing pipeline for arabic text.
    unify whitespace -> remove punctuation -> normalize letters
    -> drop other characters -> tokenize -> omit stopwords

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN for a missing tweet)
        is treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Tokens of length 1 and
        tokens whose lowercase form is a stop word are dropped.
    """
    # Missing values reach this function as float NaN / None when it is applied
    # to a pandas column; they have no text to clean.
    if not isinstance(text, str):
        return ''

    # Turn tabs, newlines and other whitespace into plain spaces first. The
    # character filter below keeps only U+0020..U+007E and the Arabic block, so
    # any other whitespace would be deleted and glue neighbouring words together.
    text = re.sub(r'\s+', ' ', text)

    # Remove (ASCII) punctuation
    text = remove_punctuation(text)

    # Normalize Arabic characters
    text = _normalize_arabic_letters(text)

    # Keep only Arabic-block characters and printable ASCII. The Arabic-Indic
    # digit ranges of the previous pattern lie inside U+0600..U+06FF, so this
    # shorter class matches exactly the same characters.
    text = re.sub("[^\u0600-\u06FF\u0020-\u007E]", "", text)

    # NOTE(review): an earlier version substituted punctuation between two word
    # characters with a space at this point. It could never match because
    # remove_punctuation() has already deleted every such character, so it was
    # dropped. If the intent is to split "word,word" into two tokens, that
    # substitution has to run *before* remove_punctuation() -- confirm.

    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    stop_words = _get_stop_words()

    # Filter out stop words and single-character tokens
    filtered_words = [word for word in words if word.lower() not in stop_words and len(word) > 1]

    # Join the filtered words back into a single string
    return ' '.join(filtered_words)


def get_count(chunk):
    """
    Applies preprocessing function and get word count.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain a 'tweet_text' column. A 'cleaned_text' column holding the
        preprocessed text is added to (or overwritten on) this frame.

    Returns
    -------
    pandas.Series
        Word -> number of occurrences in the chunk, most frequent first.
    """
    # Missing tweets are replaced by '' so the preprocessing step only ever
    # receives strings.
    chunk['cleaned_text'] = chunk['tweet_text'].fillna('').apply(preprocess_arabic_text)

    # One row per token. explode() avoids the wide rows-by-longest-tweet frame
    # that split(expand=True) materialises; rows with no tokens become NaN and
    # are ignored by value_counts().
    tokens = chunk['cleaned_text'].str.split().explode()
    return tokens.value_counts()


In [None]:
# Re-open the chunked reader. `eg_tweets` is a one-shot iterator and the
# per-user grouping loop earlier in the notebook consumes it completely, so
# without this a top-to-bottom run would process zero chunks here.
eg_tweets = pd.read_csv(
    '/home/naggar/repos/twitter_accounts_analysis/data/egypt_tweets_2020/egypt_022020_tweets_csv_hashed.csv',
    chunksize=1000000,
    encoding='utf-8'
)

# Start from an empty accumulator so that re-running this cell does not add the
# same counts a second time.
agg_freqs = pd.Series(dtype=int)

for i, chunk in enumerate(eg_tweets, start=1):
    # Add this chunk's word counts to the running total; words missing on
    # either side count as 0.
    agg_freqs = agg_freqs.add(get_count(chunk), fill_value=0)

    print(f"Processed Chunk {i}")

# agg_freqs now contains the word frequency across all chunks
print(agg_freqs)

In [None]:
# Rank words from most to least frequent; the plotting cells below take the
# top entries with .head().
agg_freqs = agg_freqs.sort_values(ascending=False)

In [None]:
# Save the word counts as JSON (relative to the working directory); a later
# cell reloads them from this file with pd.read_json.
agg_freqs.to_json('word_counts.json')

In [40]:
@interact(num_words=widgets.IntSlider(min=5, max=100, step=1, value=10, description='Num Words'))
def plot_wordcloud(num_words):
    """
    Draw a word cloud of the first `num_words` entries of `agg_freqs`.

    `agg_freqs` is expected to be sorted by descending count already, so the
    first entries are the most frequent words. Word size follows the stored
    counts.
    """
    top_words = agg_freqs.head(num_words)
    arabic_wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path='/home/naggar/repos/twitter_accounts_analysis/data/fonts/Cairo/static/Cairo-Medium.ttf'
    )
    # Feed the counts directly. Joining the words into one string and calling
    # generate() re-counts them from that string, where every word occurs
    # exactly once, so the real frequencies were ignored.
    # NOTE(review): Arabic glyph shaping / right-to-left ordering may need an
    # extra reshaping step before rendering -- check the output visually.
    arabic_wordcloud.generate_from_frequencies(top_words.to_dict())

    plt.figure(figsize=(10, 5))
    plt.imshow(arabic_wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

interactive(children=(IntSlider(value=10, description='Num Words', min=5), Output()), _dom_classes=('widget-in…

In [2]:
import plotly.express as px

# Reload the saved word counts and discard the retweet marker token.
agg_freqs = pd.read_json('word_counts.json', typ='series')
# errors='ignore' keeps this cell working when 'RT' is not among the counted
# words (a plain drop would raise KeyError in that case).
agg_freqs = agg_freqs.drop('RT', errors='ignore')

In [5]:
@interact(num_words=widgets.IntSlider(min=5, max=100, step=1, value=10, description='Num Words'))
def interactive_bar_chart(num_words):
    """
    Show a Plotly bar chart of the first `num_words` entries of `agg_freqs`.

    One bar per word, with bar height and colour given by the word's count and
    the word itself printed above the bar.
    """
    top_words = agg_freqs.head(num_words).reset_index()
    top_words.columns = ['Word', 'Count']
    # The title reports the number of words actually selected by the slider.
    fig = px.bar(top_words, x='Word', y='Count', color='Count', text='Word', labels={'Count': 'Counts'},
                 title=f'Top {num_words} Words and Their Counts')
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(xaxis_title='Words', yaxis_title='Counts', xaxis_tickangle=-45,
                      xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12),
                      title_font=dict(size=16))
    fig.show()

interactive(children=(IntSlider(value=10, description='Num Words', min=5), Output()), _dom_classes=('widget-in…

In [None]:
# Inputs for the per-account retweet analysis: the saved per-user retweet
# totals (as a Series) and the hashed users table.
retweets_json_path = 'reweets_count_per_account.json'
accounts_csv_path = 'data/hashed_2020_04_egypt_022020_egypt_022020_users_csv_hashed/egypt_022020_users_csv_hashed.csv'

retweets_freqs = pd.read_json(retweets_json_path, typ='series')
eg_accounts = pd.read_csv(accounts_csv_path)

In [34]:
# Turn the retweet totals into a two-column frame and attach the account
# details; the left join keeps every user from the totals even when the users
# table has no row for it.
retweets_df = (
    retweets_freqs
    .reset_index()
    .set_axis(['userid', 'retweets_count'], axis=1)
    .merge(eg_accounts, on='userid', how='left')
)
retweets_df

Unnamed: 0,userid,retweets_count,user_display_name,user_screen_name,user_reported_location,user_profile_description,user_profile_url,follower_count,following_count,account_creation_date,account_language
0,3587272518,189460,💛Smile💛,aboal3enen,فعلو التنبيهات 🔔,No Dm pls💛\nHijab Model💛\nI love moon💛,,118186,55360,2015-09-16,en
1,869891243999887360,172105,دعاء طارق⁦🇪🇬⁩,DodoTickoo1,,‏‏‏‏مصرية سيساوية ومتجوزة واسكندرانية على منوف...,,23279,1573,2017-05-31,en
2,1664006540,128649,الكايد 🇰🇼,alkad37,,إن مزحنا لأجل تخفيف الطعون ابتلشنا فالصدور الض...,,407280,372096,2013-08-12,en
3,847791625673822208,127214,خدمة اخبارك الكويتية,akhbarkkw,دولة الكويت,نقدم لكم اخبار كويتية لحظة بلحظة ، واخبار خليج...,,297882,235494,2017-03-31,en
4,azuf3RmznMPoQqkpW8WMgWA9SHn1WskzUVEvzeX8+Ac=,98904,azuf3RmznMPoQqkpW8WMgWA9SHn1WskzUVEvzeX8+Ac=,azuf3RmznMPoQqkpW8WMgWA9SHn1WskzUVEvzeX8+Ac=,Bulgaria,,,168,634,2012-03-10,bg
...,...,...,...,...,...,...,...,...,...,...,...
2167,5doKfeSTFdEbWFm5b6vGRX8ZLNQpgUpeZHhVt+5Vjpc=,0,5doKfeSTFdEbWFm5b6vGRX8ZLNQpgUpeZHhVt+5Vjpc=,5doKfeSTFdEbWFm5b6vGRX8ZLNQpgUpeZHhVt+5Vjpc=,,,,2,17,2018-07-23,en
2168,KviHxnDICK1vOMenfyq3CYsAoE8faBK9ln8ZI+p6lSo=,0,KviHxnDICK1vOMenfyq3CYsAoE8faBK9ln8ZI+p6lSo=,KviHxnDICK1vOMenfyq3CYsAoE8faBK9ln8ZI+p6lSo=,,,,0,1,2011-07-14,en
2169,L00zQb4B0O2rJ6loEPIGQrJmjWLtOw2Z0aPiF7qJVE=,0,L00zQb4B0O2rJ6loEPIGQrJmjWLtOw2Z0aPiF7qJVE=,L00zQb4B0O2rJ6loEPIGQrJmjWLtOw2Z0aPiF7qJVE=,,,,0,27,2014-07-26,mk
2170,5DMDc0CGlUDOajnFfP6BtbfbAinVYWXfI6fdR3NX15Y=,0,5DMDc0CGlUDOajnFfP6BtbfbAinVYWXfI6fdR3NX15Y=,5DMDc0CGlUDOajnFfP6BtbfbAinVYWXfI6fdR3NX15Y=,,,,0,0,2018-06-27,en


In [39]:
@interact(num_accounts=widgets.IntSlider(min=5, max=100, step=1, value=10, description='Number of accounts'))
def interactive_bar_chart(num_accounts):
    """
    Show a Plotly bar chart of the first `num_accounts` rows of `retweets_df`.

    One bar per account display name, with bar height and colour given by the
    account's retweet count.

    NOTE(review): this redefines `interactive_bar_chart` from the word-count
    cell above. Both widgets are registered by @interact when their cell runs,
    but the name itself ends up bound to this version.
    """
    top_accounts = (
        retweets_df[['user_display_name', 'retweets_count']]
        .head(num_accounts)
        .rename(columns={'user_display_name': 'Account', 'retweets_count': 'Retweets Count'})
    )

    fig = px.bar(top_accounts, x='Account', y='Retweets Count', color='Retweets Count', text='Account', labels={'Retweets Count': 'Retweets Count'},
                 title=f'Top {num_accounts} most retweeting accounts.')
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(xaxis_title='Accounts', yaxis_title='Retweets Counts', xaxis_tickangle=-45,
                      xaxis_tickfont=dict(size=12), yaxis_tickfont=dict(size=12),
                      title_font=dict(size=16))
    fig.show()

interactive(children=(IntSlider(value=10, description='Numeber of retweets per account.', min=5), Output()), _…

In [None]:
import networkx as nx
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Initialize Plotly in Jupyter Notebook
init_notebook_mode(connected=True)

# Sample data: Edges represent interactions between accounts
edges = [
    ('user1', 'user2'),
    ('user2', 'user3'),
    ('user3', 'user1'),
    ('user4', 'user1'),
    ('user4', 'user2'),
]

# Create a directed graph from the edges
G = nx.DiGraph()
G.add_edges_from(edges)

# Define layout. The seed is fixed so the node positions are the same on every
# run of the cell.
layout = nx.spring_layout(G, seed=42)

# Create nodes and edges for Plotly visualization
node_trace = go.Scatter(
    x=[layout[k][0] for k in G.nodes()],
    y=[layout[k][1] for k in G.nodes()],
    text=list(G.nodes()),
    mode='markers+text',
    hoverinfo='text',
    marker=dict(size=20),  # Increase the size of the nodes
)

# Each edge contributes its source point, its target point and a None; the
# None breaks the line so consecutive edges are not connected to each other.
# (Listing only the source coordinates, as before, draws a path through the
# source nodes instead of the actual edges.)
edge_x = []
edge_y = []
for source, target in G.edges():
    source_x, source_y = layout[source]
    target_x, target_y = layout[target]
    edge_x.extend([source_x, target_x, None])
    edge_y.extend([source_y, target_y, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines',
)

# Create Plotly figure
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        showlegend=False,
        hovermode='closest',
        margin=dict(b=0, l=0, r=0, t=0),
    ),
)

# Display the interactive visualization in the Jupyter Notebook
iplot(fig)


In [None]:
# Empty undirected graph; nodes are added in the next cell.
graph = nx.Graph()

In [None]:
# Add a single node with id 1 carrying the attribute color="red".
graph.add_nodes_from([(1, {"color":"red"})])

In [None]:
# Inspect the entries at positions 180-189 of the word counts.
agg_freqs.iloc[180:190]

In [None]:
# Preview the first rows of the accounts table.
# NOTE(review): `accounts` is not assigned anywhere in the visible notebook
# (the users CSV is loaded as `eg_accounts` / `eg_acc`) -- confirm which frame
# is meant, otherwise this raises NameError on a fresh kernel.
accounts.head()

In [None]:
# Boolean mask: True where the account has a profile URL, False where it is missing.
accounts['user_profile_url'].notna()

In [None]:
# Profile URLs that are actually present. Missing values have to be filtered
# with notna(): `NaN` is not a defined name here, and a `!= NaN` comparison is
# True for every row (NaN is unequal to everything, itself included).
# NOTE(review): `accounts` is not assigned in the visible notebook -- confirm
# which frame is meant.
accounts.loc[accounts['user_profile_url'].notna(), 'user_profile_url']

In [None]:
# Per-column non-null counts for the accounts whose 'account_creation_date' is
# on or after 2019-01-01.
# NOTE(review): if the column holds plain strings this is a lexicographic
# comparison, which matches date order only for zero-padded YYYY-MM-DD values
# -- confirm the column format.
accounts.loc[accounts['account_creation_date'] >= "2019-01-01"].count()

In [None]:
# Load the hashed users table. This is the same CSV path that is read into
# `eg_accounts` earlier in the notebook.
eg_acc = pd.read_csv('data/hashed_2020_04_egypt_022020_egypt_022020_users_csv_hashed/egypt_022020_users_csv_hashed.csv')

In [None]:
# Look up the account row for one hashed user id. The id needs its trailing
# '=' padding: the same value is written with it in check_str, and without it
# the 43-character string cannot equal any of the 44-character hashed ids.
eg_acc.loc[eg_acc['userid'] == 'CTky7SvC51cUfDgM9ljMTPhcc2HcH84VC5ivPh+w5hM=']