In [147]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize

# Load Data

The data was prepared by the authors of [this paper](https://arxiv.org/pdf/2307.16336.pdf) and is available [here](https://www.dropbox.com/scl/fo/l49jls7vvz4tgbnv9drnf/h?rlkey=yu44uqhziglxzsjkc5l7cu0uh&dl=0).

In [2]:
# The dataset is newline-delimited JSON: `lines=True` tells pandas to parse
# one JSON record per line of the file.
dataset_path = './data/fox8_23_dataset.ndjson'
df = pd.read_json(dataset_path, lines=True)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,user_id,label,dataset,user_tweets
0,16905397,human,botometer-feedback,"[{'contributors': None, 'truncated': True, 'te..."
1,2717053344,human,botometer-feedback,"[{'contributors': None, 'truncated': False, 't..."
2,297051227,human,botometer-feedback,"[{'contributors': None, 'truncated': False, 't..."
3,282275320,human,botometer-feedback,"[{'contributors': None, 'truncated': False, 't..."
4,1663020151,human,botometer-feedback,"[{'contributors': None, 'truncated': False, 't..."


The format of the data is described [here](https://github.com/osome-iu/AIBot_fox8). The `user_tweets` column contains tweet objects that follow [this](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet) schema.

In [3]:
# Unnest the per-account tweet lists: each element of `user_tweets` becomes its
# own row, with the account-level columns repeated and the original row index kept.
df_exploded = df.explode(column='user_tweets')
# Print the row count, dtypes and memory footprint of the exploded frame.
df_exploded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 368028 entries, 0 to 2279
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      368028 non-null  int64 
 1   label        368028 non-null  object
 2   dataset      368028 non-null  object
 3   user_tweets  368028 non-null  object
dtypes: int64(1), object(3)
memory usage: 14.0+ MB


# Explode Data

In [118]:
# Tweet-object keys kept for the analysis; every other key of the tweet object is dropped.
rel_tweet_fields = ['created_at', 'text', 'id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'is_quote_status']

# Build all per-tweet columns with a single DataFrame construction instead of
# `Series.apply(pd.Series)`, which creates one Series object per row and is very
# slow on a frame of this size. `df_exploded` itself is left untouched so this
# cell can be re-run without depending on its own earlier side effects.
tweet_records = [
    {field: tweet[field] for field in rel_tweet_fields}
    for tweet in df_exploded['user_tweets']
]
# Re-use the (duplicated) exploded index so the column-wise concat lines up row for row.
df_user_tweets_exploded = pd.DataFrame(tweet_records, columns=rel_tweet_fields, index=df_exploded.index)

# NOTE(review): the reply-id columns contain missing values and therefore end up
# as float64; 64-bit Twitter IDs above 2**53 cannot be represented exactly as
# floats -- confirm whether exact reply IDs matter downstream.
df_full_exploded = pd.concat([df_exploded.drop(columns='user_tweets'), df_user_tweets_exploded], axis=1)

# Parse timestamps with an explicit format rather than letting pandas fall back
# to element-by-element dateutil parsing (slow, and it emits a "Could not infer
# format" warning). The format is the Twitter API v1.1 `created_at` layout,
# e.g. "Wed Oct 10 20:19:24 +0000 2018" -- assumes every record follows it.
df_full_exploded['created_at'] = pd.to_datetime(
    df_full_exploded['created_at'],
    format='%a %b %d %H:%M:%S %z %Y',
    utc=True,
)
df_full_exploded.head()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,user_id,label,dataset,created_at,text,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,favorite_count,favorited,retweeted,is_quote_status
0,16905397,human,botometer-feedback,2018-04-18 14:09:01+00:00,@christophkoeck In Diskussionen in D erlebe ic...,986607535674322945,9.864804e+17,927857246.0,0,0,False,False,False
0,16905397,human,botometer-feedback,2018-04-18 13:59:28+00:00,"Zwei Lehrerinnen, zwölf Schüler und ein paar S...",986605134649081858,,,0,1,False,False,False
0,16905397,human,botometer-feedback,2018-04-18 13:45:14+00:00,Antisemitismus in Deutschland =&gt; Adam Armus...,986601549194579968,,,2,1,False,False,False
0,16905397,human,botometer-feedback,2018-04-18 08:35:02+00:00,Bayerische Politik und ihr Menschenbild – Das ...,986523487807426560,,,1,0,False,False,False
0,16905397,human,botometer-feedback,2018-04-18 05:08:45+00:00,Es kommt doch auf den Lehrer an! #EDchatDE #Tw...,986471574818689024,9.864714e+17,16905397.0,0,0,False,False,False


# EDA

Understand difference in # of tweets between bots and humans.

In [142]:
# Number of tweets (rows with a non-null `text`) for each label, drawn as a bar chart.
tweets_per_label = df_full_exploded.groupby('label')['text'].count()
fig = px.bar(tweets_per_label)
fig.update_layout(xaxis_title='Label', yaxis_title='# Tweets')
# Save the interactive figure as a standalone HTML file.
fig.write_html("./plots/tweets_comparison_1.html")

In [140]:
# One row per (label, user_id) pair holding that account's tweet count in `text`.
num_tweets = (
    df_full_exploded
    .groupby(['label', 'user_id'])['text']
    .count()
    .reset_index()
)
# Horizontal violins (one per label) with an embedded box plot and every account drawn as a point.
fig = px.violin(num_tweets, x="text", y="label", box=True, points='all')
fig.update_layout(yaxis_title='Source', xaxis_title='# Tweets')
fig.write_html("./plots/tweets_comparison.html")

Understand difference in # of likes between bots and humans.

In [141]:
# One row per (label, user_id) pair holding the summed `favorite_count` of that account's tweets.
num_favs = (
    df_full_exploded
    .groupby(['label', 'user_id'])['favorite_count']
    .sum()
    .reset_index()
)
# Horizontal violins (one per label) with an embedded box plot and every account drawn as a point.
fig = px.violin(num_favs, x="favorite_count", y="label", box=True, points="all")
fig.update_layout(yaxis_title='Source', xaxis_title='# Likes')
fig.write_html("./plots/likes_comparison.html")

Build a graph of user interactions. View the graph on [CosmoGraph](https://cosmograph.app/).

In [13]:
# Directed reply graph: one row per (author, replied-to account) pair with the
# number of replies between them. `groupby` drops rows whose key is missing, so
# tweets that are not replies (NaN `in_reply_to_user_id`) do not produce edges.
user_interaction_edge_list = (
    df_full_exploded
    .groupby(['user_id', 'in_reply_to_user_id'])['text']
    .count()
    .reset_index()
    # Pin the width to 64 bits: plain `int` maps to the platform C long, which
    # is 32-bit on Windows with NumPy < 2 and cannot hold Twitter IDs.
    .astype({'in_reply_to_user_id': 'int64'})
    .rename(columns={'user_id': 'from', 'in_reply_to_user_id': 'to', 'text': 'num_replies'})
)
# NOTE(review): `in_reply_to_user_id` arrives here as float64 (see the frame
# preview above), so IDs larger than 2**53 may already have been rounded
# before this cast -- confirm if exact IDs are required.
user_interaction_edge_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91689 entries, 0 to 91688
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   from         91689 non-null  int64
 1   to           91689 non-null  int64
 2   num_replies  91689 non-null  int64
dtypes: int64(3)
memory usage: 2.1 MB


In [14]:
# Per-account node attributes for the labelled accounts: tweet count plus
# summed retweets and likes.
user_list = (
    df_full_exploded
    .groupby(['user_id', 'label'])
    .agg({'text': 'count', 'retweet_count': 'sum', 'favorite_count': 'sum'})
    .reset_index()
    .assign(
        # Explicit 64-bit width: plain `int` is the platform C long, which is
        # 32-bit on Windows with NumPy < 2 and too narrow for Twitter IDs.
        user_id=lambda users: users['user_id'].astype('int64'),
        # Numeric label code for the graph tool: 1 = human, 0 = anything else (bot).
        label=lambda users: (users['label'] == 'human').astype('int64'),
    )
)

# Every account that received at least one reply must also appear as a node.
# Merging on one explicitly named key (instead of `left_on='user_id', right_on=0`)
# keeps the key column complete for reply-only accounts without relying on how
# pandas fills mismatched key columns in an outer join.
replied_to_ids = pd.DataFrame({'user_id': user_interaction_edge_list['to'].unique()})
user_list = (
    user_list
    .merge(replied_to_ids, on='user_id', how='outer')
    # Accounts that only appear as reply targets have no label: code them 0.5 ("unknown").
    .fillna({'label': 0.5})
    .rename(columns={'user_id': 'id'})
)
user_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48983 entries, 0 to 48982
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              48983 non-null  int64  
 1   label           48983 non-null  float64
 2   text            2279 non-null   float64
 3   retweet_count   2279 non-null   float64
 4   favorite_count  2279 non-null   float64
dtypes: float64(4), int64(1)
memory usage: 1.9 MB


In [55]:
# Write the edge list and the node list to CSV; `index=False` omits the
# DataFrame index so the files contain only the data columns.
for frame, csv_path in (
    (user_interaction_edge_list, './data/user_edge_list.csv'),
    (user_list, './data/user_node_list.csv'),
):
    frame.to_csv(csv_path, index=False)

Analyze user interactions.

In [94]:
# Left-join every tweet to the node record of the account it replies to.
# Tweets that are not replies keep NaN in `reply_id` / `reply_label`.
interactions = (
    df_full_exploded
    .merge(user_list, left_on='in_reply_to_user_id', right_on='id', how='left')
    .loc[:, ['created_at', 'user_id', 'label_x', 'text_x', 'id_y', 'label_y']]
    .rename(columns={'user_id': 'post_id', 'label_x': 'post_label', 'text_x': 'text', 'id_y': 'reply_id', 'label_y': 'reply_label'})
)


def _proportion(part, whole):
    """Return ``part / whole`` as a float, or NaN when ``whole`` is zero."""
    return part / whole if whole else float('nan')


# Label codes used in `user_list`: 1 = human, 0 = bot, 0.5 = account that only
# appears as a reply target and therefore has no label.
reply_target_names = {0: 'Bots', 1: 'Humans', 0.5: 'Unknown Entities'}

# Compute the "is a reply" mask once instead of re-deriving it for every statistic.
is_reply = interactions['reply_id'].notna()
num_replies = int(is_reply.sum())

print(f"Proportion of Tweets that Are Replies: {_proportion(num_replies, len(interactions))}")

# For each author group: its share of all replies, then the breakdown of whom
# that group's replies are addressed to. Counting with boolean masks (rather
# than `value_counts()[label]`) yields 0 instead of a KeyError when a group has
# no replies, and `_proportion` avoids a ZeroDivisionError in that case.
for author_label, author_name, target_order in (
    ('human', 'Humans', (0, 1, 0.5)),
    ('bot', 'Bots', (1, 0, 0.5)),
):
    replies_by_author = is_reply & (interactions['post_label'] == author_label)
    num_author_replies = int(replies_by_author.sum())
    print()
    print(f"Proportion of Replies that Are {author_name}: {_proportion(num_author_replies, num_replies)}")
    for target_code in target_order:
        num_hits = int((replies_by_author & (interactions['reply_label'] == target_code)).sum())
        print(f"Proportion of {author_name} that Replied to {reply_target_names[target_code]}: {_proportion(num_hits, num_author_replies)}")


Proportion of Tweets that Are Replies: 0.34577533806254396

Proportion of Replies that Are Humans: 0.4192720192202001
Proportion of Humans that Replied to Bots: 0.0
Proportion of Humans that Replied to Humans: 0.0828636167862025
Proportion of Humans that Replied to Unkown Entities: 0.9171363832137975

Proportion of Replies that Are Bots: 0.5807279807797999
Proportion of Bots that Replied to Humans: 0.0002974379774217535
Proportion of Bots that Replied to Bots: 0.03424592712769553
Proportion of Bots that Replied to Unkown Entities: 0.9654566348948828


Understand common words used in bot tweets.

In [138]:
# Tokens and phrases excluded from the bot word cloud, on top of the library's default list.
extra_stopwords = {
    'https', 't', 'co', 'take', 'to', 'be', 'in', 'the',
    'to see', 'in the', 'to be', 'to hear', 'for the',
}
stopwords = STOPWORDS.union(extra_stopwords)

# Concatenate every bot tweet into one space-separated string for the word cloud.
is_bot = df_full_exploded['label'] == 'bot'
bot_corpus = ' '.join(df_full_exploded.loc[is_bot, 'text'].values)
wc = WordCloud(width=800, height=500, stopwords=stopwords).generate(bot_corpus)

# Render the cloud without axes, save it to disk, then close the figure.
wc_figure = plt.figure(figsize=(20, 10))
plt.imshow(wc)
plt.axis('off')
wc_figure.savefig('./plots/bot_tweet_wc.png')
plt.close(wc_figure)

Understand common words in human tweets.

In [137]:
# Tokens and phrases excluded from the human word cloud, on top of the library's default list.
extra_stopwords = {'https', 't', 'co', 'take', 'https t', 't co'}
stopwords = STOPWORDS.union(extra_stopwords)

# Concatenate every human tweet into one space-separated string for the word cloud.
is_human = df_full_exploded['label'] == 'human'
human_corpus = ' '.join(df_full_exploded.loc[is_human, 'text'].values)
wc = WordCloud(width=800, height=500, stopwords=stopwords).generate(human_corpus)

# Render the cloud without axes, save it to disk, then close the figure.
wc_figure = plt.figure(figsize=(20, 10))
plt.imshow(wc)
plt.axis('off')
wc_figure.savefig('./plots/human_tweet_wc.png')
plt.close(wc_figure)

Analyze tweet activity over time.

In [131]:
# Count tweets per label per calendar day. Grouping on the raw `created_at`
# value (second resolution) yields roughly one point per tweet, which hides the
# activity trend the chart is meant to show; flooring to the day gives one
# point per label per date.
tweets_per_day = (
    df_full_exploded
    .assign(created_at=lambda tweets: tweets['created_at'].dt.floor('D'))
    .groupby(['label', 'created_at'])['text']
    .count()
    .reset_index()
)
fig = px.line(tweets_per_day, x="created_at", y="text", color="label", symbol="label")
fig = fig.update_layout(xaxis_title='Date', yaxis_title='# Tweets')
fig.write_html("./plots/tweet_activity_time.html")

Analyze tokens.

In [162]:
# No cell shown in this notebook creates a `num_tokens` column, so on a fresh
# kernel this cell would fail with a KeyError. Use the column when it already
# exists; otherwise derive it here without mutating `df_full_exploded`.
# NOTE(review): the fallback assumes the token count is the number of NLTK
# `word_tokenize` tokens per tweet (that tokenizer is imported at the top of
# the notebook but never used) -- confirm against the original definition.
# `word_tokenize` needs the NLTK tokenizer data to be installed.
if 'num_tokens' in df_full_exploded.columns:
    tweets_with_tokens = df_full_exploded
else:
    tweets_with_tokens = df_full_exploded.assign(
        # A plain list sidesteps index alignment on the duplicated exploded index.
        num_tokens=[len(word_tokenize(text)) for text in df_full_exploded['text']]
    )

# Mean tokens per tweet for every (label, user_id) pair.
num_toks = (
    tweets_with_tokens
    .groupby(['label', 'user_id'])['num_tokens']
    .mean()
    .to_frame()
    .reset_index()
)
fig = px.violin(num_toks, y="label", x="num_tokens", box=True, points='all')
fig.update_layout(xaxis_title='# Tokens', yaxis_title='Source')
fig.write_html("./plots/tok_comparison.html")

### Next Steps
- Ask ChatGPT to provide classification and confidence on whether tweet is bot or human generated: https://chat.openai.com/share/0d1ce518-2043-4589-92e9-2abf0e4414b5.
- Use open source ai-content detector on HF: https://huggingface.co/openai-community/roberta-base-openai-detector?text=I+like+you.+I+love+you.