Read original tweet dataset:

In [1]:
import json
import bz2

sfile = '../data/monkeypox_tweets_081222.bz2'
    
# Open bz2 and read JSON to test
with bz2.BZ2File(sfile, 'r') as f:
    tweets = []
    for line in f.readlines():
        tweet = json.loads(line.decode().strip("\n"))
        tweets = tweets + tweet

print(len(tweets), "tweets read")

230163 tweets read


In [153]:
import re
import pandas as pd

# Remove non-english tweets
tweets = pd.DataFrame(tweets)
tweets = tweets.loc[tweets['lang'] == 'en']

# Remove tweets that don't contain keywords: "monkey pox", "monkeypox", "moneypox"
query_terms = ["monkey pox", "monkeypox", "moneypox"]
pattern = '|'.join(query_terms)
tweets = tweets[tweets['full_text'].str.contains(pattern, case=False)]

# Generate 10% random sample
sampled = tweets.sample(frac=0.1, random_state=0).reset_index(drop=True)
sampled['date'] = sampled['created_at'].apply(lambda x: x[:7])

print(len(sampled))

19695


Bot testing:

In [42]:
sampled['screen_name'] = sampled['user'].apply(lambda x:x['screen_name'])
len(set(sampled['screen_name']))

16333

In [1]:
""" Check by handle """

import botometer
import os

# Note that the config.py file must be set up with your twitter and botometer access tokens in order to run this code
import config


# Enter your own Twitter API access information to use this function
twitter_app_auth = config.auth_setup()

# twitter_app_auth = {"Authorization": "Bearer {}".format(BEARER_TOKEN)}
bom = botometer.Botometer(wait_on_ratelimit=True,
                          rapidapi_key=private_config.RapidAPI_Key,
                          **twitter_app_auth)

# Check a single account by screen name
result = bom.check_account('brooke_sheltonn')
print(result)

{'cap': {'english': 0.7952555385005927, 'universal': 0.7828265255249504}, 'display_scores': {'english': {'astroturf': 0.4, 'fake_follower': 1.2, 'financial': 1.2, 'other': 1.8, 'overall': 2.0, 'self_declared': 0.0, 'spammer': 0.6}, 'universal': {'astroturf': 0.4, 'fake_follower': 0.3, 'financial': 0.6, 'other': 1.4, 'overall': 1.4, 'self_declared': 0.0, 'spammer': 0.0}}, 'raw_scores': {'english': {'astroturf': 0.07, 'fake_follower': 0.24, 'financial': 0.25, 'other': 0.35, 'overall': 0.41, 'self_declared': 0.0, 'spammer': 0.11}, 'universal': {'astroturf': 0.08, 'fake_follower': 0.06, 'financial': 0.12, 'other': 0.28, 'overall': 0.29, 'self_declared': 0.0, 'spammer': 0.01}}, 'user': {'majority_lang': 'en', 'user_data': {'id_str': '487022346', 'screen_name': 'brooke_sheltonn'}}}


Cleaning:

In [154]:
# remove URL
sampled['text_proc'] = sampled['full_text'].str.replace(r'http(\S)+', r'')
sampled['text_proc'] = sampled['text_proc'].str.replace(r'http ...', r'')
sampled['text_proc'] = sampled['text_proc'].str.replace(r'http', r'')
sampled[sampled['text_proc'].str.contains(r'http')]

# remove query terms
sampled['text_proc'] = sampled['text_proc'].str.replace(r'monkey pox', r'', case=False)
sampled['text_proc'] = sampled['text_proc'].str.replace(r'money pox', r'', case=False)
sampled['text_proc'] = sampled['text_proc'].str.replace(r'monkeypox', r'', case=False)

# remove words beginning with @ (usernames)
sampled['text_proc'] = sampled['text_proc'].str.replace(r'@[\S]+',r'')

# remove retweets from sample
sampled = sampled[~sampled['text_proc'].str.startswith('RT')]
print(len(sampled))

# remove non-ascii words and characters
sampled['text_proc'] = [''.join([i if ord(i) < 128 else '' for i in text]) for text in sampled['text_proc']]
sampled['text_proc'] = sampled['text_proc'].str.replace(r'_[\S]?',r'')

# remove &, < and >
sampled['text_proc'] = sampled['text_proc'].str.replace(r'&amp;?',r'and')
sampled['text_proc'] = sampled['text_proc'].str.replace(r'&lt;',r'<')
sampled['text_proc'] = sampled['text_proc'].str.replace(r'&gt;',r'>')

# remove extra space
sampled['text_proc'] = sampled['text_proc'].str.replace(r'[ ]{2, }',r' ')

# lower case and strip white spaces at both ends
sampled['text_proc'] = sampled['text_proc'].str.lower()
sampled['text_proc'] = sampled['text_proc'].str.strip()

# calculate tweet length and remove tweets with fewer than 3 words
sampled['text_proc_length'] = [len(text.split(' ')) for text in sampled['text_proc']]
sampled['text_proc_length'].value_counts()

sampled = sampled[sampled['text_proc_length']>3]
sampled = sampled.drop_duplicates(subset=['text_proc'])

sampled = sampled.reset_index()

print(len(sampled))

  
  This is separate from the ipykernel package so we can avoid doing imports until
  del sys.path[0]


19689




17646


In [156]:
sampled['text_proc'][0]

'election time.  get the tp back on face.  i laugh watching people alone in car windows down with mask.   time.   ill pick d for dumb to solve puzzle'

In [157]:
sampled.to_pickle("/l/mesur/aedinge/monkeypox_twitter/010sample_083022.pkl")