# Manual Tweets Collector

In [1]:
import os, re, json, time
import tweepy
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# ------------ Helper functions for creating timestamps ------------
def today(backward_days: int):
    """Return midnight (00:00:00) of the UTC day `backward_days` days ago.

    The result is a naive `datetime` (no tzinfo) with all time fields
    zeroed, derived from the current UTC clock.
    """
    shifted = datetime.utcnow() - timedelta(days=backward_days)
    # Truncate to the start of that day.
    return shifted.replace(hour=0, minute=0, second=0, microsecond=0)

def file_timestamp(datetime):
    """Format a datetime as a compact 'YYMMDD' string for use in file names.

    Note: the parameter name shadows the imported `datetime` class inside
    this function; here it always refers to the instance passed in.
    """
    return datetime.strftime('%y%m%d')

In [3]:
class TweetsCollector:
    """Count, search and anonymise recent tweets through a tweepy client.

    The bearer token is read from the environment when the class body is
    executed, so the environment must be populated (e.g. by `load_dotenv()`)
    before this class is defined.
    """

    # bearer_token = os.getenv('TWITTER_BEARER_TOKEN')
    bearer_token = os.getenv('TWITTER_BEARER_TOKEN_2')  # Backup token
    tweet_fields = ['created_at']

    # Compiled once and reused for every tweet.
    _USERNAME_PATTERN = re.compile(r"(?<![\w@!#$%&*])(@\w{1,15})\b")  # Match '@username'
    _URL_PATTERN = re.compile(
        r"(?:https://|http://)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&//=]*"
    )

    def __init__(self):
        """Create the tweepy client; it is asked to wait on rate limits itself."""
        self.client = tweepy.Client(self.bearer_token, wait_on_rate_limit=True)

    @staticmethod
    def _replace_with_aliases(pattern, prefix: str, content: str) -> str:
        """Replace every match of `pattern` in `content` with '<prefix>_NN'.

        Aliases are numbered from 01 in order of first appearance, and a
        token that occurs several times always receives the same alias.
        The substitution is done in a single pass over whole matches, so a
        token that is a prefix of another (e.g. '@ab' vs '@abc') cannot
        corrupt the longer one.
        """
        aliases = {}

        def alias_for(match):
            token = match.group(0)
            if token not in aliases:
                aliases[token] = f'{prefix}_{len(aliases) + 1:02}'
            return aliases[token]

        return pattern.sub(alias_for, content)

    def _anonymise_data(self, content: str) -> str:
        """Return `content` with '@username' mentions and URLs replaced by aliases.

        Usernames are replaced first, then URLs are searched in the
        already username-anonymised text (same order as before).
        """
        content = self._replace_with_aliases(self._USERNAME_PATTERN, 'USERNAME', content)
        content = self._replace_with_aliases(self._URL_PATTERN, 'URL', content)
        return content

    def anonymise_tweets_list(self, tweets_list: list) -> list:
        """Return a new list of tweets whose 'content' has been anonymised.

        Args:
            tweets_list: dicts with the keys 'creation_date' and 'content'.

        Returns:
            A new list of dicts with the same 'creation_date' and the
            anonymised 'content'. The input dicts are not modified.

        Raises:
            Exception: whatever the anonymisation raised; the index and raw
            content of the offending tweet are printed before re-raising.
        """
        new_list = []
        for index, tweet in enumerate(tweets_list):
            content = tweet['content']

            try:
                content = self._anonymise_data(content)
            except Exception:
                print(f"Error occurs in: index [{index}] :\n{tweet['content']}", end="\r")
                raise

            new_list.append({
                "creation_date": tweet["creation_date"],
                "content": content
            })

        return new_list

    def count_tweet(self, query, mute=False):
        """Fetch per-day recent tweet counts for `query` and optionally print them.

        Args:
            query: search query passed to `get_recent_tweets_count`.
            mute: when true, skip printing and just return the response.

        Returns:
            The response object returned by the client.
        """
        counts = self.client.get_recent_tweets_count(query=query, granularity='day')

        if mute:
            return counts

        total_count = counts.meta['total_tweet_count']
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        month_day = re.compile(r'\d{4}-(\d{2}-\d{2})')

        lines = []
        for count in counts.data:
            start_time = month_day.search(count['start']).group(1)
            end_time = month_day.search(count['end']).group(1)
            lines.append(f"{start_time} => {end_time} :  {count['tweet_count']}\n")

        # The average divides by a fixed 7 days, matching the printed label.
        str_print = f"Average: {total_count/7:.0f}/day\nTotal : {total_count} in 7 days\n\n" + "".join(lines)
        print(str_print)

        return counts

    def limit_handler(self, paginator):
        """Yield items from `paginator`, sleeping when a rate-limit error is raised.

        NOTE(review): if `paginator` is a generator (as `Paginator.flatten`
        appears to be), an exception raised inside it finishes the generator,
        so the retry after the sleep would hit StopIteration and end the
        stream silently. The client is created with `wait_on_rate_limit=True`,
        so this branch is presumably never reached in practice - confirm.
        """
        while True:
            try:
                item = next(paginator)
            except StopIteration:
                break
            except tweepy.errors.TooManyRequests:
                print('\nReached rate limite. Sleeping for >15 minutes')
                time.sleep(15 * 61)
            else:
                yield item

    def search_tweets_pagination(self, query: str, num: int, start_date, end_date):
        """Return a generator over at most `num` recent tweets matching `query`.

        Args:
            query: search query passed to `search_recent_tweets`.
            num: maximum number of tweets to yield (`flatten(limit=num)`).
            start_date: forwarded as `start_time`.
            end_date: forwarded as `end_time`.
        """
        tweets = self.limit_handler(
            tweepy.Paginator(
                self.client.search_recent_tweets,
                query=query,
                max_results=100,  # max limit: 100
                tweet_fields=self.tweet_fields,
                start_time=start_date,
                end_time=end_date,
            ).flatten(limit=num)
        )

        return tweets

    def convert_tweets_to_dataframe(self, tweets: "tweepy.Response") -> "pd.DataFrame":
        """Placeholder: not implemented yet, currently returns None.

        The annotations are quoted so they are not evaluated when the class
        is defined. The earlier draft is kept below for reference.
        """
        # tweets_list = []
        # # with pagination: for tweet in tweets
        # # without pagination: for tweet in tweets.data
        # for tweet in tweets:
        #     set_tweet_data= {
        #         'created_at': tweet.created_at,
        #         'text': self._anonymise_data(tweet.text)
        #     }
        #     tweets_list.append(set_tweet_data)
        #
        # df = pd.DataFrame(tweets_list)
        # return df
        ...

    def convert_tweets_to_list_of_dict(self, tweets) -> list:
        """Convert an iterable of tweets into a list of plain dicts.

        Args:
            tweets: iterable of objects exposing a `data` mapping with the
                keys 'created_at' and 'text' (the notebook passes the
                generator returned by `search_tweets_pagination`).

        Returns:
            A list of dicts with the keys 'creation_date' and 'content'.
        """
        tweets_list = []
        # with pagination:     for tweet in tweets
        # without pagination:  for tweet in tweets.data
        for i, tweet in enumerate(tweets):
            tweets_list.append({
                "creation_date": tweet.data["created_at"],
                "content": tweet.data["text"]
            })
            # Progress counter rewritten in place on the same output line.
            print(f'Current number: {i+1}', end='\r')
        print('\n\033[32mConvertion successfully finished!\033[0m')
        # tweets_json = json.dumps(tweets_list, indent=2)
        return tweets_list

# Shared collector instance used by the cells below.
collector = TweetsCollector()

# Search query: COVID-related keywords, with the operators `-is:retweet`
# and `lang:en` appended.
query = "(covid OR covid19 OR covid-19 OR coronavirus OR (corona virus) OR pandemic) -is:retweet lang:en"

### Count tweets

In [4]:
# Define the one-day collection window [start_time, end_time) in UTC.
backward_days = 1   # Set to 1 to collect yesterday's (Max: 6)
collection_date = today(backward_days)
start_time = collection_date
# Derive the end from the start instead of calling today() a second time,
# so the window is exactly one day even if UTC midnight passes in between.
end_time = start_time + timedelta(days=1)
print(f"The collection date: '{collection_date.strftime('%Y-%m-%dT%H:%M:%SZ')}' -> \033[32m{backward_days} day(s) ago\033[0m")

The collection date: '2023-03-23T00:00:00Z' -> [32m1 day(s) ago[0m


In [5]:
# -------------- Count tweets --------------
# Print and keep the per-day counts for the query.
tweet_counts = collector.count_tweet(query)

# Per-hour counts restricted to the collection window defined above.
counts_in_hour = collector.client.get_recent_tweets_count(query=query, granularity='hour', start_time=start_time, end_time=end_time)

Average: 101817/day
Total : 712718 in 7 days

03-08 => 03-09 :  20835
03-09 => 03-10 :  109834
03-10 => 03-11 :  105635
03-11 => 03-12 :  104148
03-12 => 03-13 :  96332
03-13 => 03-14 :  97153
03-14 => 03-15 :  99752
03-15 => 03-15 :  79029



In [6]:
# Build one search window per hourly bucket, iterating the buckets in
# reverse order of the API response.
counts = []
for data in reversed(counts_in_hour.data):
    start = data['start']
    # Pull the end back by one second so it falls just before the bucket's
    # reported end timestamp.
    end = (datetime.strptime(data['end'], '%Y-%m-%dT%H:%M:%S.000Z') - timedelta(seconds=1)).strftime('%Y-%m-%dT%H:%M:%S.000Z')
    counts.append({
        "end": end,
        "start": start,
        "tweet_count": data['tweet_count']
    })

# Half of each hour's count (floor division) is what the search cell requests.
expected_num = np.sum([hour['tweet_count'] // 2 for hour in counts])
# "Date" shows the start of the last bucket in the response's original order.
print(f"Date: {counts_in_hour.data[-1]['start']}\
\nTotal number of tweets: {counts_in_hour.meta['total_tweet_count']}\
\nExpected collected number: {expected_num}")

Date: 2023-03-14T23:00:00.000Z
Total number of tweets: 99752
Expected collected number: 49871


### Search Tweets

In [7]:
# Collect tweets hour by hour and accumulate them in one flat list.
tweets_list = []
for hour in counts:
    start = hour['start']
    end = hour['end']
    num = hour['tweet_count'] // 2  # Get 50%
    print(f"----- {start} -----")
    # The search returns a generator; the conversion step consumes it.
    generator = collector.search_tweets_pagination(query, num, start, end)
    tweets = collector.convert_tweets_to_list_of_dict(generator)
    tweets_list.extend(tweets)

----- 2023-03-14T23:00:00.000Z -----
Current number: 2102
[32mConvertion successfully finished![0m
----- 2023-03-14T22:00:00.000Z -----
Current number: 2308
[32mConvertion successfully finished![0m
----- 2023-03-14T21:00:00.000Z -----
Current number: 2392
[32mConvertion successfully finished![0m
----- 2023-03-14T20:00:00.000Z -----
Current number: 2766
[32mConvertion successfully finished![0m
----- 2023-03-14T19:00:00.000Z -----
Current number: 2575
[32mConvertion successfully finished![0m
----- 2023-03-14T18:00:00.000Z -----
Current number: 2563
[32mConvertion successfully finished![0m
----- 2023-03-14T17:00:00.000Z -----
Current number: 2719
[32mConvertion successfully finished![0m
----- 2023-03-14T16:00:00.000Z -----
Current number: 2778
[32mConvertion successfully finished![0m
----- 2023-03-14T15:00:00.000Z -----
Current number: 2728
[32mConvertion successfully finished![0m
----- 2023-03-14T14:00:00.000Z -----
Current number: 2787
[32mConvertion successfully fini

Rate limit exceeded. Sleeping for 675 seconds.


Current number: 1712
[32mConvertion successfully finished![0m
----- 2023-03-14T02:00:00.000Z -----
Current number: 1952
[32mConvertion successfully finished![0m
----- 2023-03-14T01:00:00.000Z -----
Current number: 2082
[32mConvertion successfully finished![0m
----- 2023-03-14T00:00:00.000Z -----
Current number: 2128
[32mConvertion successfully finished![0m


In [29]:
# Replace usernames and URLs in every collected tweet with aliases.
anonymous_tweets_list = collector.anonymise_tweets_list(tweets_list)

### Store Results

In [30]:
# Output location; the file name carries the collection date (YYMMDD) and
# the number of collected tweets.
# path = './data/raw/'
path = "/home/p11333at/nlp-project/data/raw/"
filename = f"tweets_{file_timestamp(start_time)}_#{len(tweets_list)}.json"

# Write one JSON object per line.
with open(f"{path}{filename}", "w") as f:
    for line in anonymous_tweets_list:
        json.dump(line, f)
        f.write('\n')

# Confirm that the file now exists on disk.
if os.path.exists(f"{path}{filename}"):
    print(f"The file '{filename}' is created.")

The file 'tweets_230314_#49871.json' is created.


# Meta Data Recorder

### Initialisation

In [5]:
# Fresh collector and the same query as in the collection section.
collector = TweetsCollector()

query = "(covid OR covid19 OR covid-19 OR coronavirus OR (corona virus) OR pandemic) -is:retweet lang:en"

# Per-day counts for the query.
day_counts = collector.client.get_recent_tweets_count(query=query, granularity='day')


### Record tweets count for **each day**

In [6]:
# Build {date: {"total": count, "hourly": {}}} from the per-day counts.
output = {}
for i, day in enumerate(day_counts.data):
    # Skip the first and the last bucket.
    # NOTE(review): presumably because they cover partial days - confirm.
    if i == 0 or i == len(day_counts.data)-1:
        continue
    date = re.search(r"\d{4}-\d{2}-\d{2}", day["start"]).group(0)
    count = day["tweet_count"]
    # "hourly" starts empty and is filled by a later cell.
    output.update(
        {date : 
            {"total" : count,
             "hourly" : {}}
        }
    )
output

{'2023-03-18': {'total': 91842, 'hourly': {}},
 '2023-03-19': {'total': 78682, 'hourly': {}},
 '2023-03-20': {'total': 96013, 'hourly': {}},
 '2023-03-21': {'total': 99584, 'hourly': {}},
 '2023-03-22': {'total': 107155, 'hourly': {}},
 '2023-03-23': {'total': 98178, 'hourly': {}}}

### Record tweets count for **each hour**

In [7]:
# Earlier single-day variant, kept for reference:
# backward_days = 1   # Set to 1 to collect yesterday's (Max: 6) 
# collection_date = today(backward_days)
# start_time = collection_date
# end_time = today(backward_days) + timedelta(days=1, seconds=-1)
# print(f"The date: '{collection_date.strftime('%Y-%m-%dT%H:%M:%SZ')}' -> \033[32m{backward_days} day(s) ago\033[0m")

# Window from midnight six days ago up to the end of yesterday (UTC).
start_time = today(6)
end_time = today(1) + timedelta(days=1)
print(f"The start time : '{start_time.strftime('%Y-%m-%dT%H:%M:%SZ')}'")
print(f"The  end  time : '{end_time.strftime('%Y-%m-%dT%H:%M:%SZ')}'")

# Per-hour counts over that window.
hour_counts = collector.client.get_recent_tweets_count(query=query, granularity='hour', start_time=start_time, end_time=end_time)


The start time : '2023-03-18T00:00:00Z'
The  end  time : '2023-03-24T00:00:00Z'


In [8]:
# Fill the "hourly" dict of each day with {HH:MM:SS: tweet_count}.
for hour in hour_counts.data:
    start = hour["start"]
    date = re.search(r"\d{4}-\d{2}-\d{2}", start).group(0)
    # Named `time_of_day` rather than `time`: a global called `time` would
    # rebind the imported `time` module and break `time.sleep` in
    # `TweetsCollector.limit_handler`.
    time_of_day = re.search(r"\d{2}:\d{2}:\d{2}", start).group(0)
    # print(time_of_day, ":", hour["tweet_count"])
    output[date]["hourly"][time_of_day] = hour["tweet_count"]

output

{'2023-03-18': {'total': 91842,
  'hourly': {'00:00:00': 4450,
   '01:00:00': 4059,
   '02:00:00': 4065,
   '03:00:00': 3646,
   '04:00:00': 3285,
   '05:00:00': 2659,
   '06:00:00': 2497,
   '07:00:00': 2584,
   '08:00:00': 2556,
   '09:00:00': 2567,
   '10:00:00': 2834,
   '11:00:00': 3494,
   '12:00:00': 4371,
   '13:00:00': 4543,
   '14:00:00': 5138,
   '15:00:00': 5278,
   '16:00:00': 5338,
   '17:00:00': 4700,
   '18:00:00': 4645,
   '19:00:00': 4307,
   '20:00:00': 4035,
   '21:00:00': 3728,
   '22:00:00': 3682,
   '23:00:00': 3380}},
 '2023-03-19': {'total': 78682,
  'hourly': {'00:00:00': 3366,
   '01:00:00': 3182,
   '02:00:00': 3078,
   '03:00:00': 2816,
   '04:00:00': 2464,
   '05:00:00': 2189,
   '06:00:00': 1964,
   '07:00:00': 1993,
   '08:00:00': 2144,
   '09:00:00': 2148,
   '10:00:00': 2291,
   '11:00:00': 2614,
   '12:00:00': 3287,
   '13:00:00': 3804,
   '14:00:00': 4248,
   '15:00:00': 4478,
   '16:00:00': 4353,
   '17:00:00': 4193,
   '18:00:00': 4094,
   '19:00:0

### Output results to file

In [None]:
# Merge the collected counts into the metadata file, creating it if needed.
filename_meta = "meta.json"
path = "/home/p11333at/nlp-project/data/"

if os.path.exists(f"{path}{filename_meta}"):
    # Read-modify-write on a single handle opened in "r+" mode.
    with open(f"{path}{filename_meta}", "r+") as f:
        meta = json.load(f)
        # Dates already present in the file are overwritten by the new values.
        meta.update(output)

        # Rewind, rewrite, then cut off any leftover bytes of the old content.
        f.seek(0)
        json.dump(meta, f, indent=2)
        f.truncate()
else:
    with open(f"{path}{filename_meta}", "w") as f:
        json.dump(output, f, indent=2)

print(f"The tweet counts is recorded in '{filename_meta}' file.")

THE END