# Collecting Tweets 
Using snc-scraper we were able to gather the IDs of many tweets posted between August 2017 and August 2020. However, these are only IDs, and we are interested in the actual tweets, which we want to save in files to process and analyze. To do so, we use the Twitter API to fetch the tweets with the given IDs and save them in a CSV file for later use. 
The tweets are gathered in chunks of one hundred, as the Twitter API raises an error if there are more than 100 IDs to process in a single request. 

In [1]:
def gatherIDs(fileName):
    """
    reads tweet ids from a text file that holds one tweet URL (or bare id) per line

    The id of a line is the text after its last "/", with surrounding
    whitespace removed. Blank lines are skipped and a trailing "/" is ignored,
    so no empty ids end up in the result.

    fileName: path of the text file to read
    returns: list of tweet ids as strings, in file order
    raises: OSError (e.g. FileNotFoundError) if the file cannot be opened
    """
    idList = []
    with open(fileName, "r") as f:
        for line in f:
            # A trailing slash would otherwise make the last path segment empty.
            cleaned = line.strip().rstrip("/")
            tweet_id = cleaned.rsplit("/", 1)[-1].strip()
            if tweet_id:
                idList.append(tweet_id)
    return idList

In [25]:
import datetime
import pandas as pd
import math

In [8]:
import tweepy, json

# Twitter API credentials; all four are left empty here on purpose.
# NOTE(review): keep real keys out of version control.
consumer_key = "" # fill in with your API information
consumer_secret = ""
access_token = ""
access_token_secret = ""

# Build the OAuth handler from the credentials above and create the module-level
# `api` client that get_tweets_100 uses to look up tweets.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True: per the tweepy docs the client waits for the rate
# limit to reset instead of failing -- TODO confirm for the installed version.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [64]:
def get_tweets_100(tweet_ids):
    """
    extracts the tweets with the given tweet ids and returns the English ones

    The ids are looked up with a single call on the module-level tweepy `api`
    client in extended tweet mode (so `full_text` is available). Only statuses
    whose `lang` is "en" are kept.

    tweet_ids: list of tweet ids; callers are expected to pass at most 100 ids
               per call (see the `chuncks` helper)
    returns: pandas DataFrame with one row per English tweet and the columns
             listed in `columns` below; empty (but with those columns) when no
             English tweet was returned
    """
    columns = ["tweet_id", "name", "screen_name", "acctdesc", "text", "usercreated",
               "created_at", "favourite_count", "retweet_count", "hashtags",
               "status_count", "followers_count", "following_count", "location",
               "source_device"]

    # tweepy 4 renamed API.statuses_lookup to API.lookup_statuses; use whichever
    # the installed client provides.
    lookup = getattr(api, "lookup_statuses", None) or api.statuses_lookup
    statuses = lookup(tweet_ids, tweet_mode="extended")

    # Collect plain dict rows and build the frame once: DataFrame.append copied
    # the whole frame per row and no longer exists in pandas 2.x.
    rows = [
        {
            "tweet_id": status.id,
            "name": status.user.name,
            "screen_name": status.user.screen_name,
            "acctdesc": status.user.description,
            "text": status.full_text,
            "usercreated": status.user.created_at,
            "created_at": status.created_at,
            "favourite_count": status.favorite_count,
            "retweet_count": status.retweet_count,
            "hashtags": status.entities["hashtags"],
            "status_count": status.user.statuses_count,
            "followers_count": status.user.followers_count,
            "following_count": status.user.friends_count,
            "location": status.place,
            "source_device": status.source,
        }
        for status in statuses
        if status.lang == "en"
    ]
    return pd.DataFrame(rows, columns=columns)

In [27]:
def chuncks(id_list, n):
    """
    helper function to break down the ids into chunks of at most n elements,
    to avoid calling get_tweets_100 with too many ids

    id_list: sequence of ids (anything that supports len() and slicing)
    n: maximum number of elements in each chunk; must be a positive integer
    returns: list of consecutive slices of id_list in original order; the last
             chunk may be shorter, and an empty id_list gives an empty list
    raises: ValueError if n is not positive
    """
    if n <= 0:
        # A zero size used to raise ZeroDivisionError and a negative size
        # silently produced wrong chunks; fail with a clear message instead.
        raise ValueError("chunk size n must be a positive integer")
    return [id_list[start:start + n] for start in range(0, len(id_list), n)]

In [68]:
def get_all_tweets(tweet_ids, tweetName):
    """
    fetches the tweets for all given ids and saves them to tweets_<tweetName>.csv

    The ids are split into chunks of 100 with `chuncks`, each chunk is fetched
    with `get_tweets_100`, and the per-chunk results are combined into one CSV
    file in the current working directory (header row included, no index
    column).

    tweet_ids: list of tweet ids; may be empty, in which case no lookup is made
               and a CSV containing only the header row is written
    tweetName: suffix used in the output file name "tweets_<tweetName>.csv"
    returns: None
    """
    columns = ["tweet_id", "name", "screen_name", "acctdesc", "text", "usercreated",
               "created_at", "favourite_count", "retweet_count", "hashtags",
               "status_count", "followers_count", "following_count", "location",
               "source_device"]

    # chuncks already handles lists of 100 ids or fewer, so no special case is needed.
    frames = []
    for batch in chuncks(tweet_ids, 100):
        batch_frame = get_tweets_100(batch)
        # Skip chunks that produced no rows; they add nothing to the output.
        if not batch_frame.empty:
            frames.append(batch_frame)

    # Concatenate once at the end: DataFrame.append copied the accumulated frame
    # on every call and no longer exists in pandas 2.x.
    if frames:
        dataOverall = pd.concat(frames, ignore_index=True)
    else:
        dataOverall = pd.DataFrame(columns=columns)

    dataOverall.to_csv(f"tweets_{tweetName}.csv", columns=columns, index=False)

In [65]:
# Run the pipeline on one id file; the result is written to tweets_autismtest.csv.
# NOTE(review): the ids come from "Kill_Oct_tweets.txt" but the output is named
# "autismtest" -- looks like a trial run; confirm the name is intended.
ids = gatherIDs("Kill_Oct_tweets.txt")
len(ids)  # bare expression, not the last in the cell; its value is discarded
get_all_tweets(ids, "autismtest")

In [73]:
# Load the tweet ids from each of the four merged id files.
killIds = gatherIDs("merged-kill.txt")
poisonIds = gatherIDs("merged-poison.txt")
autismIds = gatherIDs("merged-autism.txt")
injuredIds = gatherIDs("merged-injured.txt")

In [83]:
# Quick size check of the loaded id lists. Presumably only the last expression
# (len(autismIds)) is what the notebook displays; len(killIds) is evaluated and
# discarded.
len(killIds)
#len(injuredIds)
len(autismIds)

596

In [77]:
# Fetch the tweets for killIds and write them to tweets_vaccineskill.csv.
get_all_tweets(killIds, "vaccineskill")

In [78]:
# Fetch the tweets for poisonIds and write them to tweets_vaccinesarepoison.csv.
get_all_tweets(poisonIds, "vaccinesarepoison")

In [79]:
# Fetch the tweets for autismIds and write them to tweets_vaccinescauseautism.csv.
get_all_tweets(autismIds, "vaccinescauseautism")

In [80]:
# Fetch the tweets for injuredIds and write them to tweets_vaccineinjured.csv.
get_all_tweets(injuredIds, "vaccineinjured")

In [84]:
# Load the tweet ids from the four 2018 id files.
killIds2018 = gatherIDs("Kill_2018_tweets.txt")
poisonIds2018 = gatherIDs("Poison_2018_tweets.txt")
autismIds2018 = gatherIDs("Autism_2018_tweets.txt")
injuredIds2018 = gatherIDs("Injured_2018_all.txt")

In [85]:
# Fetch the 2018 tweets; each call writes its own tweets_<name>.csv file.
# NOTE(review): the "poison2018"/"autism2018" names do not follow the
# "vaccines..." pattern of the earlier output files -- confirm intended.
get_all_tweets(killIds2018, "vaccineskill2018")
get_all_tweets(poisonIds2018, "poison2018")
get_all_tweets(autismIds2018, "autism2018")
get_all_tweets(injuredIds2018, "vaccineinjured2018")

In [86]:
# Load the tweet ids from the four 2017 id files.
killIds2017 = gatherIDs("Kill_2017_tweets.txt")
poisonIds2017 = gatherIDs("Poison_2017_tweets.txt")
autismIds2017 = gatherIDs("Autism_2017_tweets.txt")
injuredIds2017 = gatherIDs("Injured2017.txt")

In [87]:
# Fetch the 2017 tweets; each call writes its own tweets_<name>.csv file.
get_all_tweets(killIds2017, "vaccineskill2017")
get_all_tweets(poisonIds2017, "poison2017")
get_all_tweets(autismIds2017, "autism2017")
get_all_tweets(injuredIds2017, "vaccineinjured2017")

In [89]:
# getting #vaxxed tweets
# Load the ids from Vaxxed2017.txt, fetch the tweets and write them to
# tweets_vaxxed2017.csv.
vaxxedIds2017 = gatherIDs("Vaxxed2017.txt")
get_all_tweets(vaxxedIds2017, "vaxxed2017")

In [91]:
# Load the ids from Vaxxed2018.txt, fetch the tweets and write them to
# tweets_vaxxed2018.csv.
vaxxedIds2018 = gatherIDs("Vaxxed2018.txt")
get_all_tweets(vaxxedIds2018, "vaxxed2018")

In [92]:
# Load the ids from Vaxxed2019.txt, fetch the tweets and write them to
# tweets_vaxxed2019.csv.
# NOTE(review): unlike vaxxedIds2017 / vaxxedIds2018, this variable has no year
# suffix -- confirm the naming is intended.
vaxxedIds = gatherIDs("Vaxxed2019.txt")
get_all_tweets(vaxxedIds, "vaxxed2019")