In [3]:
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
from twitter import *
from twitter.stream import TwitterStream, Timeout, HeartbeatTimeout, Hangup

### code adapted from: https://github.com/computermacgyver/twitter-python/blob/master/streaming_simple.py

class StdOutListener(StreamListener):
    """Stream listener that appends every received payload to the output file.

    The listener writes to a module-level file handle named ``fhOut``; that
    handle must be opened for writing before the stream is started.
    """

    # This function gets called every time a new tweet is received on the stream
    def on_data(self, data):
        """Write one raw stream item to ``fhOut`` or report a sentinel value.

        ``data`` is written unchanged (no separator is added here).
        """
        # NOTE(review): Timeout / HeartbeatTimeout / Hangup are imported from
        # the `twitter` package, not from tweepy -- confirm whether tweepy ever
        # delivers them to this callback.
        if data is None:
            print("-- None --")

        elif data is Timeout:
            print("-- Timeout --")

        elif data is HeartbeatTimeout:
            print("-- Heartbeat Timeout --")

        elif data is Hangup:
            print("-- Hangup --")

        else:
            # Just write data to one line in the file
            fhOut.write(data)

    def on_error(self, status):
        """Print the error status and disconnect when rate limited.

        Returns ``False`` for HTTP 420 so that tweepy stops the stream instead
        of reconnecting (repeated reconnects while rate limited lengthen the
        enforced wait, per the Tweepy streaming documentation). Any other
        status returns ``None``, which keeps the previous retry behaviour.
        """
        print("ERROR")
        print(status)
        if status == 420:
            return False


if __name__ == '__main__':
    # tokens (fill in before running; keep real values out of shared copies)
    consumer_key = ''
    consumer_secret = ''
    resource_owner_key = ''
    resource_owner_secret = ''

    # ** select appropriate output file **
    # fhOut has to stay a module-level name: StdOutListener.on_data writes to it.
    fhOut = open("sample-1.json", "a")
    #fhOut = open("sample-2.json", "a")

    try:
        # Create the listener
        l = StdOutListener()
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(resource_owner_key, resource_owner_secret)

        # Connect to the Twitter stream
        stream = Stream(auth, l)

        # Filter terms and locations  ** select appropriate stream for samples **
        stream.filter(track=["#covid19", "#covid-19", "#coronavirus"])  # sample1
        #stream.filter(track=["#covid19", "#covid-19", "#coronavirus"], locations=[5.9559113, 45.817994999999996, 10.4922941, 47.8084648])  # sample 2

    except KeyboardInterrupt:
        # Ctrl + C to stop process
        # Approximately 3000 records are obtained after running for 1 minute.
        pass

    finally:
        # Close the json file (also when the stream fails with another
        # exception) and manually import file to MongoDB
        fhOut.close()


In [None]:
### These pymongo queries were executed in PyCharm connected to a local Mongo database ###

## Info on samples:
# Samples were collected by running the script above for one minute (approx. 3000 tweets collected)
# sample1: n = 3161, collected on 23/3
# sample2: n = 3152, collected on 24/3

In [None]:
## // Q4 - SAMPLE 1, n=3161 //
# Written as pymongo calls: operator names and field keys are quoted strings
# and booleans use Python's True, so the cell is valid Python.
# NOTE(review): `db` is not created anywhere in the visible notebook; it is
# assumed to be a pymongo Database handle for the local database -- confirm.

# find records with/without URLS
db.a2_sample1.find({"entities.urls": {"$exists": True, "$not": {"$size": 0}}})  #WITH (627)

# find records containing retweets
db.a2_sample1.find({"retweeted_status.id": {"$exists": True}})  #CONTAINS RTs (2426)

# language distribution of tweets
db.a2_sample1.aggregate([
    {"$group": {"_id": "$lang", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
])

# most frequent hashtags (grouping is case-sensitive: "COVID19" != "covid19")
db.a2_sample1.aggregate([
    {"$unwind": "$entities"},
    {"$unwind": "$entities.hashtags"},
    {"$unwind": "$entities.hashtags.text"},  #to unwind the hashtag text array to individual hashtags
    {"$group": {"_id": "$entities.hashtags.text", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
])

In [36]:
# import sys
# !{sys.executable} -m pip install tabulate

from IPython.display import HTML, display
import tabulate

In [38]:
## RESULTS & TABLES Q4 (SAMPLE 1) ##
# The figures below are copied by hand from the MongoDB query results for
# sample 1 (n=3161); they are not recomputed in this cell.

print("\nResults for Q4 (Sample 1):")
print("\nPercentage of tweets that contain URLs = 627/3161 = 19.8%")
print("Percentage of tweets that contain/are retweets = 2426/3161 = 76.7%")

# First row is the header; it must appear only once.
table_s1 = [["Language","Count","Percentage"],
["en",1741,55.11],
["es",472,14.94],
["fr",220,6.96],
["und",92,2.91],
["th",85,2.69],
["it",84,2.66],
["hi",81,2.56],
["",61,1.93],
["tr",48,1.52],
["in",46,1.46],
["pt",40,1.27],
["ca",39,1.23],
["ta",28,0.89],
["de",25,0.79],
["ar",15,0.47],
["nl",14,0.44],
["ur",10,0.32],
["ko",7,0.22],
["mr",7,0.22],
["tl",6,0.19],
["ja",5,0.16],
["ru",5,0.16],
["fa",5,0.16],
["zh",4,0.13],
["ro",3,0.09],
["et",3,0.09],
["fi",3,0.09],
["el",2,0.06],
["lv",2,0.06],
["gu",1,0.03],
["uk",1,0.03],
["ka",1,0.03],
["te",1,0.03],
["sv",1,0.03],
["lt",1,0.03],
["sl",1,0.03],
["ne",1,0.03]
]

print("\nDistribution of languages (Sample 1):")
display(HTML(tabulate.tabulate(table_s1, tablefmt='html')))

table_s1_2 = [["Rank","Hashtag","Frequency"],
         ["1","COVID19",679],
         ["2","coronavirus",456],
         ["3","Coronavirus",204],
         ["4","Covid19",71],
         ["5","COVIDー19",69],
         ["6","covid19",45],
         ["7","CoronaVirus",40],
         ["8","โควิค19",20],
         ["9","กษลรจตกม",20],
         ["10","ÚLTIMAHORA",20],
         ["11","CORONAVIRUS",19],
         ["12","COVID",19],
         ["13","Italy",18],
         ["14","COVID19Nigeria",18],
         ["15","StayHome",18],
         ["16","COVIDIOTS",15],
         ["17","China",14],
         ["18","Italie",12],
         ["19","23Mar",12],
         ["20","LockdownNow",11]]

# This table holds sample 1 data, so the heading names sample 1.
print("\nTop 20 hashtags' frequencies (Sample 1):")
display(HTML(tabulate.tabulate(table_s1_2, tablefmt='html')))


Results for Q4 (Sample 1):

Percentage of tweets that contain URLs = 627/3161 = 19.8%
Percentage of tweets that contain/are retweets = 2426/3161 = 76.7%

Distribution of languages (Sample 1):


0,1,2
Language,Count,Percentage
Language,Count,Percentage
en,1741,55.11
es,472,14.94
fr,220,6.96
und,92,2.91
th,85,2.69
it,84,2.66
hi,81,2.56
,61,1.93



Top 20 hashtags' frequencies (Sample 1):


0,1,2
Rank,Hashtag,Frequency
1,COVID19,679
2,coronavirus,456
3,Coronavirus,204
4,Covid19,71
5,COVIDー19,69
6,covid19,45
7,CoronaVirus,40
8,โควิค19,20
9,กษลรจตกม,20


In [None]:
## // Q4 - SAMPLE 2, n=3152 //
# Written as pymongo calls: operator names and field keys are quoted strings
# and booleans use Python's True, so the cell is valid Python.
# NOTE(review): `db` is not created anywhere in the visible notebook; it is
# assumed to be a pymongo Database handle for the local database -- confirm.

# find records with/without URLS
db.a2_sample2.find({"entities.urls": {"$exists": True, "$not": {"$size": 0}}})  #WITH (560)

# find records containing retweets
db.a2_sample2.find({"retweeted_status.id": {"$exists": True}})  #CONTAINS RTs (2386)

# language distribution of tweets
db.a2_sample2.aggregate([
    {"$group": {"_id": "$lang", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
])

# most frequent hashtags (grouping is case-sensitive: "COVID19" != "covid19")
db.a2_sample2.aggregate([
    {"$unwind": "$entities"},
    {"$unwind": "$entities.hashtags"},
    {"$unwind": "$entities.hashtags.text"},  #to unwind the hashtag text array to individual hashtags
    {"$group": {"_id": "$entities.hashtags.text", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
])

In [35]:
## RESULTS & TABLES Q4 (SAMPLE 2) ##
# Figures are transcribed from the MongoDB query results for sample 2 (n=3152).

print("\nResults for Q4 (Sample 2):")
print("\nPercentage of tweets that contain URLs = 560/3152 = 17.8%")
print("Percentage of tweets that contain/are retweets = 2386/3152 = 75.7%")

# Header row followed by one [language, count, percentage] row per language.
table_s2 = [["Language", "Count", "Percentage"]] + [
    list(entry)
    for entry in (
        ("en", 1725, 54.73),
        ("es", 413, 13.1),
        ("th", 205, 6.5),
        ("fr", 192, 6.09),
        ("und", 93, 2.95),
        ("hi", 70, 2.22),
        ("in", 64, 2.03),
        ("NULL", 60, 1.9),
        ("it", 54, 1.71),
        ("tr", 52, 1.65),
        ("de", 31, 0.98),
        ("ca", 30, 0.95),
        ("pt", 25, 0.79),
        ("ta", 23, 0.73),
        ("ar", 22, 0.7),
        ("nl", 13, 0.41),
        ("ja", 9, 0.29),
        ("ur", 9, 0.29),
        ("ru", 8, 0.25),
        ("tl", 7, 0.22),
        ("ht", 6, 0.19),
        ("et", 6, 0.19),
        ("te", 5, 0.16),
        ("mr", 5, 0.16),
        ("pl", 4, 0.13),
        ("fa", 4, 0.13),
        ("sv", 3, 0.1),
        ("el", 3, 0.1),
        ("zh", 3, 0.1),
        ("eu", 2, 0.06),
        ("ro", 2, 0.06),
        ("ko", 1, 0.03),
        ("bn", 1, 0.03),
        ("da", 1, 0.03),
        ("vi", 1, 0.03),
    )
]

print("\nDistribution of languages (Sample 2):")
display(HTML(tabulate.tabulate(table_s2, tablefmt='html')))

# Header row followed by [rank, hashtag, frequency]; ranks are the strings
# "1".."20", generated from the position in the ordered list.
table_s2_2 = [["Rank", "Hashtag", "Frequency"]] + [
    [str(position), hashtag, frequency]
    for position, (hashtag, frequency) in enumerate(
        (
            ("COVID19", 560),
            ("coronavirus", 409),
            ("Coronavirus", 174),
            ("Covid19", 82),
            ("CoronaVirus", 66),
            ("covid19", 60),
            ("COVIDー19", 46),
            ("China", 30),
            ("IndiaFightsCorona", 28),
            ("StayAtHomeSaveLives", 25),
            ("StayAtHome", 18),
            ("Corona", 17),
            ("COVID", 16),
            ("CoronavirusLockdown", 16),
            ("Hantavirus", 16),
            ("CORONAVIRUS", 15),
            ("NYC", 14),
            ("lockdown", 13),
            ("Covid", 13),
            ("PMModi", 12),
        ),
        start=1,
    )
]

print("\nTop 20 hashtags' frequencies (Sample 2):")
display(HTML(tabulate.tabulate(table_s2_2, tablefmt='html')))


Results for Q4 (Sample 2):

Percentage of tweets that contain URLs = 560/3152 = 17.8%
Percentage of tweets that contain/are retweets = 2386/3152 = 75.7%

Distribution of languages (Sample 2):


0,1,2
Language,Count,Percentage
en,1725,54.73
es,413,13.1
th,205,6.5
fr,192,6.09
und,93,2.95
hi,70,2.22
in,64,2.03
,60,1.9
it,54,1.71



Top 20 hashtags' frequencies (Sample 2):


0,1,2
Rank,Hashtag,Frequency
1,COVID19,560
2,coronavirus,409
3,Coronavirus,174
4,Covid19,82
5,CoronaVirus,66
6,covid19,60
7,COVIDー19,46
8,China,30
9,IndiaFightsCorona,28


In [None]:
## // Q5 - using sample 1, n=3161 //
# Written as pymongo calls: the "$in" operator is a quoted string so the cell
# is valid Python, and each account list is defined once and reused.
# NOTE(review): `db` is not created anywhere in the visible notebook; it is
# assumed to be a pymongo Database handle for the local database -- confirm.

# screen names of the 20 media accounts
MEDIA_ACCOUNTS = [
    "ChannelNewsAsia",
    "BBCNews",
    "AJENews",
    "SCMPNews",
    "nytimes",
    "AFP",
    "CNN",
    "STcom",
    "SkyNews",
    "washingtonpost",
    "BNODesk",
    "FoxNews",
    "BBCWorld",
    "YonhapNews",
    "XHNews",
    "business",
    "CNBC",
    "CBCAlerts",
    "guardian",
    "LeTemps",
]

# screen names of the 20 NGO/gov. accounts
NGO_GOV_ACCOUNTS = [
    "WHO",
    "BAG_OFSP_UFSP",
    "sporeMOH",
    "realDonaldTrump",
    "BorisJohnson",
    "JustinTrudeau",
    "SecPompeo",
    "alain_berset",
    "EmmanuelMacron",
    "MSF",
    "NHSuk",
    "DrTedros",
    "SGPMissionGva",
    "POTUS",
    "Mike_Pence",
    "EU_Commission",
    "EUCouncil",
    "Europarl_EN",
    "Davos",
    "UN",
]

# tweets directly generated by all the 20 media accounts, n=0
db.a2_sample1.find({"user.screen_name": {"$in": MEDIA_ACCOUNTS}})

# tweets directly generated by all the 20 NGO/gov. accounts, n=0
db.a2_sample1.find({"user.screen_name": {"$in": NGO_GOV_ACCOUNTS}})


# tweets by all the 20 media accounts appearing as retweets, n=13
db.a2_sample1.find({"retweeted_status.user.screen_name": {"$in": MEDIA_ACCOUNTS}})

# tweets by all the 20 NGO/gov. accounts appearing as retweets, n=27
db.a2_sample1.find({"retweeted_status.user.screen_name": {"$in": NGO_GOV_ACCOUNTS}})

In [39]:
## RESULTS Q5 (SAMPLE 1 only) ##
# Counts are transcribed from the Q5 MongoDB queries on sample 1 (n=3161).

q5_summary = (
    "\nPercentage of tweets directly generated by all the 20 media accounts: 0%",
    "Percentage of tweets directly generated by all the 20 NGO/gov. accounts: 0%",
    "Percentage of tweets generated by all the 20 media accounts that appear as retweets: 13/3161 = 0.41%",
    "Percentage of tweets generated by all the 20 NGO/gov. accounts that appear as retweets: 27/3161 = 0.85%",
)
# One line per entry, same text and order as printing each entry separately.
print("\n".join(q5_summary))


Percentage of tweets directly generated by all the 20 media accounts: 0%
Percentage of tweets directly generated by all the 20 NGO/gov. accounts: 0%
Percentage of tweets generated by all the 20 media accounts that appear as retweets: 13/3161 = 0.41%
Percentage of tweets generated by all the 20 NGO/gov. accounts that appear as retweets: 27/3161 = 0.85%


In [42]:
## Q6 - Short Discussion ##
# The discussion text is emitted with print(); only grammatical slips in the
# wording were corrected, the content of each paragraph is unchanged.

print("I did not find the percentage of tweets that are/contain retweets surprising as most users are likely to share information/opinions based on the tweets of the people they follow. However, I found the percentage of tweets containing URLs to be surprisingly low. Before I computed the results, I was expecting many more tweets to contain URLs given the likelihood of users sharing links to news articles on the Covid-19 situation.")
print("\nOne thing that was surprising to me was that for Sample 2, the distribution of languages declared in the tweets was quite similar to that of Sample 1 (more than 50% EN). I was expecting DE and FR tweets to be more prevalent in Sample 2 given the bounding box over Switzerland and bits of France/Germany.")
print("\nThe top hashtags were mostly expected; especially those manifesting as some form of 'coronavirus/covid' since this is the topic of discussion we are tracking in this assignment. Moreover, the majority of the top hashtags are the most straightforward and universally-understood words for tagging the tweets across all languages.")
print("\nFinally, I was surprised that the percentages computed for Q5 were almost 0. Then again, the collected tweets were merely the ~3000 grabbed at any one point in time. Thus, it is probable that none of the top 'tweeters' for this Covid-19 topic were reflected in my sample, which makes up a minuscule proportion of all relevant tweets.")

I did not find the percentage of tweets that are/contain retweets surprising as most users are likely to share information/opinions based on the tweets of the people they follow. However, I found the percentage of tweets containing URLs to be surprisingly low. Before I computed the results, I was expecting many more tweets to contain URLs given the likelihood of users sharing links to news articles on the Covid-19 situation.

One thing that was surprising to me was that for Sample 2, the distribution of languages declared in the tweets was quite similar to that of Sample 1 (more than 50% EN). I was expecting DE and FR tweets to be more prevalent in Sample 2 given the bounding box over Switzerland and bits of France/Germany.

The top hashtags were mostly expected; especially those manifesting as some form of 'coronavirus/covid' since this is the topic of discussion we are tracking in this assignment. Moreover, the majority of the top hashtags are the most straightforward and universa