# Retrieving and analysing Twitter data

In [79]:
from collections import Counter

In [1]:
!pip3 install minet



In [90]:
!pip3 install ural



## Collecting tweets

In [3]:
from minet.twitter import TwitterAPIScraper

In [4]:
# Scraper instance used by the tweet search further down.
# NOTE(review): built with no arguments here — confirm no credentials or
# configuration are expected by TwitterAPIScraper.
scraper = TwitterAPIScraper()

In [72]:
# Collect the search results (capped at 1000 by `limit`) into a module-level list.
# NOTE(review): the query presumably targets English tweets containing links
# but no media — confirm against Twitter's search operator semantics.
TWEETS = []
TWEETS.extend(
    scraper.search_tweets('holiday lang:en filter:links -filter:media', limit=1000)
)

# Show how many tweets were actually retrieved.
len(TWEETS)

1000

In [77]:
# Inspect one collected tweet: print each of its fields as "name = value".
second_tweet = TWEETS[1]
for field_name, field_value in second_tweet.items():
    print(field_name, '=', field_value)

id = 1517416162333757442
local_time = 2022-04-22T08:12:59
timestamp_utc = 1650615179
text = Travel always makes us #smile and @HeathrowExpress just gave us another opportunity to put on our happy face... courtesy of @WavemakerUK.
#Travel
#holidays 
#DOOH
https://mediashotz.co.uk/inflation-a-barrier-to-sustainable-food-shopping-quantilope/
url = https://twitter.com/mediashotz/status/1517416162333757442
quoted_id = None
quoted_user = None
quoted_user_id = None
quoted_timestamp_utc = None
retweeted_id = None
retweeted_user = None
retweeted_user_id = None
retweeted_timestamp_utc = None
media_files = []
media_types = []
media_urls = []
links = ['https://mediashotz.co.uk/inflation-a-barrier-to-sustainable-food-shopping-quantilope/']
links_to_resolve = True
domains = ['mediashotz.co.uk']
hashtags = ['dooh', 'holidays', 'smile', 'travel']
mentioned_ids = ['20240678', '902544740058628098']
mentioned_names = ['heathrowexpress', 'wavemakeruk']
collection_time = 2022-04-22T10:14:03.679450
match_qu

## Analysing the set of tweeted urls

In [99]:
# Tally how many times each raw link string appears across all collected tweets.
urls = Counter(
    link
    for tweet in TWEETS
    for link in tweet['links']
)

# The ten most frequently shared links.
urls.most_common(10)

[('http://www.laburnumlodge.com', 26),
 ('https://tvblackbox.com.au/page/2022/04/22/explore-the-destinations-of-the-super-wealthy-on-worlds-most-luxurious-holidays/',
  6),
 ('http://cutt.ly/OeQ4V5W', 5),
 ('https://www.telegraph.co.uk/news/2022/04/21/summer-holidays-threat-british-airways-cancels-popular-routes/',
  4),
 ('https://portobellobookblog.com/2022/04/22/my-holiday-reading-plans/', 3),
 ('http://organicholidays.com/at/1792.htm', 3),
 ('https://distrokid.com/hyperfollow/henryadams1/holiday', 2),
 ('https://www.independentcottages.co.uk/4071', 2),
 ('https://twitter.com/i/events/1517071964015370241', 2),
 ('https://www.telegraph.co.uk/business/2022/04/22/hard-forgive-emmanuel-macron/',
  2)]

In [100]:
# Number of distinct raw link strings seen across all tweets.
len(urls)

850

In [91]:
from ural import get_domain_name, normalize_url, is_shortened_url

In [95]:
# Sanity check: the two inputs differ in scheme and leading "www.", yet both
# calls return the same string.
normalize_url('http://www.lemonde.fr'), normalize_url('https://lemonde.fr')

('lemonde.fr', 'lemonde.fr')

*Goals*

1. Redo the stats with normalized URLs to see what changes in the top 10
2. Compute the same stats on domain names instead of full URLs
3. Compute the proportion of shortened URLs

In [109]:
# Same tally as for the raw links, but keyed on the normalized form of each
# link so that variants normalizing to the same string are counted together.
normalized_urls = Counter(
    normalize_url(link)
    for tweet in TWEETS
    for link in tweet['links']
)

# The ten most frequently shared normalized links.
normalized_urls.most_common(10)

[('laburnumlodge.com', 26),
 ('tvblackbox.com.au/page/2022/04/22/explore-the-destinations-of-the-super-wealthy-on-worlds-most-luxurious-holidays',
  6),
 ('telegraph.co.uk/news/2022/04/21/summer-holidays-threat-british-airways-cancels-popular-routes',
  5),
 ('cutt.ly/OeQ4V5W', 5),
 ('portobellobookblog.com/2022/04/22/my-holiday-reading-plans', 4),
 ('organicholidays.com/at/1792.htm', 3),
 ('thesun.co.uk/travel/18336898/summer-holidays-risk-british-airways-cancel',
  3),
 ('distrokid.com/hyperfollow/henryadams1/holiday', 2),
 ('independentcottages.co.uk/4071', 2),
 ('twitter.com/i/events/1517071964015370241', 2)]

In [102]:
# Number of distinct links once normalized (compare with len(urls) above).
len(normalized_urls)

845

In [107]:
# Count links per domain name: each link of each tweet contributes one
# occurrence of whatever get_domain_name returns for it.
domains = Counter()

for tweet in TWEETS:
    domains.update(get_domain_name(link) for link in tweet['links'])

# The ten most frequently linked domains.
domains.most_common(10)

[('alpes-holidays.com', 100),
 ('ebay.com', 45),
 ('bit.ly', 35),
 ('instagram.com', 34),
 ('laburnumlodge.com', 26),
 ('etsy.me', 22),
 ('twitter.com', 22),
 ('dlvr.it', 21),
 ('ift.tt', 19),
 ('visitingeu.com', 14)]

In [111]:
# Flatten the links of every tweet so they can be counted in one place.
all_links = [link for tweet in TWEETS for link in tweet['links']]

total_urls_count = len(all_links)
shortened_urls_count = sum(1 for link in all_links if is_shortened_url(link))

# Guard the division: if no links were collected at all, report a ratio of
# 0.0 instead of raising ZeroDivisionError.
shortened_urls_ratio = (
    shortened_urls_count / total_urls_count if total_urls_count else 0.0
)

# (total links, shortened links, proportion of shortened links)
total_urls_count, shortened_urls_count, shortened_urls_ratio

(919, 124, 0.13492927094668117)