# Scraping tweets

## Installing minet

https://github.com/medialab/minet

In [1]:
!pip install minet==1.0.0-a14



In [32]:
!pip install ural



In [34]:
import json
from minet.twitter import TwitterAPIScraper
from collections import Counter
from ural import normalize_url, get_domain_name, is_shortened_url

## Scraping the tweets

In [3]:
# Single scraper instance, reused by the search cell that fills TWEETS.
scraper = TwitterAPIScraper()

In [8]:
# Run the '#saccageparis filter:links' search (at most 1000 results requested)
# and materialise everything the scraper yields into a list.
TWEETS = list(scraper.search_tweets('#saccageparis filter:links', limit=1000))

In [9]:
# Number of tweets actually collected by the search above
len(TWEETS)

1000

In [12]:
# Back up the collected tweets as indented UTF-8 JSON
# (ensure_ascii=False keeps non-ASCII characters as-is instead of escaping them)
with open('./tweets-backup.json', 'w', encoding='utf-8') as backup_file:
    json.dump(TWEETS, backup_file, indent=2, ensure_ascii=False)

## Computing stats about the retrieved urls

In [21]:
# Count how many of the collected tweets have a non-empty 'links' entry
tweets_with_links = sum(1 for tweet in TWEETS if tweet['links'])

tweets_with_links

166

In [26]:
# Keep only the tweets whose 'links' entry is non-empty.
# (The list used to be built twice: once with an append loop and then again
# with this comprehension, which overwrote the first result.)
TWEETS_WITH_LINK = [tweet for tweet in TWEETS if tweet['links']]

len(TWEETS_WITH_LINK)

166

In [31]:
# Tally how many times each raw link appears across the link-bearing tweets
distinct_urls = Counter(
    link
    for tweet in TWEETS_WITH_LINK
    for link in tweet['links']
)

distinct_urls.most_common(10)

[('https://www.causeur.fr/agenda-politique-zadisation-paris-anne-hidalgo-208808',
  7),
 ('https://l.bfmtv.com/6IL', 7),
 ('https://www.bfmtv.com/economie/consommation/paris-ikea-prevoit-de-relocaliser-son-magasin-de-la-madeleine-a-italie-deux_AD-202304040453.html',
  7),
 ('https://www.lefigaro.fr/conjoncture/paris-le-vote-sur-les-trottinettes-en-libre-service-a-coute-390-000-euros-20230406',
  6),
 ('https://l.leparisien.fr/nH0T', 6),
 ('https://www.bfmtv.com/paris/vote-pour-l-interdiction-des-trottinettes-a-paris-le-scrutin-a-coute-390-000-euros_AD-202304060331.html',
  5),
 ('https://l.leparisien.fr/bWaT', 4),
 ('https://l.leparisien.fr/A1nT', 3),
 ('https://www.bfmtv.com/paris/paris-des-jeunes-migrants-occupent-une-ecole-desaffectee-du-16e-arrondissement_AD-202304040792.html',
  3),
 ('https://www.lefigaro.fr/social/crise-des-poubelles-a-paris-la-cgt-a-depose-un-nouveau-preavis-de-greve-20230404',
  3)]

## Processing & cleaning the URLs

In [36]:
# Example: only the domain name is kept; path, fragment and query are dropped
get_domain_name('https://github.co.uk/medialab/minet#whirlwind-tour?key=1&key2=test')

'github.co.uk'

In [39]:
# Example: here normalization drops the scheme, the "www." prefix and the
# utm_campaign query parameter
normalize_url('https://www.lemonde.fr?utm_campaign=4754')

'lemonde.fr'

In [41]:
# Example: a bit.ly link is flagged as shortened, a plain domain is not
is_shortened_url('bit.ly/utidtu'), is_shortened_url('lemonde.fr')

(True, False)

<u>3 goals:</u>

1. The 10 most frequent normalized URLs, compared with the non-normalized ones
2. The 10 most frequent domain names
3. The ratio of shortened URLs in the data set

In [47]:
# Tally link occurrences again, this time keyed by their normalized form so
# that links normalizing to the same string are counted together
distinct_normalized_urls = Counter(
    normalize_url(link)
    for tweet in TWEETS_WITH_LINK
    for link in tweet['links']
)

distinct_normalized_urls.most_common(10)

[('bfmtv.com/economie/consommation/paris-ikea-prevoit-de-relocaliser-son-magasin-de-la-madeleine-a-italie-deux_AD-202304040453.html',
  8),
 ('causeur.fr/agenda-politique-zadisation-paris-anne-hidalgo-208808', 7),
 ('l.bfmtv.com/6IL', 7),
 ('lefigaro.fr/conjoncture/paris-le-vote-sur-les-trottinettes-en-libre-service-a-coute-390-000-euros-20230406',
  6),
 ('l.leparisien.fr/nH0T', 6),
 ('bfmtv.com/paris/vote-pour-l-interdiction-des-trottinettes-a-paris-le-scrutin-a-coute-390-000-euros_AD-202304060331.html',
  5),
 ('l.leparisien.fr/bWaT', 4),
 ('l.leparisien.fr/A1nT', 3),
 ('bfmtv.com/paris/paris-des-jeunes-migrants-occupent-une-ecole-desaffectee-du-16e-arrondissement_AD-202304040792.html',
  3),
 ('lefigaro.fr/social/crise-des-poubelles-a-paris-la-cgt-a-depose-un-nouveau-preavis-de-greve-20230404',
  3)]

In [48]:
# Tally link occurrences per domain name
distinct_domains = Counter(
    get_domain_name(link)
    for tweet in TWEETS_WITH_LINK
    for link in tweet['links']
)

distinct_domains.most_common(10)

[('leparisien.fr', 39),
 ('bfmtv.com', 39),
 ('lefigaro.fr', 16),
 ('flickr.com', 10),
 ('flip.it', 7),
 ('causeur.fr', 7),
 ('20minutes.fr', 5),
 ('twitter.com', 5),
 ('paris.fr', 4),
 ('lemonde.fr', 4)]

In [50]:
# Ratio of shortened urls, measured over the distinct urls (the keys of the
# Counter), not over every link occurrence
short_count = sum(1 for url in distinct_urls if is_shortened_url(url))

short_count / len(distinct_urls)

0.23931623931623933

## Resolving URLs

In [51]:
from minet.web import resolve

In [52]:
# Example: resolve returns the list of redirections followed for the url
resolve('http://lemonde.fr')

[<Redirection type=location-header status=301 url=http://lemonde.fr>,
 <Redirection type=hit status=200 url=https://www.lemonde.fr/>]

In [54]:
# Resolving a shortened link lists every hop up to the final page
resolve('https://l.leparisien.fr/bWaT')

[<Redirection type=location-header status=301 url=https://l.leparisien.fr/bWaT>,
 <Redirection type=location-header status=302 url=https://www.leparisien.fr/paris-75/trottinettes-en-libre-service-les-parisiens-disent-non-a-89-02-04-2023-G7H5VUV6PBDQLIS6RWEMQNSFN4.php#xtor=AD-1481423553>,
 <Redirection type=hit status=200 url=https://www.leparisien.fr/paris-75/les-parisiens-votent-la-fin-des-trottinettes-en-libre-service-succes-en-demi-teinte-pour-anne-hidalgo-02-04-2023-JCNI2MR36FDJZOCRKC43X4KQJM.php>]

In [56]:
# The last item of the redirection list is the final hit; its .url is the
# resolved address
resolve('https://l.leparisien.fr/bWaT')[-1].url

'https://www.leparisien.fr/paris-75/les-parisiens-votent-la-fin-des-trottinettes-en-libre-service-succes-en-demi-teinte-pour-anne-hidalgo-02-04-2023-JCNI2MR36FDJZOCRKC43X4KQJM.php'

In [55]:
# Check that the link resolved above is flagged as shortened
is_shortened_url('https://l.leparisien.fr/bWaT')

True