# Notebook for collecting tweets from the Twitter accounts of Danish Parliament members
### Anton Elias Holt - exam in Computational linguistics at Aarhus University, spring 2021

In [2]:
import os
import pickle
from datetime import datetime, timedelta

import langdetect
import pandas as pd
import tweepy

In [3]:
"""Getting the list of parliament-members and their Twitter-names"""

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
data = pd.read_pickle('ft_medlemmer_v3.pkl')
# Show the loaded table as the cell output.
data

Unnamed: 0,Medlem,Parti,Twitter_navn
0,Mette Abildgaard,KF,metteabildgaard
1,Karina Adsbøl,DF,AdsbolAdsbl
2,Tommy Ahlers,V,aahlers
3,Alex Ahrendtsen,DF,
4,Marlene Ambo-Rasmussen,V,MarleneAmbo
...,...,...,...
174,Lea Wermelin,S,LeaWermelin
175,Susanne Zimmer,UFG,Susanne_Zimmer_
176,Fatma Øktem,V,fatmaoektem
177,Orla Østerby,UFG,orlaosterby


In [4]:
"""Some parliament members who have a Twitter-profile are inactive, these are removed in ft_medllemer_v3.pkl
    This holds true for:"""
# The commented-out assignments below record which usernames were replaced by
# the 'NaN' placeholder; they are kept for reference only and are not executed.

# Jan Johansen (janjohansen16)
#data['Twitter_navn'][77] = 'NaN'

# Bjarne Laustsen (Bjarne_Laustsen)
#data['Twitter_navn'][101] = 'NaN'

# Hans Christian Schmidt (HaChrSchmidt): he has 2 tweets, but the newest is from March 2019
#data['Twitter_navn'][146] = 'NaN'

# Sjúrður Skaale (SjurSkaale): he has 1 tweet, which is a retweet, from October 2019
#data['Twitter_navn'][151] = 'NaN'

# Spot check: display the stored username for row 151.
data.at[151, 'Twitter_navn']

'NaN'

In [5]:
"""Authorization for tweepy api"""

# Twitter API credentials.
# They are read from environment variables so that no secret is ever stored in
# the notebook; a missing variable raises KeyError immediately, which is
# clearer than an authentication failure later on.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

# OAuth 1a user authentication against Twitter's API.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# The API object used by every request below. wait_on_rate_limit makes tweepy
# pause until the rate-limit window resets instead of raising an error.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [1]:
"""Defines the functions for extracting data from a Twitter user's profile"""

def get_tweets(username, max_ID=None):
    """Fetch one page of original tweets from a user's timeline.

    Retweets and replies are excluded from the result.

    Parameters
    ----------
    username : str
        Twitter screen name of the account.
    max_ID : int, optional
        When given, it is passed to the API as ``max_id`` to page backwards
        through the timeline. When omitted, the newest tweets are requested.

    Returns
    -------
    Whatever ``api.user_timeline`` returns for the request.
    """
    request = {
        'screen_name': username,
        # 200 is the maximum allowed count
        'count': 200,
        'include_rts': False,
        # Necessary to keep full_text,
        # otherwise only the first 140 characters are extracted
        'tweet_mode': 'extended',
        'exclude_replies': True,
    }
    # Only send max_id when the caller asked for an older page.
    if max_ID is not None:
        request['max_id'] = max_ID
    return api.user_timeline(**request)


def get_tweets_start(username):
    """Fetch the newest page of original tweets from a user's timeline.

    Same request as ``get_tweets`` without an upper tweet-ID bound.
    """
    return get_tweets(username)

# The function below stops collecting when a specific date is reached
## Note: the `langdetect` module is imported, but filtering out English tweets is not performed by the function below

In [7]:
"""This function extracts all tweets of the Timeline of a Twitter-user.
    The function returns: 
        - A list of all original tweets the user has posted
        - The date and time of their earliest tweet
        - The number of tweets extracted through the function
    It also prints this information to your selected output.
    The function utilizes both the get_tweets() function and the get_tweets_start() function.
"""

def get_ALL_tweets(username, cutoff_date=None):
    """Collect a user's original tweets, paging backwards until a cut-off date.

    The newest page is fetched with ``get_tweets_start``; older pages are
    fetched with ``get_tweets``. Collection stops as soon as a tweet older
    than the cut-off date is seen (that tweet is still included as the last
    element of the result), or when a page brings no new tweets.

    Parameters
    ----------
    username : str
        Twitter screen name of the account.
    cutoff_date : datetime.date or datetime.datetime, optional
        Tweets created before this date end the collection.
        Defaults to 1 January 2019.

    Returns
    -------
    tuple
        ``(tweets, earliest_created_at, n_tweets)``: the collected tweets in
        the order they were received, the ``created_at`` value of the last
        collected tweet (``None`` when nothing was collected), and the number
        of collected tweets.

    The same summary is also printed.
    """
    if cutoff_date is None:
        cutoff_date = datetime(2019, 1, 1).date()
    elif isinstance(cutoff_date, datetime):
        # Compare calendar dates only, as the tweets' timestamps are reduced to dates too.
        cutoff_date = cutoff_date.date()

    collected = []
    seen_ids = set()  # set membership keeps de-duplication cheap
    page = get_tweets_start(username)

    # An empty page (including an empty first page) simply ends the loop.
    while page:
        found_new = False
        reached_cutoff = False
        for tweet in page:
            if tweet.id in seen_ids:
                continue
            seen_ids.add(tweet.id)
            collected.append(tweet)
            found_new = True
            # Breaking the loop if the tweet is from earlier than the cut-off date
            if tweet.created_at.date() < cutoff_date:
                reached_cutoff = True
                print('past the date!')
                break
        # Stop when the date is reached or the page held nothing new.
        if reached_cutoff or not found_new:
            break
        # max_id is inclusive, so ask for IDs strictly below the oldest one seen;
        # this avoids re-downloading the boundary tweet on every page.
        page = get_tweets(username, min(seen_ids) - 1)

    earliest = collected[-1].created_at if collected else None
    print('Collected tweets from: ', username)
    print('Tweets collected: ', len(collected))
    print('Earliest tweet was from: ', earliest, '\n')
    return collected, earliest, len(collected)

In [None]:
"""Testing the function on a single twitter-name"""

# Only the first element of the returned tuple (the list of tweets) is needed here.
mylist = get_ALL_tweets(data.at[4, 'Twitter_navn'])[0]
for status in mylist:
    print(status.full_text, status.created_at, '\n')

In [None]:
"""Testing the function on 3 twitter-names"""

# Usernames stored in the first three rows of the member table.
list_of_people = [data['Twitter_navn'][row] for row in range(3)]
list_of_people

In [8]:
"""Creating new empty columns for the list of all tweets, the date of the 
    earliest tweet, and the total number of tweets"""

# Fill the three result columns with the 'NaN' placeholder string, one value
# per row of the table, so the code keeps working if the member list changes
# length. dtype=object lets a cell later hold a list of tweets.
for new_column in ('ALL_tweets', 'First_tweet', 'N_tweets'):
    data[new_column] = pd.Series('NaN', index=data.index, dtype=object)
data

Unnamed: 0,Medlem,Parti,Twitter_navn,ALL_tweets,First_tweet,N_tweets
0,Mette Abildgaard,KF,metteabildgaard,,,
1,Karina Adsbøl,DF,AdsbolAdsbl,,,
2,Tommy Ahlers,V,aahlers,,,
3,Alex Ahrendtsen,DF,,,,
4,Marlene Ambo-Rasmussen,V,MarleneAmbo,,,
...,...,...,...,...,...,...
174,Lea Wermelin,S,LeaWermelin,,,
175,Susanne Zimmer,UFG,Susanne_Zimmer_,,,
176,Fatma Øktem,V,fatmaoektem,,,
177,Orla Østerby,UFG,orlaosterby,,,


In [9]:
"""Here I actually run the function for collecting the tweets from 
    a users timeline on the list of Danish parliament members"""
# Each member is tried at most MAX_ATTEMPTS times. A member that keeps failing
# is recorded and skipped, so one persistent error (for example a deleted
# account) cannot stall the whole run or re-download the previous member.
MAX_ATTEMPTS = 3
failed_members = []

for i in data.index:
    print(i, data.at[i, 'Medlem'])
    username = data.at[i, 'Twitter_navn']

    # Members without a usable Twitter name carry the 'NaN' placeholder.
    if pd.isna(username) or username == 'NaN':
        print('NaN\n')
        continue

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            tweets, first_tweet, n_tweets = get_ALL_tweets(username)
        # Catch Exception rather than using a bare except, so that
        # interrupting the kernel (KeyboardInterrupt) still stops the run.
        except Exception as error:
            print('ERROR (attempt', attempt, 'of', MAX_ATTEMPTS, '):', error, '\n')
        else:
            # .at writes a single cell, which also works for a list value
            # and avoids chained-indexing assignment.
            data.at[i, 'ALL_tweets'] = tweets
            data.at[i, 'First_tweet'] = first_tweet
            data.at[i, 'N_tweets'] = n_tweets
            break
    else:
        # All attempts failed; the row keeps its 'NaN' placeholders.
        failed_members.append(username)

print('DONE\n')
if failed_members:
    print('Could not collect tweets for: ', failed_members)

0 Mette Abildgaard
past the date!
Collected tweets from:  metteabildgaard
Tweets collected:  375
Earliest tweet was from:  2018-12-31 22:06:15 

1 Karina Adsbøl
past the date!
Collected tweets from:  AdsbolAdsbl
Tweets collected:  430
Earliest tweet was from:  2018-12-31 17:06:06 

2 Tommy Ahlers
past the date!
Collected tweets from:  aahlers
Tweets collected:  100
Earliest tweet was from:  2018-12-20 15:04:20 

3 Alex Ahrendtsen
NaN

4 Marlene Ambo-Rasmussen
past the date!
Collected tweets from:  MarleneAmbo
Tweets collected:  141
Earliest tweet was from:  2015-06-17 09:53:44 

5 Katarina Ammitzbøll
past the date!
Collected tweets from:  Ammitzboell_K
Tweets collected:  789
Earliest tweet was from:  2018-12-27 07:38:47 

6 Simon Emil Ammitzbøll-Bille
Collected tweets from:  ammitzbollbille
Tweets collected:  589
Earliest tweet was from:  2020-03-28 11:40:36 

7 Hans Andersen
past the date!
Collected tweets from:  HansAndersenV
Tweets collected:  442
Earliest tweet was from:  2018-12-2

past the date!
Collected tweets from:  mariannejelved
Tweets collected:  1
Earliest tweet was from:  2018-01-18 22:11:50 

69 Jacob Jensen
past the date!
Collected tweets from:  jacobjensenMF
Tweets collected:  81
Earliest tweet was from:  2018-10-04 20:17:04 

70 Leif Lahn Jensen
past the date!
Collected tweets from:  LahnLeif
Tweets collected:  1
Earliest tweet was from:  2017-04-26 14:17:30 

71 Michael Aastrup Jensen
past the date!
Collected tweets from:  michaelaastrup
Tweets collected:  168
Earliest tweet was from:  2018-12-23 13:03:43 

72 Mogens Jensen
past the date!
Collected tweets from:  MogensJensenS
Tweets collected:  625
Earliest tweet was from:  2018-12-24 18:33:50 

73 Thomas Jensen
NaN

74 Brigitte Klintskov Jerkel
past the date!
Collected tweets from:  JerkelK
Tweets collected:  306
Earliest tweet was from:  2018-12-31 18:25:51 

75 Jens Joel
past the date!
Collected tweets from:  Jens_Joel
Tweets collected:  423
Earliest tweet was from:  2018-12-19 20:27:44 

76 Edmu

past the date!
Collected tweets from:  Jesper_Pet
Tweets collected:  422
Earliest tweet was from:  2018-12-18 11:42:12 

132 Rasmus Helveg Petersen
past the date!
Collected tweets from:  rasmushelveg
Tweets collected:  28
Earliest tweet was from:  2018-11-19 11:47:34 

133 Søren Pape Poulsen
past the date!
Collected tweets from:  SorenPape
Tweets collected:  298
Earliest tweet was from:  2018-12-31 14:14:30 

134 Troels Lund Poulsen
past the date!
Collected tweets from:  troelslundp
Tweets collected:  117
Earliest tweet was from:  2018-12-18 10:17:32 

135 Rasmus Prehn
past the date!
Collected tweets from:  RasmusPrehn
Tweets collected:  719
Earliest tweet was from:  2018-12-21 20:15:32 

136 Lars Aslan Rasmussen
past the date!
Collected tweets from:  lars_aslan
Tweets collected:  375
Earliest tweet was from:  2018-12-25 09:09:12 

137 Lars Løkke Rasmussen
past the date!
Collected tweets from:  larsloekke
Tweets collected:  418
Earliest tweet was from:  2018-12-30 23:05:35 

138 Søren 

In [10]:
"""Counting up the total number of tweets extracted in the process"""

# Rows that still hold the 'NaN' placeholder have no tweet count and are skipped.
n = sum(count for count in data['N_tweets'] if count != 'NaN')
print(n)  # 103,501 as of 21 April 2021 at 16:55

52427


In [None]:
"""For reference, each tweet has a number of meta-data assigned to it. 
    We can find them with the dir() function.
    
    The ones that are most important are:
    - tweet.full_text // returns the text of the tweet, this is what you see on twitter
    - tweet.created_at // returns the time that the tweet was posted
    - tweet.id // returns the ID of the tweet
    - tweet.author // returns the author of the tweet (a user object, not just the name)
    - tweet.retweet_count // returns the number of retweets the tweet has received
    - tweet.favorite_count // returns the number of people who have favorited the tweet
    - tweet.geo // returns the geographic location of the author. It isn't obligatory and most tweets return: None
    """

## Saving the DataFrame using pickle

In [None]:
# Preview the first rows of the table before saving it.
data.head()

In [11]:
# Save the full table, including the collected tweet objects, as a pickle file.
# NOTE(review): presumably tweepy must be installed to load this file again — confirm.
data.to_pickle('ALL_tweets_24-05.pkl')

# The code below was used for testing the `datetime` module and figuring out how to compare dates

In [None]:
from datetime import datetime, timedelta

In [None]:
# Today's calendar date, without the time of day.
now = datetime.now().date()
now

In [None]:
# Calendar date of the tweet at position 500 in the first member's tweet list.
past = data['ALL_tweets'][0][500].created_at.date()
past

In [None]:
# A fixed reference date (18 May 2018) to compare against.
now2 = datetime(2018, 5, 18).date()
now2

In [None]:
# True when now2 is a later calendar date than past.
now2 > past