# Notebook for preprocessing tweets and getting data for the paper
### Anton Elias Holt - exam in Computational linguistics at Aarhus University, spring 2021

In [20]:
import pickle
import pandas as pd
import demoji
import lemmy

from langdetect import detect

In [21]:
data = pd.read_pickle('ALL_tweets_24-05.pkl')
data

Unnamed: 0,Medlem,Parti,Twitter_navn,ALL_tweets,First_tweet,N_tweets
0,Mette Abildgaard,KF,metteabildgaard,[Status(_json={'created_at': 'Mon May 17 18:57...,2018-12-31 22:06:15,375
1,Karina Adsbøl,DF,AdsbolAdsbl,[Status(_json={'created_at': 'Sun May 16 20:44...,2018-12-31 17:06:06,430
2,Tommy Ahlers,V,aahlers,[Status(_json={'created_at': 'Fri May 07 09:04...,2018-12-20 15:04:20,100
3,Alex Ahrendtsen,DF,,,,
4,Marlene Ambo-Rasmussen,V,MarleneAmbo,[Status(_json={'created_at': 'Thu May 20 12:48...,2015-06-17 09:53:44,141
...,...,...,...,...,...,...
174,Lea Wermelin,S,LeaWermelin,[Status(_json={'created_at': 'Mon May 03 14:40...,2018-12-19 15:52:13,186
175,Susanne Zimmer,UFG,Susanne_Zimmer_,[Status(_json={'created_at': 'Sun May 23 05:54...,2019-08-26 19:08:10,1189
176,Fatma Øktem,V,fatmaoektem,[Status(_json={'created_at': 'Wed May 12 08:10...,2018-11-23 10:03:33,105
177,Orla Østerby,UFG,orlaosterby,[Status(_json={'created_at': 'Thu May 09 16:38...,2018-12-17 20:38:50,10


In [22]:
"""Counting the total number of tweets"""
n=0
for list_of_tweets in data['ALL_tweets']:
    if list_of_tweets != "NaN":
        for tweet in list_of_tweets:
            n+=1
print(n)

52427


In [19]:
"""Counting how many Parliament members has a twitter acc?"""
n=0
for tn in data['Twitter_navn']:
    if tn == 'NaN':
        n+=1
print(str(179-n)+'/179 members has a Twitter_account')
print(str(((179-n)/179)*100)[:4],'% has a Twitter account')

159/179 members has a Twitter_account
88.8 % has a Twitter account


In [34]:
"""Getting a list of all parties """

partier = set(data['Parti'])
partier

{'ALT',
 'DF',
 'EL',
 'IA',
 'JF',
 'KF',
 'LA',
 'NB',
 'RV',
 'S',
 'SF',
 'SIU',
 'SP',
 'UFG',
 'V'}

In [8]:
"""This function counts how many tweets are made by each party and creates a Dataframe of the number of tweets of each member
    j specifies the number for defining an active member"""
def find_members(parti,j):
    medlemmer = []
    n_tweets = []
    n_t = 0
    dic = {}
    n_gode = 0
    for i, par10 in enumerate(data['Parti']):
        if par10 == parti:
            medlemmer.append(data['Medlem'][i])
            n_tweets.append(data['N_tweets'][i])
            if data['N_tweets'][i] != 'NaN':
                n_t += data['N_tweets'][i]
                if data['N_tweets'][i] > j:
                    n_gode +=1
            else:
                data['N_tweets'][i] = 0
    dic['Medlem'] = medlemmer
    dic['N_tweets'] = n_tweets
    dic = pd.DataFrame(dic)
    
    
    
    """Print a return message"""
    print(parti, 'has written', n_t,' tweets')
    print(len(medlemmer),'members in the party')
    print(n_gode,'members with more than',j,'tweets')
    dic.sort_values('N_tweets',ascending = False)
    dic.reset_index()[['Medlem','N_tweets']]
    return dic

In [9]:
"""Testing the function on a single party"""
""" This cell has to be run multiple times for an unknown reason...
    Run it a few times and it will eventually work"""
dic= find_members('S',50)
dic

S has written 13640  tweets
49 members in the party
30 members with more than 50 tweets


Unnamed: 0,Medlem,N_tweets
0,Ida Auken,399
1,Kaare Dybvad Bek,0
2,Trine Bramsen,370
3,Bjørn Brandenborg,300
4,Jeppe Bruus,323
5,Morten Bødskov,958
6,Lennart Damsbo-Andersen,0
7,Benny Engelbrecht,1126
8,Camilla Fabricius,678
9,Mette Frederiksen,0


In [16]:
"""Running the function for all parties"""
""" This cell has to be run multiple times for an unknown reason...
    Run it a few times and it will eventually work"""

partier_i_folketinget = set(data['Parti'])
for p in partier_i_folketinget:
    find_members(p,50)
    print('\n')

JF has written 0  tweets
1 members in the party
0 members with more than 50 tweets


SF has written 4871  tweets
15 members in the party
12 members with more than 50 tweets


EL has written 5332  tweets
13 members in the party
12 members with more than 50 tweets


SP has written 0  tweets
1 members in the party
0 members with more than 50 tweets


S has written 13640  tweets
49 members in the party
30 members with more than 50 tweets


NB has written 656  tweets
4 members in the party
4 members with more than 50 tweets


ALT has written 262  tweets
1 members in the party
1 members with more than 50 tweets


SIU has written 32  tweets
1 members in the party
0 members with more than 50 tweets


IA has written 40  tweets
1 members in the party
0 members with more than 50 tweets


KF has written 3385  tweets
13 members in the party
10 members with more than 50 tweets


UFG has written 4128  tweets
8 members in the party
6 members with more than 50 tweets


DF has written 7598  tweets
16 me

In [17]:
"""Selecting the parties that I want to use in the further analysis"""
gode_partier = ['DF', 'EL', 'KF', 'RV', 'S', 'SF', 'V']

## Here i define the functions for preprocessing tweets

In [14]:
def clean(tweet):
    words = tweet.split(' ')
    new_text = ""
    for word in words:
        if "@" not in word and "#" not in word and "http" not in word:
            new_text += word.lower()+' '
    new_text = new_text.replace('\n',' ')
    no_emo = demoji.replace(new_text,'')
    
    while '  ' in no_emo:
        no_emo = no_emo.replace('  ',' ')
    clean = ""
    for char in no_emo:
        if char not in ",.-|'¨_!¤§&/()”:;%=?[’{}]+´*\"“0123456789":
            clean += char
    return clean


stop_ord = "ad af aldrig alene alle allerede alligevel alt altid anden andet andre at bag bare begge blandt blev blive bliver burde bør da de dem den denne dens der derefter deres derfor derfra deri dermed derpå derved det dette dig din dine disse dit dog du efter egen ej eller ellers en end endnu ene eneste enhver ens enten er et f.eks. far fem fik fire flere flest fleste for foran fordi forrige fra fx få får før først gennem gjorde gjort god godt gør gøre gørende ham han hans har havde have hej hel heller helt hen hende hendes henover her herefter heri hermed herpå hos hun hvad hvem hver hvilke hvilken hvilkes hvis hvor hvordan hvorefter hvorfor hvorfra hvorhen hvori hvorimod hvornår hvorved i igen igennem ikke imellem imens imod ind indtil ingen intet ja jeg jer jeres jo kan kom komme kommer kun kunne lad langs lav lave lavet lidt lige ligesom lille længere man mand mange med meget mellem men mens mere mest mig min mindre mindst mine mit mod må måske ned nej nemlig ni nogen nogensinde noget nogle nok nu ny nyt når nær næste næsten og også okay om omkring op os otte over overalt pga på samme sammen se seks selv selvom senere ser ses siden sig sige sin sine sit skal skulle som stadig stor store synes syntes syv så sådan således tag tage temmelig thi ti tidligere til tilbage tit to tre ud uden udover under undtagen var ved vor vore vores vær være været øvrigt "
stop_ord += "amp forslag sikre gerne enig debat støtte kræve problem sætte spørgsmål virkelig mulighed første nytte ville dag dansk måtte regering oppe bruge megen mangen politisk politiker gå går gange parti tak stå sag holde dk tale dansker del menneske hele verden fremme finde rette tro populær mene skriver arbejde folketing uge læsse gode tænke klare krone både håbe visere langt politik ønske følge give lov borger I åre vie talte gælder "
stop_ord += "blot besøg dybt pre nye dfs " + "én ét siger " + "dagen enkelt engang let endda fre dagen " + "løs tjek ment " + "altså " + "tråd tv " + "husk ringe tæt"

lemmatizer = lemmy.load('da')
def lemma(word):
    lem = lemmatizer.lemmatize("",word)[0]
    return lem


def stop(string):
    new_string = ""
    words = string.split(' ')
    for word in words:
        if word not in stop_ord:
            new_string += lemma(word)+' '
    return new_string

In [15]:
"""Testing the cleaning functions"""
print(data['Medlem'][129])
print(data['ALL_tweets'][129][9].created_at,'\n')

org = data['ALL_tweets'][129][9].full_text
print(org,'\n')

cleaned = clean(org)
print(cleaned,'\n')

stopped = stop(cleaned)
print(stopped,'\n')

lemmatized = ""
for word in stopped.split(' '):
    lemmatized += lemma(word) + ' '
print(lemmatized)

Kathrine Olldag
2021-04-23 05:46:04 

Vi har allerede for 4 mdr siden aftalt at "genbesøge" aftalen om grønne biler i 2025. Det vil være en god anledning til at kigge på om det stadig giver mening at  kalde hybridbiler "grønne"? 
Jeg hælder allerede nu til et NEJ! 🌱🚗⚡️
#dkpol @radikale 
https://t.co/JFRxBAgkr9 

vi har allerede for  mdr siden aftalt at genbesøge aftalen om grønne biler i  det vil være en god anledning til at kigge på om det stadig giver mening at kalde hybridbiler grønne jeg hælder allerede nu til et nej  

mdr aftalt genbesøge aftalen grønne biler anledning kigge giver mening kalde hybridbiler grønne hælder  

mdr aftale genbesøge aftale grøn bile anledning kigge giver mening kalde hybridbil grøn hælde  


In [None]:
""" This function is commented out because it doesn't remove english tweets"""

# def create_tuples(list_of_lists, parti):
#     list_of_tuples = []
#     for list1 in list_of_lists:
#         if list1 != 'NaN':
#             for tweet in list1:
#                 #list_of_tuples.append((tweet.full_text,parti))
#                 """Tilføjet cleaning functioner"""
#                 cleaned = stop(clean(tweet.full_text))
#                 """Only adds it, if there are more than 3 words"""
#                 if len(cleaned.split(' ')) > 3:
#                     list_of_tuples.append((cleaned, parti))
#     return list_of_tuples

In [20]:
"""This function creates a big list of tuples with the preprocessed texts and the party affiliation of the member who wrote it"""

def create_tuples2(list_of_lists, parti):
    list_of_tuples = []
    n=0
    for list1 in list_of_lists:
        
        if list1 != 'NaN':
            for tweet in list1:
                #list_of_tuples.append((tweet.full_text,parti))
                """Testing if the tweet is in danish"""
                n+=1
                text = tweet.full_text
                if len(text.split(' '))>3:
                    try:
                        lang = detect(text)
                    except:
                        lang == 'noget_som_ikke_er_en'
                    finally:
                        if lang != 'en':
                            """Tilføjet cleaning functioner"""
                            cleaned = stop(clean(text))
                            """Only adds it, if there are more than 3 words"""
                            if len(cleaned.split(' ')) > 3:
                                list_of_tuples.append((cleaned, parti))
    print(parti)
    print('total number of tweets:',n)
    print('total after cleaning:  ',len(list_of_tuples),'\n')
    return list_of_tuples

In [21]:
"""Here I test the function on a single party"""

V = data.loc[data['Parti'] == 'V']
tup = create_tuples2(V['ALL_tweets'],'V')
print(len(tup))
print(tup)

V
total number of tweets: 7478
total after cleaning:   6887 

6887
[('underligt forløb forhandlinger regi klimalov brede inviteret sagt lavt manglet svar sætter viden troede fælles klimaloven ', 'V'), ('svenskerne giver baghjul risikovillig kapital bedre ambitiøse iværksættere danmark ', 'V'), ('back ekskluderet twitter uger skrevet klimarådet dumper regeringen vildt desværre forventet dræbe nattelivet kbh dumt overraskende genåbningen langsomt træls ', 'V'), ('kronik skrevet overlæge psykiatrisk akutmodtagelse balancen skredet coronahåndteringen kuren nedlukning ender skade gavner nødt modige åbne gradvist ', 'V'), ('endelig samlet bud danskere lade elbil umiddelbart fornuftigt høj grad baseret markedet politikere fjerne hindringer forstår betalingsring kbh nemmere lade bil ', 'V'), ('stærkt græde forretning følelser kræver personlig involvering starte drive ondt muligt stærkt deler hils kram ', 'V'), ('aftalen energiø km nordsøen plads enige navnet navnekonkurrence twitter øen hedde 

In [27]:
"""Creating a big list containing lists of Tuples. This is the data I used for Wordclouds and LDA models
    OBS this cell takes a while to run (aprx. 20 min)"""

big_list_of_tuples = []
for parti in gode_partier:
    ram_data = data.loc[data['Parti']==parti]
    big_list_of_tuples.append(create_tuples2(ram_data['ALL_tweets'],parti))

DF
total number of tweets: 7598
total after cleaning:   6811 

EL
total number of tweets: 5332
total after cleaning:   4955 

KF
total number of tweets: 3385
total after cleaning:   3182 

RV
total number of tweets: 4654
total after cleaning:   4143 

S
total number of tweets: 13640
total after cleaning:   11480 

SF
total number of tweets: 4871
total after cleaning:   4551 

V
total number of tweets: 7478
total after cleaning:   6888 



In [28]:
"""Creating a pandas DataFrame to save the data"""

tuples_dataframe = pd.DataFrame(index = gode_partier)
tuples_dataframe['Tuples'] = big_list_of_tuples

tuples_dataframe

Unnamed: 0,Tuples
DF,[(bunds kvik test landets plejehjem prioritere...
EL,[(endelig våbenhvile israelpalæstina sikres bæ...
KF,"[(ærligt bedst orker bøvl indkøbspoletter , KF..."
RV,[(diskriminerer egne statsborgere bære selvføl...
S,[(fantastisk inspirerende møde danskere forudg...
SF,[(stop stigmatisering psykisk sygdom ufattelig...
V,[(underligt forløb forhandlinger regi klimalov...


In [None]:
"""Actually saving the data"""

tuples_dataframe.to_pickle('big_tuples_24-05.pkl')

In [35]:
"""Calculating the average number of words pr tweet in the Dataframe"""
n_words_in_each_tweet = []

for list_of_tweets in tuples_dataframe['Tuples']:
    for tweet in list_of_tweets:
        n_words_in_each_tweet.append(len(tweet[0].split(' ')))
        
average_number_of_words = sum(n_words_in_each_tweet)/len(n_words_in_each_tweet)
average_number_of_words

12.235586765055938

In [29]:
"""Counting mow many tweets are left after cleaning"""
n = 0
for list_of_tweets in tuples_dataframe['Tuples']:
    for tweet in list_of_tweets:
        n+=1
print(n)

42010


In [31]:
"""Counting the number of tweets for each party"""
for p in gode_partier:
    print(len(tuples_dataframe['Tuples'][p]))

6811
4955
3182
4143
11480
4551
6888
