# Exam Project Notebook

### Authors: 
### Bjørn Bremholm
### Laura Zeeper
### Christoffer Gade

In [2]:
# Import Packages 

import numpy as np 
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
import tweepy
import webbrowser
import time
from datetime import datetime

# Suppress warning
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Text analysis
import nltk # NLTK: A basic, popular NLP package. 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from sentida import Sentida


# Header to be used in request (contact info)
header = {'Contact' : 'Christofferfoldager@gmail.com'}

# 
pd.set_option('display.max_rows', 10)

## 1. Section - Gather list of twitter accounts 

Description: In this section we will collect a dataframe containing infomation of danish MPs including their twitter accounts...

Websites: 
1. https://www.ft.dk/da/medlemmer/mandatfordelingen - used to collect parties and the official MP's of each party
2. https://filip.sdu.dk/twitter/politikere/ - used to collect the twitter accounts for each MP



In [335]:
# Collect the address link of each party
url = 'https://www.ft.dk/da/medlemmer/mandatfordelingen'

header['Info'] = 'Collecting a list of MP for research project'
page = requests.get(url, verify=False, headers=header)
soup = BeautifulSoup(page.text, 'html.parser')

# Get the links
data = soup.find_all('table')[0]
aref = data.find_all('a')
member_nr = data.find_all('div', {'class':'member-container'})

# Create a list with all relevant infomation 
Parties = []

# defining red and blue 
Red = ['Socialdemokratiet (S)','Socialistisk Folkeparti (SF)','Radikale Venstre (RV)','Enhedslisten (EL)','Alternativet (ALT)']
Blue = ['Det Konservative Folkeparti (KF)', 'Venstre (V)', 'Dansk Folkeparti (DF)', 'Det Konservative Folkeparti (KF)', 'Liberal Alliance (LA)', 'Nye Borgerlige (NB)', 'Kristendemokraterne (KD)']

for i in range(len(aref)):
    if aref[i].text in Red:
        Color = 'Red'
    elif aref[i].text in Blue:
        Color = 'Blue'
    else:
        Color = None
    Parties.append([aref[i].text,Color, aref[i].get('href'),member_nr[i].text])

# Creates a Dataframe    
df_Parties = pd.DataFrame(Parties, columns=['Party','Color','AddressLink','Members'])
df_Parties.head(5)

Unnamed: 0,Party,Color,AddressLink,Members
0,Socialdemokratiet (S),Red,/searchResults.aspx?sortedDescending=false&par...,49
1,Venstre (V),Blue,/searchResults.aspx?sortedDescending=false&par...,39
2,Dansk Folkeparti (DF),Blue,/searchResults.aspx?sortedDescending=false&par...,16
3,Socialistisk Folkeparti (SF),Red,/searchResults.aspx?sortedDescending=false&par...,15
4,Radikale Venstre (RV),Red,/searchResults.aspx?sortedDescending=false&par...,14


In [343]:
# Collect the names of each MP 
ID_list = []

for i in range(len(df_Parties)): # Loop for each party
    Party = df_Parties['Party'][i]
    Color = df_Parties['Color'][i]

    #Get the relevant party page
    link = df_Parties['AddressLink'][i]
    url = f'http://ft.dk{link}&page=1&sortedBy=&pageSize=50'
    # time.sleep(1)
    page = requests.get(url, verify=False, headers=header)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Get the names 
    tables = soup.find_all('tr',{'tabindex':0})

    for i in range(len(tables)): # Loop for each member of the given party 
        FirstName = tables[i].find_all('td')[1].text
        LastName = tables[i].find_all('a')[0].text
        Name = FirstName + ' ' + LastName
        MP = tables[i].find_all('td')[4].text

        # Create a first, last and middle name (for use in merge)
        Name_list = re.findall('\w+',Name)
        First_Name = Name_list[0]
        Last_Name = Name_list[-1]
        Middel_Name = ' '.join(Name_list[1:-1])

        ID_list.append([Name,Party,Color,MP,First_Name,Last_Name,Middel_Name])

df_Name = pd.DataFrame(ID_list, columns=['Name','Party','Color','MP','First Name','Last Name','Middel Name'])

# Give color to the people out of party
Red_Name = ['Susanne Zimmer','Sikandar Siddique','Uffe Elbæk'] 
Blue_Name = ['Simon Emil Ammitzbøll-Bille', 'Lars Løkke Rasmussen', 'Orla Østerby', 'Inger Støjberg']

df_Name['Color'] = np.where((df_Name['Name'].isin(Red_Name)),'Red',df_Name['Color'])
df_Name['Color'] = np.where((df_Name['Name'].isin(Blue_Name)),'Blue',df_Name['Color'])

df_Name.head()

Unnamed: 0,Name,Party,Color,MP,First Name,Last Name,Middel Name
0,Ida Auken,Socialdemokratiet (S),Red,Medlem af Folketinget,Ida,Auken,
1,Trine Bramsen,Socialdemokratiet (S),Red,Medlem af Folketinget,Trine,Bramsen,
2,Bjørn Brandenborg,Socialdemokratiet (S),Red,Medlem af Folketinget,Bjørn,Brandenborg,
3,Jeppe Bruus,Socialdemokratiet (S),Red,Medlem af Folketinget,Jeppe,Bruus,
4,Morten Bødskov,Socialdemokratiet (S),Red,Medlem af Folketinget,Morten,Bødskov,


In [344]:
# Collect the twitter accounts 
url = 'https://filip.sdu.dk/twitter/politikere/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

data = soup.find_all('div', {'class':'col-md-6 show_tweet show_user'}) # Finds the list of all politicians twitter account

Twitter_list = []

for i in range(len(data)):
    info_i = data[i].find_all('div', {'class':'media-body'})
    twitter_id = info_i[0].find('h3').text.split()
    person_id = info_i[0].find('small').text.split(' \nmed ')

    Name = person_id[0]
    Tag = twitter_id[1]
    Følgere = re.search('[0-9.]+[0-9]|\d', person_id[1]).group()

    # Create a first, last and middle name (for merge purpose)
    Name_list = re.findall('\w+',Name)
    First_Name = Name_list[0]
    Last_Name = Name_list[-1]
    Middel_Name = ' '.join(Name_list[1:-1])

    Twitter_list.append([Name,Tag,Følgere, First_Name, Last_Name,Middel_Name])

df_Twitter_id = pd.DataFrame(Twitter_list, columns=['Name','Twitter_id','Følgere','First Name','Last Name','Middel Name'])
df_Twitter_id.head(5)

Unnamed: 0,Name,Twitter_id,Følgere,First Name,Last Name,Middel Name
0,Margrethe Vestager,@vestager,295.359,Margrethe,Vestager,
1,Lars Løkke Rasmussen,@larsloekke,198.367,Lars,Rasmussen,Løkke
2,Pernille Skipper,@PSkipperEL,82.902,Pernille,Skipper,
3,Ida Auken,@IdaAuken,73.928,Ida,Auken,
4,Kristian Jensen,@Kristian_Jensen,62.954,Kristian,Jensen,


In [345]:
# First merge effort
def merge_df(df_name,df_twitter):
    # Merge on first and last name 
    df_new = pd.merge(df_name,df_twitter[['First Name', 'Last Name','Twitter_id','Følgere']],  how='left', on=['First Name','Last Name'])
    # Merge on first and middel name
    df_new1 = pd.merge(df_name, df_twitter[['First Name', 'Last Name','Twitter_id','Følgere']],  how='left', left_on = ['First Name','Middel Name'], right_on=['First Name', 'Last Name'])

    df_new['Twitter_id'] = df_new['Twitter_id'].fillna(df_new1['Twitter_id'])
    df_new['Følgere'] = df_new['Følgere'].fillna(df_new1['Følgere'])

    # We will still miss a few Twitter_id (human error or no twitter account)
    df_new.replace(float('NaN'),'None',inplace=True)

    return df_new

df_info = merge_df(df_Name,df_Twitter_id)
twitter_missing = len(df_info.loc[df_info['Twitter_id'] == 'None'])
print ('We are missing: '+ str(twitter_missing) + ' twitter accounts')


We are missing: 32 twitter accounts


In [121]:
manuel_handles = {'Kaare Dybvad Bek':'@KaareDybvad', 'Karin Gaardsted' : '@KarinGaardsted', 'Ane Halsboe-Jørgensen':'@AneHalsboe', 
                    'Christian Rabjerg Madsen' : '@RabjergMadsen', 'Lars Aslan Rasmussen':'@lars_aslan', 'Pernille Rosenkrantz-Theil' : '@RosenkrantzT',
                    'Kasper Roug':'@KasperRoug', 'Mads  Fuglede':'@madsfuglede', 'Peter Juel-Jensen':'@PeterJuelJensen',
                    'Stén Knuth':'@Sten_Knuth',  'Lars Christian Lilleholt':'@larsclilleholt', 'Kristian Pihl Lorentzen':'@kplorentzen',
                    'Torsten Schack Pedersen':'@Torstenschack', 'Lise Bech':'@LiseBech', 'Jens Henrik Thulesen Dahl':'@JThulesen', 
                    'Mette Hjermind Dencker':'@dfmehd_mette', 'Kirsten Normann Andersen':'@KirstenNormann', 'Karina Lorentzen Dehnhardt':'@MF_K_Lorentzen',
                    'Charlotte Broman Mølbæk':'@charlottebroman', 'Rasmus Nordqvist':'@rasmusnordqvist', 'Trine Torp':'@TrineTorp', 
                    'Sofie Carsten Nielsen':'@sofiecn', 'Rasmus Helveg Petersen':'@rasmushelveg', 'Victoria Velasquez':'@VictoriaV_EL', 
                    'Katarina Ammitzbøll':'@Ammitzboell_K', 'Brigitte Klintskov Jerkel':'@JerkelK', 'Aki-Matilda Høegh-Dam': '@AkiMati_Siumut',
                    'Sjúrður Skaale': '@SjurSkaale', 'Uffe Elbæk':'@uffeelbaek'}

In [346]:
for key in manuel_handles:
    index = df_info[df_info['Name'] == key].index
    df_info.iloc[index[0],7] = manuel_handles[key]

twitter_missing = len(df_info.loc[df_info['Twitter_id'] == 'None'])
print ('We are missing: '+ str(twitter_missing) + ' twitter accounts')
df_info.head()

We are missing: 16 twitter accounts


Unnamed: 0,Name,Party,Color,MP,First Name,Last Name,Middel Name,Twitter_id,Følgere
0,Ida Auken,Socialdemokratiet (S),Red,Medlem af Folketinget,Ida,Auken,,@IdaAuken,73.928
1,Trine Bramsen,Socialdemokratiet (S),Red,Medlem af Folketinget,Trine,Bramsen,,@Trinebramsen,13.517
2,Bjørn Brandenborg,Socialdemokratiet (S),Red,Medlem af Folketinget,Bjørn,Brandenborg,,@BjBrandenborg,2.278
3,Jeppe Bruus,Socialdemokratiet (S),Red,Medlem af Folketinget,Jeppe,Bruus,,@JeppeBruus,4.069
4,Morten Bødskov,Socialdemokratiet (S),Red,Medlem af Folketinget,Morten,Bødskov,,@mfMorten,15.328


In [348]:
df_data = df_info.copy()

# Dropping columns
Columns = ['First Name', 'Last Name','Middel Name']
df_data = df_data.drop(columns=Columns, axis=1)

# Dropping those without twitter_id
df_data = df_data[df_data.Twitter_id != 'None']

# Dropping MPs from greenland and faroe islands
df_data = df_data[df_data.Color != 'None']

# Reindex
df_data = df_data.reset_index(drop=True)

# Create y binary
df_data['y'] = np.where((df_data['Color'] == 'Blue'),1,0)

df_data.head()


Unnamed: 0,Name,Party,Color,MP,Twitter_id,Følgere,y
0,Ida Auken,Socialdemokratiet (S),Red,Medlem af Folketinget,@IdaAuken,73.928,0
1,Trine Bramsen,Socialdemokratiet (S),Red,Medlem af Folketinget,@Trinebramsen,13.517,0
2,Bjørn Brandenborg,Socialdemokratiet (S),Red,Medlem af Folketinget,@BjBrandenborg,2.278,0
3,Jeppe Bruus,Socialdemokratiet (S),Red,Medlem af Folketinget,@JeppeBruus,4.069,0
4,Morten Bødskov,Socialdemokratiet (S),Red,Medlem af Folketinget,@mfMorten,15.328,0


## Section 2 - Collecting Tweets


In [133]:
# Get authentication by twitter and access to their API 
consumer_key = 'k5bWZZyOPUwPYs1BCpSEoBNHQ'
consumer_secret = 'g6HxSQyhhBuZjwWyGcNornDFL7wkmz4wAabdJqqnJyk10T3Q6G'
callback_url = 'oob'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret, callback_url)

try:
    redirect_url = auth.get_authorization_url()
except tweepy.TweepError:
    print('Error! Failed to get request token.')

webbrowser.open(redirect_url)

user_pin_input = input('What is the pin value? ')
auth.get_access_token(user_pin_input)
api = tweepy.API(auth)

In [324]:
# Function gathering the tweets from a specific user/handle
def extract_tweets(handle,name):

    # Input: Handle/user (e.g. @JeppeKofod) which is used to collect tweets
    #
    # Output: Dataframe containing tweets with attaching information 

    # tweets = api.user_timeline(screen_name=handle,count=40,exclude_replies=True,include_rts=False,tweet_mode="extended") 
    tweets = tweepy.Cursor(api.user_timeline,screen_name = handle, tweet_mode='extended', include_rts=False, exclude_replies=True).items()
    tweets_data = []

    for tweet in tweets:
        date = tweet.created_at
        author = tweet.author.screen_name
        likes = tweet.favorite_count
        retweets = tweet.retweet_count
        source = tweet.source
        text = tweet.full_text
        lang = tweet.lang

        tweets_data.append([date,author,name,text,lang,likes,retweets,source])

    df_tweets = pd.DataFrame(tweets_data, columns=['Date','Author','Name','Tweet','Language','Likes','Retweets','Source'])
    return df_tweets 

# Get dataframe of tweets and updated df_data
def get_tweets_df(df):
    
    df = df.reset_index(drop=True)

    df_out = pd.DataFrame()
    tweet_count = []

    for i in range(len(df)):
        
        # Find user
        user = df['Twitter_id'][i]
        name = df['Name'][i]
        
        # Collects tweets from user
        df_tweets = extract_tweets(user,name)
        df_out = df_out.append(df_tweets)

        # Collect total of tweets
        tweet_count.append(len(df_tweets))
        
        # Request limit time sleeper
        time.sleep(90)
    
    df_out = df_out.reset_index(drop=True)
    df['Tweet Count'] = tweet_count

    return df_out,df


In [340]:
########################
dont fucking run this
######################
df1 = df_data.copy()

# Get twitter data
df_twitter_data, df_Name_list = get_tweets_df(df1)

# Saving the dataframes
df_Name_list.to_csv('Name_id_Final.csv')
df_twitter_data.to_csv('Twitter_data_Final.csv')

# Print
df_twitter_data.head(20)

SyntaxError: invalid syntax (<ipython-input-340-0f6344b62595>, line 2)

## 3. Procession text data

In [271]:
# We will collect our saved data
df_Name_id_list = pd.read_csv('Name_id_Final.csv')
df_twitter_data = pd.read_csv('Twitter_data_Final.csv')

pd.set_option('display.max_rows', 20)

In [272]:
# Mergeing the two dataframes 

# Dropping people with out tweets
df_Name_id_list.sort_values('Tweet Count')
a = df_Name_id_list[df_Name_id_list['Tweet Count'] == 0]['Name'].to_list()
print('People with no Tweets: ' + str(a))
df_Name_id_list = df_Name_id_list[~df_Name_id_list['Name'].isin(a)]

# Dropping duplicates
dup = df_Name_id_list[df_Name_id_list.duplicated('Name')]['Name'].to_list()
print('People who enters more than once: ' + str(dup))
df_Name_id_list = df_Name_id_list.drop_duplicates('Name',keep='first')

# Merge certain columns
Col_Name = ['Name','Party','MP','Color','Twitter_id','Følgere','Tweet Count','y']
Col_Tweet = ['Date','Name','Tweet','Language','Likes','Retweets','Source']

# Merge
df_main_preprocessed = pd.merge(df_twitter_data[Col_Tweet],df_Name_id_list[Col_Name],how='left',on='Name')

# Remove english tweets
a = len(df_main_preprocessed)
df_main_preprocessed = df_main_preprocessed[df_main_preprocessed['Language'] != 'en'].reset_index(drop=True)
print('English tweets removed: ' + str(a-len(df_main_preprocessed)))

# Print df
df_main_preprocessed

People with no Tweets: ['Nick Hækkerup', 'Jan Johansen', 'Bjarne Laustsen']
People who enters more than once: ['Søren Egge Rasmussen']


In [None]:
# Function to process the individual tweet
def preprocess(text):
    
    text = re.sub('http\S*','', text) # remove links

    if len(text) in [0,1,2,3,4,5,6] : return ['None']*3

    SV = Sentida()
    sentiment_mean_score = SV.sentida(text, output='mean', normal=True)

    text = re.sub(r'[^\w\#\s]','', text) 
    text = re.sub('\s[0-9]','', text) 

    tokens = nltk.TweetTokenizer().tokenize(text.lower()) # Keeps hashtag

    stop_words_list = nltk.corpus.stopwords.words("danish")
    lemmas = [i for i in tokens if i not in stop_words_list]

    text_new = ' '.join(lemmas)

    return lemmas, text_new, np.round(sentiment_mean_score,2)

# Function that will process every tweet
def process_tweets_data(df):
    
    info_list = []

    for i in range(len(df)):
        text = df['Tweet'][i]
        # print(i)
        stems, final_text, sentiment_mean = preprocess(text)
        
        # Gather infomation to df
        Date = df['Date'][i]
        Name = df['Name'][i]
        Language = df['Language'][i]
        Likes = df['Likes'][i]
        Retweets = df['Retweets'][i]
        Følgere = df['Følgere'][i]
        Tweet_count = df['Tweet Count'][i]
        y = df['y'][i]
        Party = df['Party'][i]

        # Append list
        info_list.append([Date, Name, Party ,Følgere,Tweet_count,text,Language,Likes,Retweets,stems,final_text,sentiment_mean,y])
    
    # Create columns names
    Col_name = ['Date','Name','Party','Følgere','Tweet Count','Tweet','la','Likes','Retweets','Stems','Final Text','Sentiment','y']

    # Create new dataframe
    df_out = pd.DataFrame(info_list, columns=Col_name)
    return df_out


In [276]:

def preprocess(text):
    # lemmas = text.lower()
    text = re.sub('https:[\s\S]*$','', text) # remove links
    text = re.sub(r'[^\w\#\s]','', text) # should we leave hashtag or not

    tokens = nltk.TweetTokenizer().tokenize(text.lower())

    # tokens = nltk.word_tokenize(text.lower())
    # wnl = nltk.WordNetLemmatizer()
    # lemmas = [wnl.lemmatize(t) for t in tokens]

    stop_words_list = nltk.corpus.stopwords.words("danish")
    lemmas = [i for i in tokens if i not in stop_words_list]

    return lemmas # return a list of stems/lemmas


def process_tweets_text(df):

    tweet_process = []


    for i in range(len(df['Tweet'])):
        text = df['Tweet'][i]
        Processed_text = preprocess(text)
        Final_text = ' '.join(Processed_text)

        tweet_process.append([df['Date'][i],df['Name'][i],text,Processed_text,Final_text,df['Language'][i]])

    df_out = pd.DataFrame(tweet_process, columns=['Date','Name','text','Tweet Processed','Final_text','Language'])
    
    return df_out

In [None]:
# Example of the text process
text= df_main_preprocessed['Tweet'][4]

results = preprocess(text)
 
# Text
print('Old text: \n' + text  + '\n')
print('New text (text): ' + str(len(results[1])) + ' list length \n' + str(results[1]) + '\n')
print('New text (Stems): ' + str(len(results[0])) + ' list length \n' + str(results[0]) + '\n')

# Sentiment scores
print('The mean sentiment score is: ' + str(results[2]))


In [None]:
# Create a main dataframe (approx = 6-700 sec.)
df_main_backup = process_tweets_data(df_main_preprocessed)
df_main = df_main_backup.copy()
df_main

In [None]:
# Delete rows with no final text (pictures and emoji)
df_main_done = df_main[df_main['Stems']!='None'].reset_index(drop=True)

# Setting dates to after 2015-01-01
df_main_done['Date'] = pd.to_datetime(df_main_done['Date'],format = '%Y-%m-%d %H:%M:%S')
df_main_done = df_main_done.loc[df_main_done['Date'] >= '2015-1-01'].reset_index(drop=True)


In [None]:
# Create dataframes (Election-periods, the 6-month preelection, corona period)
# Total 
df_blue = df_main_done[df_main_done['y']==1]
df_red  = df_main_done[df_main_done['y']==0]

xb = np.array(df_blue['Sentiment'].to_list())
xr = np.array(df_red['Sentiment'].to_list())

# 2015 election period
df_2017_valg = df_main_done.loc[df_main_done['Date'] >= '2015-06-18'].reset_index(drop=True)
df_2017_valg = df_2017_valg.loc[df_2017_valg['Date'] <= '2019-06-05'].reset_index(drop=True)

df_blue_2017 = df_2017_valg[df_2017_valg['y']==1]
df_red_2017  = df_2017_valg[df_2017_valg['y']==0]

xb_2017 = np.array(df_blue_2017['Sentiment'].to_list())
xr_2017 = np.array(df_red_2017['Sentiment'].to_list())


# 2019 election period
df_2019_valg = df_main_done.loc[df_main_done['Date'] >= '2019-06-05'].reset_index(drop=True)

df_blue_2019 = df_2019_valg[df_2019_valg['y']==1]
df_red_2019  = df_2019_valg[df_2019_valg['y']==0]

xb_2019 = np.array(df_blue_2019['Sentiment'].to_list())
xr_2019 = np.array(df_red_2019['Sentiment'].to_list())

# 2015 election pre period
df_2017_valg_pre = df_main_done.loc[df_main_done['Date'] <= '2015-06-18'].reset_index(drop=True)
df_2017_valg_pre = df_2017_valg_pre.loc[df_2017_valg_pre['Date'] >= '2015-01-01'].reset_index(drop=True)

df_blue_2017_pre = df_2017_valg_pre[df_2017_valg_pre['y']==1]
df_red_2017_pre  = df_2017_valg_pre[df_2017_valg_pre['y']==0]

xb_2017_pre = np.array(df_blue_2017_pre['Sentiment'].to_list())
xr_2017_pre = np.array(df_red_2017_pre['Sentiment'].to_list())

# 2019 election pre period
df_2019_valg_pre = df_main_done.loc[df_main_done['Date'] <= '2019-06-05'].reset_index(drop=True)
df_2019_valg_pre = df_main_done.loc[df_main_done['Date'] >= '2019-01-01'].reset_index(drop=True)

df_blue_2019_pre = df_2019_valg_pre[df_2019_valg_pre['y']==1]
df_red_2019_pre  = df_2019_valg_pre[df_2019_valg_pre['y']==0]

xb_2019_pre = np.array(df_blue_2019_pre['Sentiment'].to_list())
xr_2019_pre = np.array(df_red_2019_pre['Sentiment'].to_list())

# corona period
df_corona = df_main_done.loc[df_main_done['Date'] >= '2020-03-11'].reset_index(drop=True)

df_blue_corona = df_corona[df_corona['y']==1]
df_red_corona  = df_corona[df_corona['y']==0]

xb_corona = np.array(df_blue_corona['Sentiment'].to_list())
xr_corona = np.array(df_red_corona['Sentiment'].to_list())


# Create Plots 

fig = plt.figure(figsize=(8, 4), dpi=200)
fig.subplots_adjust(hspace = 0.5)

ax_Total = fig.add_subplot(2,3,1)
ax_Total.hist(xb,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_Total.hist(xr,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_Total.axvline(0,color='black',linestyle='--',linewidth=0.5)
ax_Total.set_title('Total',size = 'small')

ax_2019 = fig.add_subplot(2,3,2)
ax_2019.hist(xb_2019,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2019.hist(xr_2019,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2019.axvline(0,color='black',linestyle='--',linewidth=0.5)
ax_2019.set_title('5. June 2019 - Present',size='small')

ax_2017 = fig.add_subplot(2,3,3)
ax_2017.hist(xb_2017,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2017.hist(xr_2017,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2017.axvline(0,color='black',linestyle='--',linewidth=0.5)
ax_2017.set_title('18 June 2015 - 5. June 2019',size='small')

ax_Corona = fig.add_subplot(2,3,4)
ax_Corona.hist(xb_corona,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_Corona.hist(xr_corona,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_Corona.axvline(0,color='black',linestyle='--',linewidth=0.5)
ax_Corona.set_title('Corona',size = 'small')

ax_2019_pre = fig.add_subplot(2,3,5)
ax_2019_pre.hist(xb_2019_pre,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2019_pre.hist(xr_2019_pre,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2019_pre.axvline(0,color='black',linestyle='--',linewidth=0.5)
ax_2019_pre.set_title('1. January 2019 - 5. June 2019',size='small')

ax_2017_pre = fig.add_subplot(2,3,6)
ax_2017_pre.hist(xb_2017_pre,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2017_pre.hist(xr_2017_pre,bins=50, histtype= 'stepfilled',alpha= 0.3,density=True)
ax_2017_pre.axvline(0,color='black',linestyle='--',linewidth=0.5)
ax_2017_pre.set_title('1. January 2015 - 18. June 2015',size='small')

In [None]:
# Create a 6 month running average sentiment score across the blocks
def six_month_average(df):
    df_blue = df[df['y']==1]
    df_red = df[df['y']==0]
    blue_sentiment_average_month = []
    red_sentiment_average_month = []
    dates = []
    

    date_list = df['Date'].sort_values().reset_index(drop=True).to_list()
    year, month = date_list[0].year, date_list[0].month
    end_year, end_month = date_list[-1].year, date_list[-1].month

    while (year < end_year) or (month < end_month):
        if month > 12: # execute next year
            year = year + 1 # Dont touch
            month = 1 # Dont touch
        else:
            date = str(year) + '-' + str(month) + '-01'
            if month == 12:
                date_next = str(year+1) + '-1-01'
            else:
                date_next = str(year) + '-' + str(month+1) + '-01'

            dates.append(date)
            
            df_blue_month = df_blue.loc[df_blue['Date'] >= date].reset_index(drop=True)
            df_blue_month = df_blue_month.loc[df_blue_month['Date'] < date_next].reset_index(drop=True)
            df_red_month = df_red.loc[df_red['Date'] >= date].reset_index(drop=True)
            df_red_month = df_red_month.loc[df_red_month['Date'] < date_next].reset_index(drop=True)

            blue_sentiment_average_month.append(df_blue_month['Sentiment'].mean())
            red_sentiment_average_month.append(df_red_month['Sentiment'].mean())

            
            month = month +1 # Dont touch

    blue_moving_average = np.convolve(np.array(blue_sentiment_average_month), np.ones(7), 'valid')/7
    red_moving_average = np.convolve(np.array(red_sentiment_average_month), np.ones(7), 'valid')/7
         

    return blue_moving_average, red_moving_average, blue_sentiment_average_month, red_sentiment_average_month, dates

In [None]:
a = six_month_average(df_main_done)
dates = a[4][3:-3]


fig1 = plt.figure(figsize=(6, 5), dpi=100)


x_blue = np.array(np.arange(len(a[0])))
y_blue = np.array(a[0])
x_red = np.array(np.arange(len(a[1])))
y_red = np.array(a[1])

plt.xticks(x[::12], dates[::12],fontsize=8)

plt.plot(x_blue, y_blue,color='blue')
plt.plot(x_red, y_red,color='red')
plt.title('Sentiment Score (7 month running average)')
plt.axvline(2,linestyle = '--', linewidth = 1, color='black')
plt.axvline(50,linestyle = '--', linewidth = 1, color='black',label='Election')
plt.axvline(59,linestyle = ':', linewidth = 1, color='black',label='Corona')
plt.legend(loc='upper left')
plt.show()

In [None]:
def get_description(array):
    info = []
    obs = stats.describe(array).nobs
    mean = stats.describe(array).mean
    min = stats.describe(array).minmax[0]
    max = stats.describe(array).minmax[1]
    var = stats.describe(array).variance
    skewness = stats.describe(array).skewness
    kurtosis = stats.describe(array).kurtosis

    info.append([obs,mean,var,min,max,skewness,kurtosis])
    return info[0]


db_total, dr_total = get_description(xb), get_description(xr)
db_2019, dr_2019 = get_description(xb_2019), get_description(xr_2019)
db_2017, dr_2017 = get_description(xb_2017), get_description(xr_2017)
db_corona, dr_corona = get_description(xb_corona), get_description(xr_corona)
db_2019_pre, dr_2019_pre = get_description(xb_2019_pre), get_description(xr_2019_pre)
db_2017_pre, dr_2017_pre = get_description(xb_2017_pre), get_description(xr_2017_pre)

time = np.array(['Blå (Total)','Rød (Total)','Blå (corona)','Rød (corona)','Blå (2019-nu)','Rød (2019-nu)','Blå (2019 pre election)','Rød (2019 pre election)','Blå (2015-2019)','Rød (2015-2019)','Blå (2015 pre election)','Rød (2015 pre election)'])
df_info_sentiment = pd.DataFrame([db_total,dr_total,db_corona,dr_corona,db_2019,dr_2019,db_2019_pre, dr_2019_pre, db_2017,dr_2017,db_2017_pre,dr_2017_pre],columns=(['obs','mean','variance','min','max','skeness','kurtosis'])).set_index(time,drop=True)
df_info_sentiment

In [None]:
def find_hashtags(df):
    """
    Function that finds every hashtag in the column 'Tweet' and count the appearances
    """
    counter_hashtag = Counter()

    for i in range(len(df)):
        text = df['Tweet'][i]
        find_h = re.findall('\#\S*',text)
        new = Counter(find_h)
        counter_hashtag = counter_hashtag + new
    
    df_out = pd.DataFrame(dict(counter_hashtag).items(), columns=['Word','Count'])
    return df_out

In [None]:
text = df_main_done.copy()

text = text.loc[text['Date'] >= '2019-06-05'].reset_index(drop=True)

# text = text.loc[text['Date'] >= '2020-03-11'].reset_index(drop=True)
# text = text.loc[text['Date'] <= '2020-12-31'].reset_index(drop=True)

df_hashtags_2019 = find_hashtags(text)

df_hashtags_2019 = df_hashtags_2019.sort_values('Count',ascending=False).reset_index(drop=True)
df_hashtags_2019[0:20]

In [None]:
# Create a new test dataframe for ML 
test = df_main_done.copy()

list_of_words= df_hashtags_2019['Word'][0:20].to_list()

pattern = '|'.join(list_of_words)

df_test = test[test['Final Text'].str.contains(pattern)]
df_test = df_test.loc[df_test['Date'] >= '2019-06-05'].reset_index(drop=True)

df_test

In [None]:
# LONG TWEETS
X, y = df_test['Final Text'], df_test['y']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

vect = CountVectorizer(ngram_range=(1,4), max_features = 200)

X_train = vect.fit_transform(X_train)
X_test = vect.fit_transform(X_test)
print("We have " + str(np.round(sum(y_train)/len(y_train)*100,2)) + '% of blue tweets in our train')
print("We have " + str(np.round(sum(y_test)/len(y_test)*100,2)) + '% of blue tweets in our test')

In [None]:
model = LogisticRegression(max_iter = 1000, solver = 'saga', penalty ='l1', fit_intercept=True ).fit(X_train,y_train)
y_pred = model.predict(X_test)
y_pred = y_pred

In [None]:
print('Our testset is: ' + str(len(y_test)) + ' observation \nTrue: ' + str(y_test.sum()) + ' Blue tweets' )
a = y_pred == y_test
print('We found: ' + str(y_pred.sum()) + ' Blue tweets \nWe predicted: ' + str(np.round(a.mean()*100,2)) + ' % correct tweets overall')
a = Counter(y_pred-y_test)

print('\nWe missed to predict: ' + str(a[-1]) + ' Tweets \n\nOf the predicted Blue (' + str(y_pred.sum()) + ') \nCorrect prediction: ' + str(y_pred.sum()-a[1]) + '\nWrong prediction: ' + str(a[1]))

print('\nOf the predicted Red (' + str(len(y_pred) - y_pred.sum()) + ') \nCorrect prediction: ' + str(len(y_pred) - y_pred.sum()-a[-1]) + '\nWrong prediction: ' + str(a[-1]))

