# Exam Project Notebook

### Authors: 
### Bjørn Bremholm
### Laura Zeeper
### Christoffer Gade

In [1]:
# Import Packages 

import numpy as np 
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
import tweepy
import webbrowser
import time
from datetime import datetime

# Suppress warning
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Text analysis
import nltk # NLTK: A basic, popular NLP package. 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


# Header to be used in request (contact info)
header = {'Contact' : 'Christofferfoldager@gmail.com'}

# 
pd.set_option('display.max_rows', 10)

## 1. Section - Gather list of twitter accounts 

Description: In this section we will collect a dataframe containing infomation of danish MPs including their twitter accounts...

Websites: 
1. https://www.ft.dk/da/medlemmer/mandatfordelingen - used to collect parties and the official MP's of each party
2. https://filip.sdu.dk/twitter/politikere/ - used to collect the twitter accounts for each MP



In [335]:
# Collect the address link of each party
url = 'https://www.ft.dk/da/medlemmer/mandatfordelingen'

header['Info'] = 'Collecting a list of MP for research project'
page = requests.get(url, verify=False, headers=header)
soup = BeautifulSoup(page.text, 'html.parser')

# Get the links
data = soup.find_all('table')[0]
aref = data.find_all('a')
member_nr = data.find_all('div', {'class':'member-container'})

# Create a list with all relevant infomation 
Parties = []

# defining red and blue 
Red = ['Socialdemokratiet (S)','Socialistisk Folkeparti (SF)','Radikale Venstre (RV)','Enhedslisten (EL)','Alternativet (ALT)']
Blue = ['Det Konservative Folkeparti (KF)', 'Venstre (V)', 'Dansk Folkeparti (DF)', 'Det Konservative Folkeparti (KF)', 'Liberal Alliance (LA)', 'Nye Borgerlige (NB)', 'Kristendemokraterne (KD)']

for i in range(len(aref)):
    if aref[i].text in Red:
        Color = 'Red'
    elif aref[i].text in Blue:
        Color = 'Blue'
    else:
        Color = None
    Parties.append([aref[i].text,Color, aref[i].get('href'),member_nr[i].text])

# Creates a Dataframe    
df_Parties = pd.DataFrame(Parties, columns=['Party','Color','AddressLink','Members'])
df_Parties.head(5)

Unnamed: 0,Party,Color,AddressLink,Members
0,Socialdemokratiet (S),Red,/searchResults.aspx?sortedDescending=false&par...,49
1,Venstre (V),Blue,/searchResults.aspx?sortedDescending=false&par...,39
2,Dansk Folkeparti (DF),Blue,/searchResults.aspx?sortedDescending=false&par...,16
3,Socialistisk Folkeparti (SF),Red,/searchResults.aspx?sortedDescending=false&par...,15
4,Radikale Venstre (RV),Red,/searchResults.aspx?sortedDescending=false&par...,14


In [343]:
# Collect the names of each MP 
ID_list = []

for i in range(len(df_Parties)): # Loop for each party
    Party = df_Parties['Party'][i]
    Color = df_Parties['Color'][i]

    #Get the relevant party page
    link = df_Parties['AddressLink'][i]
    url = f'http://ft.dk{link}&page=1&sortedBy=&pageSize=50'
    # time.sleep(1)
    page = requests.get(url, verify=False, headers=header)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Get the names 
    tables = soup.find_all('tr',{'tabindex':0})

    for i in range(len(tables)): # Loop for each member of the given party 
        FirstName = tables[i].find_all('td')[1].text
        LastName = tables[i].find_all('a')[0].text
        Name = FirstName + ' ' + LastName
        MP = tables[i].find_all('td')[4].text

        # Create a first, last and middle name (for use in merge)
        Name_list = re.findall('\w+',Name)
        First_Name = Name_list[0]
        Last_Name = Name_list[-1]
        Middel_Name = ' '.join(Name_list[1:-1])

        ID_list.append([Name,Party,Color,MP,First_Name,Last_Name,Middel_Name])

df_Name = pd.DataFrame(ID_list, columns=['Name','Party','Color','MP','First Name','Last Name','Middel Name'])

# Give color to the people out of party
Red_Name = ['Susanne Zimmer','Sikandar Siddique','Uffe Elbæk'] 
Blue_Name = ['Simon Emil Ammitzbøll-Bille', 'Lars Løkke Rasmussen', 'Orla Østerby', 'Inger Støjberg']

df_Name['Color'] = np.where((df_Name['Name'].isin(Red_Name)),'Red',df_Name['Color'])
df_Name['Color'] = np.where((df_Name['Name'].isin(Blue_Name)),'Blue',df_Name['Color'])

df_Name.head()

Unnamed: 0,Name,Party,Color,MP,First Name,Last Name,Middel Name
0,Ida Auken,Socialdemokratiet (S),Red,Medlem af Folketinget,Ida,Auken,
1,Trine Bramsen,Socialdemokratiet (S),Red,Medlem af Folketinget,Trine,Bramsen,
2,Bjørn Brandenborg,Socialdemokratiet (S),Red,Medlem af Folketinget,Bjørn,Brandenborg,
3,Jeppe Bruus,Socialdemokratiet (S),Red,Medlem af Folketinget,Jeppe,Bruus,
4,Morten Bødskov,Socialdemokratiet (S),Red,Medlem af Folketinget,Morten,Bødskov,


In [344]:
# Collect the twitter accounts 
url = 'https://filip.sdu.dk/twitter/politikere/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

data = soup.find_all('div', {'class':'col-md-6 show_tweet show_user'}) # Finds the list of all politicians twitter account

Twitter_list = []

for i in range(len(data)):
    info_i = data[i].find_all('div', {'class':'media-body'})
    twitter_id = info_i[0].find('h3').text.split()
    person_id = info_i[0].find('small').text.split(' \nmed ')

    Name = person_id[0]
    Tag = twitter_id[1]
    Følgere = re.search('[0-9.]+[0-9]|\d', person_id[1]).group()

    # Create a first, last and middle name (for merge purpose)
    Name_list = re.findall('\w+',Name)
    First_Name = Name_list[0]
    Last_Name = Name_list[-1]
    Middel_Name = ' '.join(Name_list[1:-1])

    Twitter_list.append([Name,Tag,Følgere, First_Name, Last_Name,Middel_Name])

df_Twitter_id = pd.DataFrame(Twitter_list, columns=['Name','Twitter_id','Følgere','First Name','Last Name','Middel Name'])
df_Twitter_id.head(5)

Unnamed: 0,Name,Twitter_id,Følgere,First Name,Last Name,Middel Name
0,Margrethe Vestager,@vestager,295.359,Margrethe,Vestager,
1,Lars Løkke Rasmussen,@larsloekke,198.367,Lars,Rasmussen,Løkke
2,Pernille Skipper,@PSkipperEL,82.902,Pernille,Skipper,
3,Ida Auken,@IdaAuken,73.928,Ida,Auken,
4,Kristian Jensen,@Kristian_Jensen,62.954,Kristian,Jensen,


In [345]:
# First merge effort
def merge_df(df_name,df_twitter):
    # Merge on first and last name 
    df_new = pd.merge(df_name,df_twitter[['First Name', 'Last Name','Twitter_id','Følgere']],  how='left', on=['First Name','Last Name'])
    # Merge on first and middel name
    df_new1 = pd.merge(df_name, df_twitter[['First Name', 'Last Name','Twitter_id','Følgere']],  how='left', left_on = ['First Name','Middel Name'], right_on=['First Name', 'Last Name'])

    df_new['Twitter_id'] = df_new['Twitter_id'].fillna(df_new1['Twitter_id'])
    df_new['Følgere'] = df_new['Følgere'].fillna(df_new1['Følgere'])

    # We will still miss a few Twitter_id (human error or no twitter account)
    df_new.replace(float('NaN'),'None',inplace=True)

    return df_new

df_info = merge_df(df_Name,df_Twitter_id)
twitter_missing = len(df_info.loc[df_info['Twitter_id'] == 'None'])
print ('We are missing: '+ str(twitter_missing) + ' twitter accounts')


We are missing: 32 twitter accounts


In [121]:
manuel_handles = {'Kaare Dybvad Bek':'@KaareDybvad', 'Karin Gaardsted' : '@KarinGaardsted', 'Ane Halsboe-Jørgensen':'@AneHalsboe', 
                    'Christian Rabjerg Madsen' : '@RabjergMadsen', 'Lars Aslan Rasmussen':'@lars_aslan', 'Pernille Rosenkrantz-Theil' : '@RosenkrantzT',
                    'Kasper Roug':'@KasperRoug', 'Mads  Fuglede':'@madsfuglede', 'Peter Juel-Jensen':'@PeterJuelJensen',
                    'Stén Knuth':'@Sten_Knuth',  'Lars Christian Lilleholt':'@larsclilleholt', 'Kristian Pihl Lorentzen':'@kplorentzen',
                    'Torsten Schack Pedersen':'@Torstenschack', 'Lise Bech':'@LiseBech', 'Jens Henrik Thulesen Dahl':'@JThulesen', 
                    'Mette Hjermind Dencker':'@dfmehd_mette', 'Kirsten Normann Andersen':'@KirstenNormann', 'Karina Lorentzen Dehnhardt':'@MF_K_Lorentzen',
                    'Charlotte Broman Mølbæk':'@charlottebroman', 'Rasmus Nordqvist':'@rasmusnordqvist', 'Trine Torp':'@TrineTorp', 
                    'Sofie Carsten Nielsen':'@sofiecn', 'Rasmus Helveg Petersen':'@rasmushelveg', 'Victoria Velasquez':'@VictoriaV_EL', 
                    'Katarina Ammitzbøll':'@Ammitzboell_K', 'Brigitte Klintskov Jerkel':'@JerkelK', 'Aki-Matilda Høegh-Dam': '@AkiMati_Siumut',
                    'Sjúrður Skaale': '@SjurSkaale', 'Uffe Elbæk':'@uffeelbaek'}

In [346]:
for key in manuel_handles:
    index = df_info[df_info['Name'] == key].index
    df_info.iloc[index[0],7] = manuel_handles[key]

twitter_missing = len(df_info.loc[df_info['Twitter_id'] == 'None'])
print ('We are missing: '+ str(twitter_missing) + ' twitter accounts')
df_info.head()

We are missing: 16 twitter accounts


Unnamed: 0,Name,Party,Color,MP,First Name,Last Name,Middel Name,Twitter_id,Følgere
0,Ida Auken,Socialdemokratiet (S),Red,Medlem af Folketinget,Ida,Auken,,@IdaAuken,73.928
1,Trine Bramsen,Socialdemokratiet (S),Red,Medlem af Folketinget,Trine,Bramsen,,@Trinebramsen,13.517
2,Bjørn Brandenborg,Socialdemokratiet (S),Red,Medlem af Folketinget,Bjørn,Brandenborg,,@BjBrandenborg,2.278
3,Jeppe Bruus,Socialdemokratiet (S),Red,Medlem af Folketinget,Jeppe,Bruus,,@JeppeBruus,4.069
4,Morten Bødskov,Socialdemokratiet (S),Red,Medlem af Folketinget,Morten,Bødskov,,@mfMorten,15.328


In [348]:
df_data = df_info.copy()

# Dropping columns
Columns = ['First Name', 'Last Name','Middel Name']
df_data = df_data.drop(columns=Columns, axis=1)

# Dropping those without twitter_id
df_data = df_data[df_data.Twitter_id != 'None']

# Dropping MPs from greenland and faroe islands
df_data = df_data[df_data.Color != 'None']

# Reindex
df_data = df_data.reset_index(drop=True)

# Create y binary
df_data['y'] = np.where((df_data['Color'] == 'Blue'),1,0)

df_data.head()


Unnamed: 0,Name,Party,Color,MP,Twitter_id,Følgere,y
0,Ida Auken,Socialdemokratiet (S),Red,Medlem af Folketinget,@IdaAuken,73.928,0
1,Trine Bramsen,Socialdemokratiet (S),Red,Medlem af Folketinget,@Trinebramsen,13.517,0
2,Bjørn Brandenborg,Socialdemokratiet (S),Red,Medlem af Folketinget,@BjBrandenborg,2.278,0
3,Jeppe Bruus,Socialdemokratiet (S),Red,Medlem af Folketinget,@JeppeBruus,4.069,0
4,Morten Bødskov,Socialdemokratiet (S),Red,Medlem af Folketinget,@mfMorten,15.328,0


## Section 2 - Collecting Tweets


In [133]:
# Get authentication by twitter and access to their API 
consumer_key = 'k5bWZZyOPUwPYs1BCpSEoBNHQ'
consumer_secret = 'g6HxSQyhhBuZjwWyGcNornDFL7wkmz4wAabdJqqnJyk10T3Q6G'
callback_url = 'oob'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret, callback_url)

try:
    redirect_url = auth.get_authorization_url()
except tweepy.TweepError:
    print('Error! Failed to get request token.')

webbrowser.open(redirect_url)

user_pin_input = input('What is the pin value? ')
auth.get_access_token(user_pin_input)
api = tweepy.API(auth)

In [324]:
# Function gathering the tweets from a specific user/handle
def extract_tweets(handle,name):

    # Input: Handle/user (e.g. @JeppeKofod) which is used to collect tweets
    #
    # Output: Dataframe containing tweets with attaching information 

    # tweets = api.user_timeline(screen_name=handle,count=40,exclude_replies=True,include_rts=False,tweet_mode="extended") 
    tweets = tweepy.Cursor(api.user_timeline,screen_name = handle, tweet_mode='extended', include_rts=False, exclude_replies=True).items()
    tweets_data = []

    for tweet in tweets:
        date = tweet.created_at
        author = tweet.author.screen_name
        likes = tweet.favorite_count
        retweets = tweet.retweet_count
        source = tweet.source
        text = tweet.full_text
        lang = tweet.lang

        tweets_data.append([date,author,name,text,lang,likes,retweets,source])

    df_tweets = pd.DataFrame(tweets_data, columns=['Date','Author','Name','Tweet','Language','Likes','Retweets','Source'])
    return df_tweets 

# Get dataframe of tweets and updated df_data
def get_tweets_df(df):
    
    df = df.reset_index(drop=True)

    df_out = pd.DataFrame()
    tweet_count = []

    for i in range(len(df)):
        
        # Find user
        user = df['Twitter_id'][i]
        name = df['Name'][i]
        
        # Collects tweets from user
        df_tweets = extract_tweets(user,name)
        df_out = df_out.append(df_tweets)

        # Collect total of tweets
        tweet_count.append(len(df_tweets))
        
        # Request limit time sleeper
        time.sleep(90)
    
    df_out = df_out.reset_index(drop=True)
    df['Tweet Count'] = tweet_count

    return df_out,df


In [340]:
########################
dont fucking run this
######################
df1 = df_data.copy()

# Get twitter data
df_twitter_data, df_Name_list = get_tweets_df(df1)

# Saving the dataframes
df_Name_list.to_csv('Name_id_Final.csv')
df_twitter_data.to_csv('Twitter_data_Final.csv')

# Print
df_twitter_data.head(20)

SyntaxError: invalid syntax (<ipython-input-340-0f6344b62595>, line 2)

## 3. Procession text data

In [107]:
# We will collect our saved data
df_Name_id_list = pd.read_csv('Name_id_Final.csv')
df_twitter_data = pd.read_csv('Twitter_data_Final.csv')

pd.set_option('display.max_rows', 20)

In [162]:
df_Name_id = df_Name_id_list.copy()

# Dropping people with out tweets
df_Name_id.sort_values('Tweet Count')
a = df_Name_id[df_Name_id['Tweet Count'] == 0]['Name'].to_list()
print('People with no Tweets: ' + str(a))
df_Name_id = df_Name_id[~df_Name_id['Name'].isin(a)]

# Dropping duplicates
dup = df_Name_id[df_Name_id.duplicated('Name')]['Name'].to_list()
print('People who enters more than once: ' + str(dup))
df_Name_id = df_Name_id.drop_duplicates('Name',keep='first')


People with no Tweets: ['Nick Hækkerup', 'Jan Johansen', 'Bjarne Laustsen']
People who enters more than once: ['Søren Egge Rasmussen']


159

In [109]:

def preprocess(text):
    # lemmas = text.lower()
    text = re.sub('https:[\s\S]*$','', text) # remove links
    text = re.sub(r'[^\w\#\s]','', text) # should we leave hashtag or not

    tokens = nltk.TweetTokenizer().tokenize(text.lower())

    # tokens = nltk.word_tokenize(text.lower())
    # wnl = nltk.WordNetLemmatizer()
    # lemmas = [wnl.lemmatize(t) for t in tokens]

    stop_words_list = nltk.corpus.stopwords.words("danish")
    lemmas = [i for i in tokens if i not in stop_words_list]

    return lemmas # return a list of stems/lemmas


def process_tweets_text(df):

    tweet_process = []


    for i in range(len(df['Tweet'])):
        text = df['Tweet'][i]
        Processed_text = preprocess(text)
        Final_text = ' '.join(Processed_text)

        tweet_process.append([df['Date'][i],df['Name'][i],Processed_text,Final_text,df['Language'][i]])

    df_out = pd.DataFrame(tweet_process, columns=['Date','Name','Tweet Processed','Final_text','Language'])
    
    return df_out

In [5]:
# Process tweets
df_tweet_processed = process_tweets_text(df_twitter_data)

# Create a main dataframe with all information
Col = ['Name','Party','MP','Color','Twitter_id','Følgere','Tweet Count','y']
df_main = pd.merge(df_Name_id[Col],df_tweet_processed,how='left',on='Name')
df_main.head(5)

Unnamed: 0,Name,Party,MP,Color,Twitter_id,Følgere,Tweet Count,y,Date,Tweet Processed,Final_text,Language
0,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-23 14:20:48,"[grinet, flere, dage, så, får, passer, desværr...",grinet flere dage så får passer desværre midt ...,da
1,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-12 17:31:49,"[uefa, meddeler, eriksen, stabiliseret, hurra,...",uefa meddeler eriksen stabiliseret hurra #em2021,no
2,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-12 17:26:59,"[sandt, #em2020]",sandt #em2020,da
3,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-09 08:36:50,"[svært, åbenhjertig, ovor, så, skøn, samtalepa...",svært åbenhjertig ovor så skøn samtalepartner ...,da
4,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-06 12:18:23,[tillykke],tillykke,da


In [166]:
df_test = df_main.copy()
df_test = df_test[df_test['Language'] != 'en'].reset_index(drop=True)

df_test['Date'] = pd.to_datetime(df_test['Date'],format = '%Y-%m-%d %H:%M:%S')
df_test = df_test.loc[df_test['Date'] >= '2017-01-01'].reset_index(drop=True)

df_test = df_test.loc[df_test['Tweet Processed'].str.len() >= 2].reset_index(drop=True)

print('We have ' + str(np.round(sum(df_test['y'])/len(df_test),3)*100) + '% red in our dataframe')
df_test

We have 41.099999999999994% red in our dataframe


Unnamed: 0,Name,Party,MP,Color,Twitter_id,Følgere,Tweet Count,y,Date,Tweet Processed,Final_text,Language
0,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-23 14:20:48,"[grinet, flere, dage, så, får, passer, desværr...",grinet flere dage så får passer desværre midt ...,da
1,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-12 17:31:49,"[uefa, meddeler, eriksen, stabiliseret, hurra,...",uefa meddeler eriksen stabiliseret hurra #em2021,no
2,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-12 17:26:59,"[sandt, #em2020]",sandt #em2020,da
3,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-09 08:36:50,"[svært, åbenhjertig, ovor, så, skøn, samtalepa...",svært åbenhjertig ovor så skøn samtalepartner ...,da
4,Ida Auken,Socialdemokratiet (S),Medlem af Folketinget,Red,@IdaAuken,73.926,879,0,2021-06-03 08:24:02,"[skrevet, lidt, område, virkelig, halter, bagu...",skrevet lidt område virkelig halter bagud fht ...,da
...,...,...,...,...,...,...,...,...,...,...,...,...
71975,Orla Østerby,Uden for folketingsgrupperne (UFG),Medlem af Folketinget,Blue,@orlaosterby,596,51,1,2017-05-08 05:03:36,"[skræmmende, undersøgelse, fortjener, ingen, b...",skræmmende undersøgelse fortjener ingen børn #...,da
71976,Orla Østerby,Uden for folketingsgrupperne (UFG),Medlem af Folketinget,Blue,@orlaosterby,596,51,1,2017-05-04 08:17:37,"[morgensang, samtaleværelset, netop, dag, 04, ...",morgensang samtaleværelset netop dag 04 maj sa...,da
71977,Orla Østerby,Uden for folketingsgrupperne (UFG),Medlem af Folketinget,Blue,@orlaosterby,596,51,1,2017-05-02 15:36:22,"[godt, nyt, skattesatsen, sænkes, 1, procent, ...",godt nyt skattesatsen sænkes 1 procent 055 pro...,da
71978,Orla Østerby,Uden for folketingsgrupperne (UFG),Medlem af Folketinget,Blue,@orlaosterby,596,51,1,2017-04-26 05:58:20,"[dagtilbudslederen, overveje, hvordan, kan, in...",dagtilbudslederen overveje hvordan kan inddrag...,da


In [216]:
# Create a dataframe with one sting of tweets 

def get_new_df(df_main):
    
    names = df_main['Name'].unique().tolist()

    final_text_long = []

    for name in names:
        list_of_tweets = df_main['Final_text'][df_main['Name'] == name]
        text = ' '.join(list_of_tweets)
        final_text_long.append(text)

    # data = ([final_text_long,names])

    df_names = pd.DataFrame(names, columns=['Name'])
    df_tweets_long = pd.DataFrame(final_text_long, columns=['Final_text_long'])

    df_out = pd.concat([df_names, df_tweets_long], axis=1)

    return df_out


In [218]:
df_new_smart = get_new_df(df_test)
df_new_smart

df_y = df_test.drop_duplicates('Name')
df_y

# pd.unique(df_test[['Name']])

df_long_tweets = pd.merge(df_new_smart, df_y[['Name', 'y', ]], how='left', on=['Name'])
df_long_tweets

Unnamed: 0,Name,Final_text_long,y
0,Ida Auken,grinet flere dage så får passer desværre midt ...,0
1,Trine Bramsen,dybeste respekt ære minde godt byde guldholdet...,0
2,Bjørn Brandenborg,kæmpe tillykke 93388 søgt optagelse videregåen...,0
3,Jeppe Bruus,jamen fremstår mere mere sympatisk professione...,0
4,Morten Bødskov,går rigtige vej nye ejendomsvurderinger vej fo...,0
...,...,...,...
150,Uffe Elbæk,kan forstå silberg_mads skriver ser søren pape...,0
151,Lars Løkke Rasmussen,vigtig indspark debatten år år efterlader børn...,1
152,Sikandar Siddique,må simpelthen få konsekvenser to ministre henh...,0
153,Susanne Zimmer,dårlige klimaforandringerne ekskalerer gode ka...,0


In [204]:
a = df_test['y'][df_test['Name']=='Ida Auken']



0      0
1      0
2      0
3      0
4      0
      ..
707    0
708    0
709    0
710    0
711    0
Name: y, Length: 712, dtype: int64

In [202]:
# Create a dataframe with one sting of tweets 

def get_new_df(df_main):
    
    names = df_main['Name'].unique().tolist()

    Info_list = []

    for name in names:
        list_of_tweets = df_main['Final_text'][df_main['Name'] == name]
        text = ' '.join(list_of_tweets)
        y = df_main['y'][df_main['Name'] == name]

        Info_list.append([name,text,y])

    df_out = pd.DataFrame(Info_list, columns=(['Name','text','y']))

    return df_out

In [198]:
df_new_smart = get_new_df(df_test)
df_new_smart



Unnamed: 0,Name,text,y
0,Ida Auken,grinet flere dage så får passer desværre midt ...,0 0 1 0 2 0 3 0 4 0  ...
1,Trine Bramsen,dybeste respekt ære minde godt byde guldholdet...,712 0 713 0 714 0 715 0 716 ...
2,Bjørn Brandenborg,kæmpe tillykke 93388 søgt optagelse videregåen...,1455 0 1456 0 1457 0 1458 0 1459 ...
3,Jeppe Bruus,jamen fremstår mere mere sympatisk professione...,1741 0 1742 0 1743 0 1744 0 1745 ...
4,Morten Bødskov,går rigtige vej nye ejendomsvurderinger vej fo...,2040 0 2041 0 2042 0 2043 0 2044 ...
...,...,...,...
150,Uffe Elbæk,kan forstå silberg_mads skriver ser søren pape...,68266 0 68267 0 68268 0 68269 0 68...
151,Lars Løkke Rasmussen,vigtig indspark debatten år år efterlader børn...,68705 1 68706 1 68707 1 68708 1 68...
152,Sikandar Siddique,må simpelthen få konsekvenser to ministre henh...,69560 0 69561 0 69562 0 69563 0 69...
153,Susanne Zimmer,dårlige klimaforandringerne ekskalerer gode ka...,70835 0 70836 0 70837 0 70838 0 70...


In [185]:
df_y = df_test.unique()

pd.unique(df_test[['Name']])

# df_long_tweets = pd.merge(df_new_smart, df_test[['Name', 'y', ]], how='left', on=['Name'])
# df_long_tweets

AttributeError: 'DataFrame' object has no attribute 'unique'

In [158]:
test = df_main.copy()
print(test['Final_text'][0])
# test = test[:5]
name = df_main['Name'][0]


b = test['Final_text'][test['Name'] == name]
text = ' '.join(b)
print(b)
text
# test

grinet flere dage så får passer desværre midt godt moderne menneskes natursyn
0      grinet flere dage så får passer desværre midt ...
1       uefa meddeler eriksen stabiliseret hurra #em2021
2                                          sandt #em2020
3      svært åbenhjertig ovor så skøn samtalepartner ...
4                                               tillykke
                             ...                        
874                                       emilie stjerne
875                                             gæt sang
876    kører frugter klar svømmetur #metalskolen radi...
877    forstår desperationen løsningen ødelægge friog...
878                         resultatet fantastisk anyway
Name: Final_text, Length: 879, dtype: object




In [265]:
# LONG TWEETS
X, y = df_long_tweets['Final_text_long'], df_long_tweets['y']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

vect = CountVectorizer(ngram_range=(1,1), max_features = 20)

X_train = vect.fit_transform(X_train)
X_test = vect.fit_transform(X_test)
print("We have " + str(np.round(sum(y_test)/len(y_test),2)*100) + '% of red in our test')

We have 43.0% of red in our test


In [259]:

X, y = df_test['Final_text'], df_test['y']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

vect = CountVectorizer(ngram_range=(1,1), max_features = 1000)

X_train = vect.fit_transform(X_train)
X_test = vect.fit_transform(X_test)
print("We have " + str(np.round(sum(y_test)/len(y_test),2)*100) + '% of red in our test')


We have 41.0% of red in our test


In [266]:

model = LogisticRegression(max_iter = 10000, solver = 'saga', penalty ='l1', fit_intercept=True ).fit(X_train,y_train)

# probas = model.predict_proba(X)


In [267]:
probas = model.predict(X_test)


In [268]:
print(probas.sum())
print(y_test.sum())
a = probas == y_test
# print(np.round(probas.sum()/y_test.sum(),2)*100)
a.mean()

39
20


0.3829787234042553

In [269]:
beta = model.coef_
word = vect.get_feature_names()
print(word[0])
print(beta[0][0])
est = beta[0].tolist()
est = est
estimate = pd.DataFrame(word, columns=['Word'])
estimate['Estimate'] = est
estiamte = estimate.sort_values('Estimate')

estiamte


børn
0.007555016380884186


Unnamed: 0,Word,Estimate
2,danmark,-0.060098
5,flere,-0.050866
19,år,-0.049056
10,helt,-0.023979
7,får,-0.02235
14,regeringen,-0.018621
16,tak,-0.013498
8,god,-0.010247
12,mere,-0.008854
3,dkgreen,-0.000475
