# Data Processing of tweets

In [1]:
#import relevant packages 

import pandas as pd
import numpy as np

from datetime import datetime #To check start and end time when running code
from tqdm import tqdm #This is for creating progress bars.
import logging #This is to provide logging of information when running the LDA
import sys #This is to disable logging when it's no longer needed
import pickle #To store and open previously saved machine learning models 

#import nlkt libaries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer # Porter is used below. This is an alternative, harsher stemmer. 
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

#language detection libaries
from langdetect import detect
import fasttext

#Importing packages for data visualization 
import matplotlib.pyplot as plt
import seaborn as sns

#import packages for regular expressions 
import regex
import re

#Importing NLTK and NLP packages
import nltk
from nltk.tokenize import TweetTokenizer
import string
from collections import defaultdict

In [2]:
#read in data - here we have scraped data in two iterations 
df_12 = pd.read_excel('tweets_12_actors_15maj.xlsx', index_col=0)
df_new = pd.read_excel('tweets_new_actors_18maj.xlsx', index_col=0)
df_new_12_new = pd.read_excel('tweets_12_actors_15-26maj.xlsx', index_col=0)
df_new_1 = pd.read_excel('tweets_new_actors_19-26maj.xlsx', index_col=0)
df_4 = pd.read_excel('tweets_4_new_actors_.xlsx', index_col=0)


In [3]:
#creating one data set 
data = pd.concat([df_12, df_new, df_new_12_new, 
                  df_new_1, df_4], join='inner', ignore_index=True)
data = data.reset_index(drop=True)

In [4]:
print("Total dataset:", data.shape)


Total dataset: (12176, 5)


In [5]:
# we found out that @Feministisk_DK is a political party
data = data[data.actor != '@Feministisk_DK']
print("Total dataset:", data.shape)
data.head()

Total dataset: (12139, 5)


Unnamed: 0,actor,tweet,date,retweet,date_convert
0,@PlanBornefonden,"13-årige Larissa bor i Sahel-regionen, og var ...",2021-05-14 09:03:00,,2021-05-14
1,@PlanBornefonden,Vi ønsker alle muslimer en god Eid i aften! Ei...,2021-05-12 14:00:02,,2021-05-12
2,@PlanBornefonden,Kom til samtalekøkken med @BosseStine og @Clau...,2021-05-12 11:58:03,RT @dorthe10:,2021-05-12
3,@PlanBornefonden,"Mali, Burkina Faso og Niger - også kendt som d...",2021-05-12 10:00:02,,2021-05-12
4,@PlanBornefonden,Vores seje kollega Iben Østergaard Markussen f...,2021-05-12 09:23:14,RT @dorthe10:,2021-05-12


## Extracting #hashtags, @mentions and emojis 

In [6]:
#make sure all tweets are strings
type(data.tweet[0])
data['tweet'] = data['tweet'].apply(str)

In [7]:
#Saving @ mentions in another column 
mentions = []
for index, s in data.tweet.iteritems():
    results = []
    if '@' in s:
        result = re.findall("(?<![@\w])@(\w{1,25})", s)
        results.append(', '.join(result))
    else:
        results = None
    mentions.append(results)

data['@mentions'] = mentions

In [8]:
# Saving list of #hashtags in another column before cleaning text
hashtags = []
for index, s in data.tweet.iteritems():
    results = []
    if '#' in s:
        result = re.findall("(?<![@\w])#(\w{1,25})", s)
        # make for loop saving the # 
        #hashtags.append(result)
        results.append(' '.join(result))
    else:
        results = ''
    hashtags.append(results)

data['#hashtags'] = hashtags



In [9]:
hashtags_1 = []
for row in hashtags:
    item = str(row)
    item = re.sub('\[', ' ', item) # only keeping letters
    item = re.sub('\]', ' ', item) # only keeping letters
    item = re.sub('\'', ' ', item) # only keeping letters
    item = re.sub(r'\s+', " ", item) #remove more whitespaces
    hashtags_1.append(item)
    
data['#hashtags'] = hashtags_1

In [10]:
# Saving list of emojis (🙄🤔') in another column before cleaning text
emojis = []

#We extract all emojis but also all characters (which we remove below)
for index, s in data.tweet.iteritems():
        result = re.findall(r'[^\w\s,]', s)
        emojis.append(', '.join(result))

data['emojis'] = emojis

In [11]:
#we need to remove all characters from 
cleaned = []

for characters in data['emojis']:
    #item = re.sub(r'@\S+', "", text) #removing @mentions
    item = regex.sub(r'\p{PUNCTUATION}', "", characters) #remove punctaion
    item = re.sub(r'\s+', " ", item) #remove more whitespaces
    cleaned.append(item)
    
data['emojis'] = cleaned

In [12]:
data.emojis[16]

' 👏 '

In [13]:
#inspecting new agregated dataset 
data.head(3)

Unnamed: 0,actor,tweet,date,retweet,date_convert,@mentions,#hashtags,emojis
0,@PlanBornefonden,"13-årige Larissa bor i Sahel-regionen, og var ...",2021-05-14 09:03:00,,2021-05-14,,,
1,@PlanBornefonden,Vi ønsker alle muslimer en god Eid i aften! Ei...,2021-05-12 14:00:02,,2021-05-12,,,
2,@PlanBornefonden,Kom til samtalekøkken med @BosseStine og @Clau...,2021-05-12 11:58:03,RT @dorthe10:,2021-05-12,"[BosseStine, ClausMeyerDK]",dkfood,


## Pre-processing of tweets and retweets

In [14]:
#Define remove emojis function 
RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)

def strip_emoji(text):
    return RE_EMOJI.sub(r'', text)


In [15]:
#Define another remove emojis function 
RE_EMOJI2 =  re.compile(u'['
    u'\U0001F300-\U0001F64F'
    u'\U0001F680-\U0001F6FF'
    u'\u2600-\u26FF\u2700-\u27BF]+', 
    re.UNICODE)

def strip_emoji_2(text):
    return RE_EMOJI2.sub(r'', text)


In [16]:
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [17]:
#remove @ from actor column
data['actor'] = data.actor.str.replace(r'@','')

In [18]:
#cleaning tweets using regex

cleaned = []

for text in data['tweet']:
    item = re.sub(r'@\S+', "", text) #removing @mentions
    item = re.sub(r'#(\w+)', "", item) #removing hashtags 
    item = re.sub(r'&amp;', "", item) #remove &
    
    #remove emojis and lower case 
    item = re.sub(r'1⃣', "", item)
    item = re.sub(r'2⃣', "", item)
    item = re.sub(r'3⃣', "", item)
    item = re.sub(r'4⃣', "", item)
    item = re.sub(r'2️⃣9️⃣', "", item)
    item = strip_emoji(item) #removing emojis 
    item = strip_emoji(item) #removing emojis 
    item = strip_emoji_2(item) #removing emojis 
    item = remove_emojis(item)
    item = re.sub(r"[┻┃━┳┓┏┛┗]","", item)
    item = re.sub(r"\u202F|\u2069|\u200d|\u2066","", item)
    
    
    item = re.sub(r'\d+', "", item) #remove digits first, otherwise we get eg. 13årige
    item = re.sub(r'(^[a-zA-Z]+$)', '', item) # only keeping letters 
    item = item.lower() #lower cases 
    
    item = regex.sub(r'\p{PUNCTUATION}', "", item) #remove punctaion
    item = re.sub(r'\s+', " ", item) #remove more whitespaces
    item = re.sub(r'http\S+', '', item) #remove urls
    item = item.rstrip()
  
    cleaned.append(item)
    
data['clean_text'] = cleaned

In [19]:
string_text = []
for index, row in data.iterrows():
    
    if row.actor == 'MissionEast':
        string = re.sub(r'mission øst', '', row.clean_text)
        string_text.append(string)
    else:
        string_text.append(row.clean_text)
    #item = re.sub(r'mission øst', '', words

data['clean_text'] = string_text


In [20]:
print(data.tweet[23])
data.clean_text[23]

Det er rystende at høre om den ulykkelige situation i Indien. Med bidrag til @IndianRedCross er Danmark bl.a. med til at støtte ambulanceservice, indkøb af beskyttelsesudstyr samt stoppe misinformation om sygdommen. Helt afgørende indsatser for at bekæmpe pandemien. #dkpol #dkaid


'det er rystende at høre om den ulykkelige situation i indien med bidrag til er danmark bla med til at støtte ambulanceservice indkøb af beskyttelsesudstyr samt stoppe misinformation om sygdommen helt afgørende indsatser for at bekæmpe pandemien'

* We are keeping names, as these might refer to important actors, which we want to explore

## Language Detector 

In [21]:
import fasttext
#PRETRAINED_MODEL_PATH 
fast = fasttext.load_model('/Users/Sofie/Desktop/tmp/lid.176.bin')



In [22]:
# Functions for language classification
 
def tweet_cleaner(x):
    '''
    Cleans a str object x by:
    replacing '\n' with ' '
    replacing '#' with ''
    removing urls
    '''
    x = x.replace('\n', ' ')
    x = x.replace('#', '')
    x = re.sub(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '', x)
    x = x.strip()
    return x
 
# fasttext function - language detector
def fast_detector(x):
    x = tweet_cleaner(x)
    try:
        return fast.predict(x)[0][0][-2:]
    except:
        pass


In [23]:
#Checking how the language detector works
print('Language:', fast_detector(data.tweet[117]))

Language: da


In [24]:
#adding language column 
data['language'] = data["tweet"].apply(lambda x: fast_detector(x))

In [25]:
#expecting language 

#most tweets with other lang than da or en are miscategorization due to emojis and links 
data.groupby(data.language).count()

Unnamed: 0_level_0,actor,tweet,date,retweet,date_convert,@mentions,#hashtags,emojis,clean_text
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
af,1,1,1,0,1,1,1,1,1
ar,2,2,2,2,2,0,2,2,2
az,1,1,1,0,1,1,1,1,1
bg,1,1,1,0,1,0,1,1,1
cs,3,3,3,0,3,3,3,3,3
da,9201,9201,9201,3330,9201,5781,9201,9201,9201
de,23,23,23,8,23,17,23,23,23
ds,1,1,1,1,1,0,1,1,1
en,2589,2589,2589,2044,2589,1534,2589,2589,2589
es,13,13,13,1,13,10,13,13,13


In [26]:
#inspection 
data.loc[data.language == 'ar']

Unnamed: 0,actor,tweet,date,retweet,date_convert,@mentions,#hashtags,emojis,clean_text,language
2579,ActionAidDK,شكرا لهولاء الناشطين المعتصمين أمام مبنى البر...,2021-03-17 15:04:40,RT @AAPalestine:,2021-03-17,,,,شكرا لهولاء الناشطين المعتصمين أمام مبنى البرل...,ar
2587,ActionAidDK,تم إعطاء لقاح فيروس كورونا للإسرائليين الذين ...,2021-03-16 09:53:27,RT @AAPalestine:,2021-03-16,,معا_للمطالبة_بلقاح_مجاني_,,تم إعطاء لقاح فيروس كورونا للإسرائليين الذين ي...,ar


In [27]:
#to english from de 
english_tweets_fail_detection = [2174, 2749]

data.language[english_tweets_fail_detection[0]] = 'en'
data.language[english_tweets_fail_detection[1]] = 'en'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.language[english_tweets_fail_detection[0]] = 'en'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.language[english_tweets_fail_detection[1]] = 'en'


In [28]:
# remove tweets if language is not Danish or English 

#where language detector is not consistent (ES, SE, DE)
tweets_remove_id = [2588, 8456, 8447, 8422, 7271, 4614, 3383, 8570, 8574, 11998] 
remove_language = ['ar', 'fr', 'ht', 'pt'] #no text in ht 

#We save these in objects now, so we can remove them after the retweet networks
#This is a methodological choice as the retweets are essential to understand the 
#relations betweens actors through retweets, but as the text can not be used for 
#PCA or other models relying on text data 


### Methodological choices regarding languages of twwets

* We sort out non-Danish tweet to increase the validity. In this way, we loose some data, but importantly, we are able to conclude on the findings setting them in relation to a national context, which we can investigate qualitatively. 
* The language detector is not good enough to detect Danish and Norwegian tweets from oneanother. Most tweets detected as Norwegians tweets are actually written in Danish, and the rest uses a lot of the same words. Furhter, are there some tweets in Norwegian, the words will proberbly not be used in the PCA anyways as they appear to seldom. Therefore, we keep all norwegian tweets.


## Stopwords and Lemmatizer

In [30]:
#Stopwords 

stop_words_dk = set(stopwords.words('danish')) 
stop_words_en = set(stopwords.words('english')) 

# we add more stop words 
danish_words = ['kan', 'så', 'få', 'se', 'ved', 'ser', 'hvordan', 'mere', 'nye', 'derfor', 
                'får', 'gøre ', 'går', 'bla', 'mest', 'gør', 'stor', 'del', 'nå', 'både', 
                'tæt', 'andre', 'bruge', 'dag', 'sige', 'vores', 'komme', 'siger', 'sagde',
               'ny', 'mellem', 'omkring', 'pga', 'fordi', 'gå', 'bare' , 'lidt', 'sætte', 
                'of', 'on', 'the']



print("length of orignial stopword list:", len(stop_words_dk))

for word in danish_words:
    stop_words_dk.add(word)

print("length of new stopword list:", len(stop_words_dk))

length of orignial stopword list: 94
length of new stopword list: 135


In [31]:
#remove stopwords

#TOKENNIZE WORDS
data["words"] = data["clean_text"].str.split()

#REMOVE ENGLISH STOPWORDS IF LANGUAGE IS ENGLISH, ELSE REMOVE DANISH STOPWORDS
data["without_stopwords"] = [[x for x in row.words if x not in 
                              (stop_words_en if row.language == 'en' else stop_words_dk)]
                             for row in data.iloc]

In [32]:
#lemmatize danish tweets 
import lemmy

# Create an instance of the standalone lemmatizer.
lemmatizer = lemmy.load("da")

In [33]:
# creating a column with lemmas  (Danish)
lemmas = []

for words in data['without_stopwords']:
    lemmas_sentence = []
    for word in words:
        lemma = lemmatizer.lemmatize("", word)
        lemmas_sentence.append(lemma[0])
    lemmas.append(lemmas_sentence)
        
        
data['lemmas'] = lemmas

In [34]:
#putting words together 
data['proc_text'] = data['lemmas'].apply(
                     lambda x: " ".join( x))

## Bigrams 

In [35]:
#We have the unigrams in the lemmas 

tqdm.pandas() #Creates a progress bar. Use progress_apply instead of apply.
#Defining a function that will create bigrams 
def bigrams(doc):
    
    bigrams = [] #Empty list to save the bigrams
    
    for bigram in list(nltk.bigrams(doc)):  #Creating bigrams and iterating over them
        bigrams.append("_".join(bigram))    #Connecting each bigram pair with an underscore and saving to list
    
    return bigrams

#Creating a column with bigrams
data['bigrams'] = data.lemmas.progress_apply(lambda x: bigrams(x))


100%|██████████| 12139/12139 [00:00<00:00, 43588.81it/s]


In [36]:
# creating final data string for the words we want to include in the analyses

data['proc_text_all'] = [row.proc_text + ' ' + ' '.join(row.bigrams) + row['#hashtags'] 
                            for row in data.iloc] #includes, tokens, bigrams and hashtags

## Pre-processesing retweets

In [37]:
#make all retweets to strings
data['retweet'] = data['retweet'].apply(str)

In [38]:
#removing RT from retweets
retweets = []
for retweet in data['retweet']:
    if retweet is 'nan':
        item = None
    else:
        item = re.sub(r'^RT ', '', retweet)
        item = re.sub(r': ', '', item)
    retweets.append(item)
    

data['retweet'] = retweets


  if retweet is 'nan':


### Making sub-dataset only for retweets network

In [39]:
#creating subset of data
data_retweets = data.loc[data['retweet'] != 'nan']
data_retweets = data_retweets.reset_index(drop=True)

print(data_retweets.shape)
data_retweets.head(3)

(5465, 16)


Unnamed: 0,actor,tweet,date,retweet,date_convert,@mentions,#hashtags,emojis,clean_text,language,words,without_stopwords,lemmas,proc_text,bigrams,proc_text_all
0,PlanBornefonden,Kom til samtalekøkken med @BosseStine og @Clau...,2021-05-12 11:58:03,@dorthe10,2021-05-12,"[BosseStine, ClausMeyerDK]",dkfood,,kom til samtalekøkken med og den maj og spis e...,da,"[kom, til, samtalekøkken, med, og, den, maj, o...","[kom, samtalekøkken, maj, spis, lækker, retter...","[komme, samtalekøkken, maj, spise, lækker, ret...",komme samtalekøkken maj spise lækker ret menu ...,"[komme_samtalekøkken, samtalekøkken_maj, maj_s...",komme samtalekøkken maj spise lækker ret menu ...
1,PlanBornefonden,Vores seje kollega Iben Østergaard Markussen f...,2021-05-12 09:23:14,@dorthe10,2021-05-12,"[radioloud_dk, MaternityF]",,,vores seje kollega iben østergaard markussen f...,da,"[vores, seje, kollega, iben, østergaard, marku...","[seje, kollega, iben, østergaard, markussen, f...","[sej, kollega, ibe, østergaard, markusse, fort...",sej kollega ibe østergaard markusse fortæller ...,"[sej_kollega, kollega_ibe, ibe_østergaard, øst...",sej kollega ibe østergaard markusse fortæller ...
2,PlanBornefonden,Godt at se @udviklingsmin og @JeppeKofod under...,2021-05-12 07:01:44,@anne_smith_p,2021-05-12,"[udviklingsmin, JeppeKofod, DanishMFA]",dkaid PURE,,godt at se og understrege at den danske indsat...,da,"[godt, at, se, og, understrege, at, den, dansk...","[godt, understrege, danske, indsats, sahel, fo...","[godt, understrege, dansk, indsats, sahele, fo...",godt understrege dansk indsats sahele fortsat ...,"[godt_understrege, understrege_dansk, dansk_in...",godt understrege dansk indsats sahele fortsat ...


In [40]:
#remove @
data_retweets['retweet'] = data_retweets.retweet.str.replace(r'@','')

data_retweets = data_retweets.drop(columns=['tweet', 'date', 'date_convert','#hashtags','clean_text','words','without_stopwords', 'proc_text', 'bigrams','lemmas', 'language', 'emojis', 'proc_text_all'])

In [41]:
data_retweets.head()

Unnamed: 0,actor,retweet,@mentions
0,PlanBornefonden,dorthe10,"[BosseStine, ClausMeyerDK]"
1,PlanBornefonden,dorthe10,"[radioloud_dk, MaternityF]"
2,PlanBornefonden,anne_smith_p,"[udviklingsmin, JeppeKofod, DanishMFA]"
3,PlanBornefonden,SeeRap,
4,PlanBornefonden,UNFPAEthiopia,[UNFPAEthiopia]


In [42]:
#saving to desktop 
data_retweets.to_csv('data_retweets.csv', index=False)

## Remove tweets with other languages for text analysis

* Remove **tweets_remove_id** = [2588, 8456, 8447, 8422, 7271, 4614, 3383, 8570, 8574] 
* Remove **remove_language** = ['ar', 'fr', 'ht', 'pt'] #no text in ht 

* Divide up Danish and English tweets 


In [43]:
print('Shape before:', data.shape)
data_clean = data.drop(tweets_remove_id)
print('New shape after dropping first time:', data_clean.shape)
data_clean = data_clean[data_clean.language != 'ar']
print('New shape after dropping second time:', data_clean.shape)
data_clean = data_clean[data_clean.language != 'fr']
print('New shape after dropping second time:', data_clean.shape)
data_clean = data_clean[data_clean.language != 'ht']
print('New shape after dropping second time:', data_clean.shape)
data_clean = data_clean[data_clean.language != 'pt']
print('New shape after dropping second time:', data_clean.shape)
data_clean = data_clean[data_clean.proc_text != '']
print('New shape after dropping empty columns:', data_clean.shape)
data_clean = data_clean.reset_index(drop=True)

Shape before: (12139, 16)
New shape after dropping first time: (12129, 16)
New shape after dropping second time: (12127, 16)
New shape after dropping second time: (12118, 16)
New shape after dropping second time: (12114, 16)
New shape after dropping second time: (12112, 16)
New shape after dropping empty columns: (11957, 16)


In [44]:
#making danish data set 
data_danish = data_clean[data_clean.language != 'en']
data_danish = data_danish.reset_index(drop=True)
data_danish.shape

(9449, 16)

In [45]:
#data_danish_sub = data_danish[['actor', 'proc_text', '#hashtags']]
data_danish.to_csv('data_danish.csv', index=False)

In [46]:
#explore text where words are processed 
for text in data_danish.proc_text:
    if 'læsse projekt' in text:
        print(text)

gange program officielt begynde uganda etiopien sammen konsortium fem ngo støtte sikre læring gennem leg barn flugte læsse projekt
true lokalsamfund sundhed fødevaresikkerhed støtte lokal produktion udstyre ansigtsmaske sæbe redde liv samtidig øge befolkning indkomst læsse projekt
inden bygge dæmning kalobeyeu næsten umulig gro afgrøde problem snarere plads grøntsag læsse projekt spil vigtig rolle kris
vide fremme platform læsse projekt


**Note on Lemmatizer:**

It changes the danish word

* *læse* to *læsse*
* *tage* to *tagge*