In [16]:
import pandas as pd
import datetime
import re


In [17]:
# Load the combined politician timeline export, then parse the
# `created_at` column into pandas datetimes so the `.dt` accessor works.
tweet_data = pd.read_csv(
    "../data/combined_DE_politician_twitter_timelines_2016-01-01_to_2023-02-11_clean.csv"
)
tweet_data["created_at"] = pd.to_datetime(tweet_data["created_at"])

In [18]:
# Restrict the tweets to the 19th legislative period.
# Both bounds are exclusive: tweets created on the begin date or on the
# end date themselves are NOT part of the selection.
legislative_period_19_begin = datetime.date(2017, 10, 24)
legislative_period_19_end = datetime.date(2021, 10, 26)

# Two chained row filters on the calendar date of `created_at`:
# first "after the begin date", then "before the end date".
filtered_tweet_data = (
    tweet_data
    .loc[lambda frame: frame["created_at"].dt.date > legislative_period_19_begin]
    .loc[lambda frame: frame["created_at"].dt.date < legislative_period_19_end]
)

# Summary of the selection, followed by a preview of the first rows.
filtered_tweet_data.info()
filtered_tweet_data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 708874 entries, 189 to 1549658
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   id             708874 non-null  int64              
 1   author_id      708874 non-null  int64              
 2   created_at     708874 non-null  datetime64[ns, UTC]
 3   expanded_urls  708874 non-null  object             
 4   retweeted      708874 non-null  bool               
 5   quoted         708874 non-null  bool               
 6   reply          708874 non-null  bool               
 7   text           708874 non-null  object             
 8   retweet_count  708874 non-null  int64              
 9   reply_count    708874 non-null  int64              
 10  like_count     708874 non-null  int64              
 11  quote_count    708874 non-null  int64              
dtypes: bool(3), datetime64[ns, UTC](1), int64(6), object(2)
memory usage: 56.1+ MB


Unnamed: 0,id,author_id,created_at,expanded_urls,retweeted,quoted,reply,text,retweet_count,reply_count,like_count,quote_count
189,1439668681282408451,1419751204465356805,2021-09-19 19:12:16+00:00,[],False,False,False,"Hey @ABaerbock, echt stark. Klar vorn. Klar üb...",1,1,9,0
190,1437507087462977538,1419751204465356805,2021-09-13 20:02:51+00:00,['https://twitter.com/FBsirske/status/14375070...,False,False,False,#Triell\nDa ist noch was drin! TB https://t.co...,0,0,3,0
191,1437151806669434887,1419751204465356805,2021-09-12 20:31:06+00:00,[],False,False,False,"Mit @ABaerbock weiter angreifen, Klimaschutz u...",1,1,6,0
192,1437142774860730371,1419751204465356805,2021-09-12 19:55:13+00:00,[],False,False,False,#Triell \nWer war gut? #Baerbock im plus bei G...,0,0,2,0
193,1437141923005874179,1419751204465356805,2021-09-12 19:51:49+00:00,[],False,False,False,"Eine Gesprächsführung, die #Baerbock und #Scho...",0,1,2,0


In [19]:

# Symbol-only Unicode blocks removed by clean_text. The ranges deliberately
# stop short of letter blocks (CJK, Hangul, Arabic presentation forms, ...)
# so that text in those scripts is not deleted together with the emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U000024C2-\U000027BF"  # enclosed alphanumerics (tail) .. dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0001F000-\U0001F251"  # game tiles/cards, enclosed alphanumeric/ideographic supplements
    "]+",
    flags=re.UNICODE,
)


def clean_text(text):
    """Normalise the raw text of a tweet for downstream text processing.

    Steps, in order:
      1. replace every @mention (letters, digits and underscores) with the
         literal token ``user``;
      2. drop ``#`` characters (the hashtag word itself is kept);
      3. drop a leading retweet marker ``RT``;
      4. remove http/https links;
      5. turn real newlines and literal backslash-n sequences into spaces;
      6. replace every non-word character with a space;
      7. remove remaining emoji-like symbols;
      8. collapse runs of whitespace into a single space.

    Args:
        text: the tweet text as a ``str``.

    Returns:
        The cleaned text. It is not stripped, so it may start or end with a
        single space.
    """
    # Replace @mentions with 'user'. Underscores are part of the handle and
    # must be consumed too, otherwise '@a_b' would leave 'user_b' behind.
    text = re.sub(r'@[A-Za-z0-9_]+', 'user', text)
    # Remove '#' hash tag marker
    text = re.sub(r'#', '', text)
    # Remove the retweet marker. It is anchored to the start of the text so
    # that upper-case words ending in "RT" (e.g. "SPORT ") are left intact.
    text = re.sub(r'^\s*RT\s+', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/\S+', '', text)
    # Replace real newline characters
    text = re.sub(r'\n', ' ', text)
    # Replace literal backslash + 'n' sequences (escaped newlines in the data)
    text = re.sub(r'\\n', ' ', text)
    # Replace all separators / punctuation. This also turns ordinary emoji
    # into spaces, because they are not word characters.
    text = re.sub(r'\W', ' ', text)
    # Remove emoji-like symbols that count as word characters and therefore
    # survived the previous step (e.g. circled-digit dingbats).
    text = _EMOJI_PATTERN.sub('', text)
    # Collapse every run of whitespace into one space
    text = re.sub(r'\s+', ' ', text)
    return text


print("Shape of data before cleaning " , filtered_tweet_data.shape)


# Clean the 'text' column and then drop every *word* of two characters or
# fewer from each tweet (the tweet itself is kept unless nothing remains).
# `.assign` builds a new frame instead of writing into a filtered slice of
# another frame, which avoids pandas' SettingWithCopyWarning.
filtered_tweet_data = filtered_tweet_data.assign(
    text=filtered_tweet_data['text']
    .apply(clean_text)
    .apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
)

# Remove tweets that are empty after cleaning. The split/join above never
# yields a whitespace-only string, so comparing against '' is sufficient.
filtered_tweet_data = filtered_tweet_data[filtered_tweet_data['text'] != '']

# Save the cleaned data (the original row index is written as the first column)
filtered_tweet_data.to_csv("../data/filtered_tweet_data.csv")

print("Shape of data after cleaning " , filtered_tweet_data.shape)

Shape of data before cleaning  (708874, 12)
Shape of data after cleaning  (700660, 12)
