In [1]:
import pandas as pd
import datetime
import re


In [2]:
tweet_data = pd.read_csv("../data/combined_DE_politician_twitter_timelines_2016-01-01_to_2023-02-11_clean.csv")
tweet_data.created_at = pd.to_datetime(tweet_data.created_at)


In [3]:
print("Number of tweets: ", len(tweet_data))

Number of tweets:  1559359


In [4]:
# get data of the 19th legislative period

legislative_period_19_begin = datetime.date(2017, 10, 24)
legislative_period_19_end = datetime.date(2021, 10, 26)


filtered_tweet_data = tweet_data[tweet_data.created_at.dt.date > legislative_period_19_begin]
filtered_tweet_data = filtered_tweet_data[filtered_tweet_data.created_at.dt.date < legislative_period_19_end]
filtered_tweet_data.info()
filtered_tweet_data.head()
# size
print("Number of tweets in the 19th legislative period: ", len(filtered_tweet_data))


<class 'pandas.core.frame.DataFrame'>
Index: 708874 entries, 189 to 1549658
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   id             708874 non-null  int64              
 1   author_id      708874 non-null  int64              
 2   created_at     708874 non-null  datetime64[ns, UTC]
 3   expanded_urls  708874 non-null  object             
 4   retweeted      708874 non-null  bool               
 5   quoted         708874 non-null  bool               
 6   reply          708874 non-null  bool               
 7   text           708874 non-null  object             
 8   retweet_count  708874 non-null  int64              
 9   reply_count    708874 non-null  int64              
 10  like_count     708874 non-null  int64              
 11  quote_count    708874 non-null  int64              
dtypes: bool(3), datetime64[ns, UTC](1), int64(6), object(2)
memory usage: 56.1+ MB
Number of

In [5]:

def clean_text(text):
    # Replace @mentions with 'user'
    text = re.sub(r'@[A-Za-z0-9]+', 'user', text) 
    # Remove '#' hash tag
    text = re.sub(r'#', '', text) 
    # Remove RT
    text = re.sub(r'RT[\s]+', '', text) 
    # Remove hyperlink
    text = re.sub(r'https?:\/\/\S+', '', text) 
    # Remove newline
    text = re.sub(r'\n', ' ', text) 
    # Remove newline
    text = re.sub(r'\\n', ' ', text) 
    # remove all sperators
    text = re.sub(r'\W', ' ', text)
    # remove all sequences of whitespace characters
    text = re.sub(r'\s+', ' ', text)

    # remove emojis
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF"  
                           u"\U0001F700-\U0001F77F"  
                           u"\U0001F780-\U0001F7FF"  
                           u"\U0001F800-\U0001F8FF"  
                           u"\U0001F900-\U0001F9FF"  
                           u"\U0001FA00-\U0001FA6F"  
                           u"\U0001FA70-\U0001FAFF"  
                           u"\U00002702-\U000027B0"  
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    # change multiple spaces to one space
    text = re.sub(r'\s+', ' ', text)

    # remmove whitespace \u200d (zero width joiner)
    text = re.sub(r'\u200d', '', text, flags=re.UNICODE)

    text = text.replace('\u200d', '')

    return text


print("Shape of data before cleaning " , filtered_tweet_data.shape)


# apply the function to the 'text' column
filtered_tweet_data['text'] = filtered_tweet_data['text'].apply(clean_text)


# remove all tweets with less than 3 words
filtered_tweet_data['num_words'] = filtered_tweet_data['text'].apply(lambda x: len(x.split()))
filtered_tweet_data = filtered_tweet_data[filtered_tweet_data['num_words'] >= 3]
# remove the column 'num_words'
filtered_tweet_data = filtered_tweet_data.drop('num_words', axis=1)



# save the cleaned data
filtered_tweet_data.to_csv("../data/filtered_tweet_data.csv")

print("Shape of data after cleaning " , filtered_tweet_data.shape)

Shape of data before cleaning  (708874, 12)
Shape of data after cleaning  (666567, 12)


In [6]:
# mean and sd of the number of characters in the tweets
filtered_tweet_data['num_characters'] = filtered_tweet_data['text'].apply(lambda x: len(x))
print("Mean number of characters in the tweets: ", filtered_tweet_data['num_characters'].mean())
print("Standard deviation of the number of characters in the tweets: ", filtered_tweet_data['num_characters'].std())

Mean number of characters in the tweets:  151.5876993610545
Standard deviation of the number of characters in the tweets:  83.81611770403673
