# Data Collection Notebook


First, I loaded ```2021_politicians_data.csv```, a section of the Twitter Parliamentarian Database (van Vliet et al., 2020), filtered it to UK MPs, and removed unnecessary columns.

In [8]:
import pandas as pd

# Columns of the raw database that are discarded before any further processing.
unused_columns = [
    'pol_group_id', 'name_link', 'party_pol_group_id',
    'uid', 'mp_party_id', 'legislative_period_id',
    'country_id', 'date_of_inactivity', 'scraper_url',
    'function', 'region', 'country', 'member_id', 'party_id', 'chamber',
]

df = pd.read_csv("data/mp_data/2021_politicians_data.csv")

# Keep only the United Kingdom rows, then strip the unused columns
# (including 'country' itself, which is constant after the filter).
is_uk = df['country'] == 'United Kingdom'
df_uk = df.loc[is_uk].drop(columns=unused_columns)
df_uk.head()

Unnamed: 0,name,party,constituency
540,diane abbott,Labour,Hackney North and Stoke Newington
541,debbie abrahams,Labour,Oldham East and Saddleworth
542,nigel adams,Conservative,Selby and Ainsty
543,bim afolami,Conservative,Hitchin and Harpenden
544,adam afriyie,Conservative,Windsor


Next, I mapped MP names to Twitter usernames using a CSV file I created, which contains MP names, constituencies and usernames.

In [10]:
# Load the hand-built name -> username lookup table. Names are lower-cased so
# they can be joined against the lower-case `name` column of `df_uk`; the
# constituency column is dropped and the handle column is renamed to 'Username'.
username_name_df = pd.read_csv("data/mp_data/name_username_constituency.csv")
username_name_df = (
    username_name_df
    .assign(Name=username_name_df['Name'].map(lambda full_name: str(full_name).lower()))
    .drop(columns=['Constituency'])
    .rename(columns={'Screen_Name': 'Username'})
)
username_name_df.head()

Unnamed: 0,Name,Username
0,aaron bell,@AaronBell4NUL
1,abena oppong-asare,@abenaopp
2,adam afriyie,@AdamAfriyie
3,afzal khan,@Afzal4Gorton
4,alan brown,@AlanBrownSNP


In [11]:
# Join MPs to their usernames on the (lower-cased) name. `merge` defaults to an
# inner join, so MPs without a matching name in the lookup table are dropped.
# The duplicate 'Name' key column from the lookup table is removed afterwards.
merged_df = (
    df_uk
    .merge(username_name_df, left_on='name', right_on='Name')
    .drop(columns=['Name'])
)

merged_df.head()

Unnamed: 0,name,party,constituency,Username
0,diane abbott,Labour,Hackney North and Stoke Newington,@HackneyAbbott
1,debbie abrahams,Labour,Oldham East and Saddleworth,@Debbie_abrahams
2,nigel adams,Conservative,Selby and Ainsty,@nadams
3,bim afolami,Conservative,Hitchin and Harpenden,@BimAfolami
4,adam afriyie,Conservative,Windsor,@AdamAfriyie


I then used the `Nitter` scraper from the `ntscraper` library to collect as many tweets as possible from each MP (requested by passing ```number=-1```).

In [None]:
from ntscraper import Nitter

scraper = Nitter()
def scrape_tweets(username, client=None):
    """Fetch as many tweets as possible for a single Twitter handle.

    Parameters
    ----------
    username : str
        Twitter handle, with or without a leading ``@``.
    client : object, optional
        Object exposing ``get_tweets(handle, mode=..., number=...)``.
        Defaults to the notebook-level ``scraper``.

    Returns
    -------
    object or None
        Whatever ``get_tweets`` returns, or ``None`` when ``username`` is not a
        usable string or the scraper raises (e.g. user not found, rate limit).
    """
    import logging

    # Missing usernames arrive as NaN/None from pandas; there is nothing to scrape.
    if not isinstance(username, str):
        return None

    # Only strip a leading '@' if it is actually there; slicing with [1:]
    # would silently cut the first letter off handles stored without it.
    handle = username.strip().lstrip('@')
    if not handle:
        return None

    if client is None:
        client = scraper

    try:
        # number=-1 asks the scraper for every tweet it can retrieve.
        return client.get_tweets(handle, mode='user', number=-1)
    except Exception as error:
        # Best-effort scraping: record the failure instead of hiding it, and
        # return None so the row can be retried later.
        logging.getLogger(__name__).warning("Scraping failed for %s: %s", username, error)
        return None
    
# Scrape every MP's timeline (one scraper call per row) and attach the raw
# results as a new 'tweets' column.
scraped_tweets = merged_df['Username'].apply(scrape_tweets)
merged_df = merged_df.assign(tweets=scraped_tweets)

# Checkpoint to disk; the retry cell below reloads this file.
merged_df.to_csv('scraped_dataset.csv', index=False)

Sometimes the scraper failed to find the username of an MP. To overcome this limitation, I iteratively ran the scraper over rows where data was not collected, until iterations no longer resulted in fewer gaps in the data.

In [None]:
# String form of an empty scraper result once it has been round-tripped through CSV.
EMPTY_RESULT_REPR = "{'tweets': [], 'threads': []}"


def _scrape_failed(result):
    """Return True when a stored scrape result contains no tweets.

    Handles the three shapes a cell can take: a dict freshly returned by the
    scraper, a string read back from the CSV, or a missing value (None/NaN).
    """
    if isinstance(result, dict):
        return not result.get('tweets')
    if isinstance(result, str):
        return result == EMPTY_RESULT_REPR
    return result is None or (isinstance(result, float) and pd.isna(result))


merged_df = pd.read_csv('scraped_dataset.csv')

# The column mixes strings loaded from disk with dicts written by retries,
# so make sure it can hold arbitrary Python objects.
merged_df['tweets'] = merged_df['tweets'].astype(object)

previous_failures = None  # no retry pass has run yet
while True:
    # Recompute the mask on every pass; otherwise the failure count can never
    # change and the loop stops after a single retry.
    failed_scraping = merged_df['tweets'].apply(_scrape_failed)
    n_failures = int(failed_scraping.sum())

    # Stop when nothing is left to retry or the last pass recovered nothing new.
    if n_failures == 0 or n_failures == previous_failures:
        break
    previous_failures = n_failures

    # Retry only the failed rows and write the results straight back with .loc
    # (assigning into a filtered copy triggers SettingWithCopyWarning).
    retried = merged_df.loc[failed_scraping, 'Username'].apply(scrape_tweets)
    merged_df.loc[failed_scraping, 'tweets'] = retried

    # These columns are normally already removed when the data is first loaded;
    # errors='ignore' keeps the clean-up from raising KeyError in that case.
    merged_df = merged_df.drop(columns=['member_id', 'party_id', 'chamber'], errors='ignore')

    # Checkpoint after every pass so progress survives an interrupted run.
    merged_df.to_csv('data/scraper_results/scraped_dataset.csv', index=False)

Next, I separated individual tweets into separate rows.

In [None]:

import ast


def _extract_tweets(raw_result):
    """Return the list of tweet dicts stored in one scraper result.

    ``raw_result`` may be a dict (fresh from the scraper), the string form of
    that dict (after a CSV round trip), or a missing value. Anything that
    cannot be interpreted as a result dict yields an empty list.
    """
    if isinstance(raw_result, str):
        try:
            # CSV stores the dict as its repr; dict("...") would raise, so the
            # literal has to be parsed back into Python objects.
            raw_result = ast.literal_eval(raw_result)
        except (ValueError, SyntaxError):
            return []
    if not isinstance(raw_result, dict):
        return []
    return raw_result.get('tweets') or []


columns = list(merged_df.columns) + ['text', 'date', 'is_retweet', 'stats']
rows_list = []

# Build one output row per tweet, repeating the MP's details on each row.
for index, row in merged_df.iterrows():
    mp_info = row.to_dict()

    for tweet in _extract_tweets(row['tweets']):
        rows_list.append({
            **mp_info,
            'text': tweet['text'],
            'date': tweet['date'],
            'is_retweet': tweet['is-retweet'],
            'stats': tweet['stats'],
        })

# Construct the frame once from the collected records, then drop the raw
# per-MP scraper payload, which is no longer needed.
politician_tweet_df = pd.DataFrame(rows_list, columns=columns)
politician_tweet_df = politician_tweet_df.drop(columns=["tweets"])

politician_tweet_df.head()

I processed each tweet to remove unwanted text like hashtags, hyperlinks and emojis, dropped retweets from the dataframe, and removed rows with missing values.

In [None]:
from supplementary_code import process_df_text

politician_tweet_df = process_df_text(politician_tweet_df, 'text')  # Remove hashtags, hyperlinks, emojis.
politician_tweet_df = politician_tweet_df[politician_tweet_df['is_retweet'] == False]  # Filter retweets.
politician_tweet_df = politician_tweet_df.dropna()  # Remove missing values.

# reset_index returns a new frame, so the result must be assigned back.
# drop=True discards the old, now gappy, labels instead of adding them as a column.
politician_tweet_df = politician_tweet_df.reset_index(drop=True)

print(len(politician_tweet_df))
politician_tweet_df.head()

Finally, I passed each tweet through each model to obtain emotion intensity scores.

In [None]:
# Took about 200 minutes to run on an M2 Macbook Air. On a CPU it will likely take significantly longer.

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
import tqdm

from transformers import AdamW, get_linear_schedule_with_warmup

# Single-output BERT head: one regression-style score per input text.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Pick the best available backend instead of hard-coding Apple's "mps":
# Apple-silicon GPU first, then CUDA, then the CPU as a fallback.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Inputs are moved to `device` at prediction time, so the model must be there too.
model.to(device)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict_emotion_intensity(text, model, tokenizer, device):
    """Score ``text`` with ``model`` and return the result as a NumPy array.

    The text is tokenized (padded, truncated to at most 240 tokens), the
    encoded tensors are moved to ``device``, and the model is run in
    evaluation mode without gradient tracking. The trailing axis of the logits
    is squeezed away before the values are copied to the CPU.
    """
    model.eval()  # evaluation mode for inference

    with torch.no_grad():
        encoded = tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=240,
            return_tensors="pt",
        )
        # Every encoded tensor has to live on the same device as the model.
        on_device = {}
        for name, tensor in encoded.items():
            on_device[name] = tensor.to(device)

        logits = model(**on_device).logits
        scores = logits.squeeze(-1).cpu().numpy()

    return scores

emotions = ['anger', 'fear', 'sadness', 'joy']
# Checkpoint epoch to load for each emotion, in the same order as `emotions`.
optimum_model_number = ['2', '2', '3', '3']

# predict_emotion_intensity moves its inputs to `device`; the model has to be
# on the same device or the forward pass fails.
model.to(device)

# `tqdm` is imported as a module, so the progress bar class is `tqdm.tqdm`.
# zip() has no length, hence the explicit total.
for emotion, number in tqdm.tqdm(zip(emotions, optimum_model_number), total=len(emotions)):
    # map_location lets checkpoints saved on another device load on this one.
    state = torch.load(f'models/emotion_models/{emotion}/model_epoch_{number}.pt', map_location=device)
    model.load_state_dict(state)

    # One forward pass per tweet; each call returns a one-element array, of
    # which the single score is kept.
    politician_tweet_df[emotion + '_intensity'] = politician_tweet_df['text'].apply(
        lambda x: predict_emotion_intensity(x, model, tokenizer, device)[0]
    )

I saved the resulting dataset to ```cleaned_intensity_dataset.csv```.

In [None]:
# Write the cleaned tweets and their intensity columns to disk, omitting the
# row index.
output_path = "cleaned_intensity_dataset.csv"
politician_tweet_df.to_csv(output_path, index=False)