## GET TWITTER ACCOUNTS OF POLITICIANS


This notebook scrapes data about members of the Dutch Tweede Kamer (House of Representatives) from tweedekamer.nl, Wikipedia, and Wikidata. It includes functions to extract the current members of the Tweede Kamer and the members from specific periods (2012-2017, 2017-2021, 2021-2023, 2024), and to fetch additional data from Wikidata such as gender, date of birth, social media usernames, and more. The script uses the BeautifulSoup library to parse HTML content and the requests library to make HTTP requests.



In [395]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import pytz
from bs4 import BeautifulSoup
import requests
import requests
from bs4 import BeautifulSoup

def _snak_value(snak):
    """Return the value carried by a Wikidata snak, or None if it has none.

    Snaks whose type is "novalue" or "somevalue" have no "datavalue" key,
    so indexing it directly would raise KeyError.
    """
    return snak.get("datavalue", {}).get("value")


def _first_qualifier_value(qualifiers, property_id):
    """Return the value of the first qualifier snak for property_id, or None."""
    snaks = qualifiers.get(property_id)
    return _snak_value(snaks[0]) if snaks else None


def get_wiki_data(wikidata_id):
    """Fetch selected claims of one Wikidata entity.

    Only the first statement of each property is read. Keys are added to
    the result only when the corresponding value is present:

    - "gender": P21, mapped through map_gender
    - "birthday": P569, raw Wikidata time string
    - "twitter_username": P2002
    - "twitter_numeric_user_id": P6552 qualifier on the P2002 statement
    - "subscriber_count": P3744 qualifier on the P2002 statement (int)
    - "start_date": P580 qualifier on the P2002 statement, raw time string
    - "point_in_time": P585 qualifier on the P2002 statement, raw time string

    Args:
        wikidata_id: Entity ID such as "Q42".

    Returns:
        A dict with the keys above (possibly empty), or None when the
        request fails or the response is not valid JSON.
    """
    # URL for Wikidata API endpoint
    wikidata_api_url = "https://www.wikidata.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidata_id,
        "props": "claims",  # Retrieve claims (properties) for the entity
    }

    try:
        # A timeout keeps one stalled request from hanging the whole scrape.
        response = requests.get(url=wikidata_api_url, params=params, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print("Error fetching data from Wikidata API:", e)
        return None

    # An unknown or missing entity has no "claims"; treat it as "no data"
    # instead of raising KeyError.
    claims = data.get("entities", {}).get(wikidata_id, {}).get("claims", {})
    properties = {}

    # Extract gender
    gender_claims = claims.get("P21")
    if gender_claims:
        gender_value = _snak_value(gender_claims[0]["mainsnak"])
        if gender_value is not None:
            properties["gender"] = map_gender(gender_value["id"])

    # Extract date of birth (kept as the raw Wikidata time string)
    birth_claims = claims.get("P569")
    if birth_claims:
        birth_value = _snak_value(birth_claims[0]["mainsnak"])
        if birth_value is not None:
            properties["birthday"] = birth_value["time"]

    # Extract the Twitter username and the qualifiers attached to it
    twitter_claims = claims.get("P2002")
    if twitter_claims:
        statement = twitter_claims[0]
        qualifiers = statement.get("qualifiers", {})

        twitter_username = _snak_value(statement["mainsnak"])
        if twitter_username is not None:
            properties["twitter_username"] = twitter_username

        # The numeric user ID is its own property (P6552) stored as a
        # qualifier; the P2002 main value is the username, not the ID.
        twitter_id = _first_qualifier_value(qualifiers, "P6552")
        if twitter_id is not None:
            properties["twitter_numeric_user_id"] = twitter_id

        subscribers = _first_qualifier_value(qualifiers, "P3744")
        if subscribers is not None:
            properties["subscriber_count"] = int(subscribers["amount"])

        start = _first_qualifier_value(qualifiers, "P580")
        if start is not None:
            properties["start_date"] = start["time"]

        point_in_time = _first_qualifier_value(qualifiers, "P585")
        if point_in_time is not None:
            properties["point_in_time"] = point_in_time["time"]

    return properties

def map_gender(qid):
    """Translate a Wikidata gender item ID into a plain label.

    Returns "female" for Q6581072, "male" for Q6581097, and
    "other/unknown" for every other value.
    """
    known_labels = (
        ("Q6581072", "female"),
        ("Q6581097", "male"),
    )
    for gender_qid, label in known_labels:
        if qid == gender_qid:
            return label
    return "other/unknown"
        
def get_wikidata_id(name):
    """Search Wikidata for a name and return the ID of the first hit.

    The first search result is taken as-is; no check is made that it
    actually refers to the intended person.

    Args:
        name: Label to search for (English-language search).

    Returns:
        The entity ID of the first search result, or None when the name is
        empty, nothing is found, or the request fails.
    """
    if not name:
        return None

    # URL for Wikidata API endpoint
    wikidata_api_url = "https://www.wikidata.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "type": "item",
        "search": name,
    }

    try:
        # A timeout keeps one stalled request from hanging the whole scrape.
        response = requests.get(url=wikidata_api_url, params=params, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # Bind the exception so it can be reported; a bare `except:` left
        # `e` undefined and also swallowed KeyboardInterrupt.
        print("Error fetching wiki id:", e)
        return None

    # API error payloads carry no "search" key; treat that as "not found".
    hits = data.get("search")
    return hits[0]["id"] if hits else None
        

def scrape_tweedekamer(url):
    """Get all current Tweede Kamer members as listed on tweedekamer.nl.

    Each member card on the page is turned into one row holding name,
    party, location, age, seniority and image URL, enriched with the
    Wikidata ID found by get_wikidata_id and the properties returned by
    get_wiki_data for that ID.

    Args:
        url: URL of the members overview page.

    Returns:
        A pandas DataFrame with one row per member card.
    """
    response = requests.get(url, timeout=30)
    # Parse the HTML
    soup = BeautifulSoup(response.content, "html.parser")
    # Find all member cards
    member_cards = soup.find_all("div", class_="u-member-card-height")

    # List to store information for all members
    members_info = []

    for card in member_cards:
        name_tag = card.find("a", class_="u-text-size--large")
        if name_tag is None:
            # Without a name there is nothing to look up or report.
            continue
        name = name_tag.text.strip()

        party_tag = card.find("span", class_="u-text-size--small")
        party = party_tag.text.strip() if party_tag is not None else ""

        wikidata_id = get_wikidata_id(name)

        # Extract image URL if it exists
        image_tag = card.find("img", class_="m-avatar__image")
        image_url = image_tag.get("src") if image_tag is not None else None

        # Additional details live in a small table; it may be absent.
        details_table = card.find("table", class_="u-text-size--small")
        table_rows = details_table.find_all("tr") if details_table is not None else []

        location = ""
        age = ""
        seniority = ""

        for row in table_rows:
            header_cell = row.find("th")
            value_cell = row.find("td")
            if header_cell is None or value_cell is None:
                continue
            header = header_cell.text.strip()
            value = value_cell.text.strip()
            if "Woonplaats" in header:
                location = value
            elif "Leeftijd" in header:
                age = value
            elif "Anciënniteit" in header:
                seniority = value

        member_info = {
            "Name": name,
            "Party": party,
            "Location": location,
            "Age": age,
            "Seniority": seniority,
            "Image_URL": image_url,
            "Wikidata_ID": wikidata_id,
        }

        # Merge Wikidata properties only when an ID was found and the
        # lookup succeeded (get_wiki_data returns None on request errors).
        if wikidata_id:
            wiki_data = get_wiki_data(wikidata_id)
            if wiki_data:
                member_info.update(wiki_data)

        members_info.append(member_info)

    df = pd.DataFrame(members_info)
    return df

def scrape_politicians(url):
    """Scrape Tweede Kamer members from a Wikipedia list page.

    Used in this file for the 2012-2017 and 2021-2023 member lists. Every
    table row with at least four cells and a link is read as: name (text of
    the first link in the row), party (2nd cell), start date (3rd cell) and
    end date (4th cell). Each name is then looked up in Wikidata.

    Args:
        url: URL of the Wikipedia list page.

    Returns:
        A list of dicts with the keys 'Name', 'Party', 'Start Date',
        'End Date', 'Wikidata_ID' (None when no match was found) plus
        whatever get_wiki_data returned for that ID.
    """
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # Initialize a list to store the scraped data
    scraped_data = []

    for row in soup.find_all('tr'):
        cells = row.find_all('td')

        # Only rows with the expected number of cells describe a member.
        if len(cells) < 4:
            continue

        link = row.find('a')
        if link is None:
            continue  # Skip this row if the 'a' tag is not found

        name = link.text.strip()
        party = cells[1].text.strip()
        start_date = cells[2].text.strip()
        end_date = cells[3].text.strip()

        # Start from an empty result for every row so a failed lookup can
        # never reuse the previous member's Wikidata properties.
        wiki_id = get_wikidata_id(name)
        wikidata = {}
        if wiki_id is not None:
            # get_wiki_data returns None on request errors.
            wikidata = get_wiki_data(wiki_id) or {}

        scraped_data.append({
            'Name': name,
            'Party': party,
            'Start Date': start_date,
            'End Date': end_date,
            'Wikidata_ID': wiki_id,
            **wikidata,
        })

    return scraped_data


def scrape_20172021(url):
    """Scrape Tweede Kamer members for the period 2017-2021 from Wikipedia.

    For every heading span with class 'mw-headline', the next sortable
    wikitable is read and each of its data rows becomes one record: name
    (first link in the 1st cell), start date (2nd cell) and end date
    (3rd cell), with the heading text as the party.

    Args:
        url: URL of the Wikipedia list page.

    Returns:
        A list of dicts with the keys 'Name', 'Party', 'Start Date',
        'End Date', 'Wikidata_ID' (None when no match was found) plus
        whatever get_wiki_data returned for that ID.
    """
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    scraped_data = []
    # Find all sections containing party information
    # NOTE(review): this relies on headings being rendered as
    # <span class="mw-headline">; confirm against the current page markup.
    party_sections = soup.find_all('span', class_='mw-headline')

    for party_section in party_sections:
        party_name = party_section.text.strip()

        # Replace "Samenstelling" with "VVD"
        if party_name == "Samenstelling":
            party_name = "VVD"

        # NOTE(review): find_next returns the next table anywhere after the
        # heading, so a heading without its own table picks up the table of
        # a later section — verify this does not produce duplicate rows.
        table = party_section.find_next('table', class_='wikitable sortable')
        if not table:
            continue

        for row in table.find_all('tr'):
            columns = row.find_all('td')
            if not columns:
                continue  # header rows have no <td> cells

            name_link = columns[0].find('a')
            current_name = name_link.text.strip() if name_link else "N/A"

            # Check if the column index exists before accessing it
            start_date = columns[1].text.strip() if len(columns) > 1 else "N/A"
            end_date = columns[2].text.strip() if len(columns) > 2 else "N/A"

            # Only look up real names: searching for the "N/A" placeholder
            # would attach an unrelated entity. Start from an empty result
            # for every row so data never carries over from the previous one.
            wiki_id = get_wikidata_id(current_name) if name_link else None
            wikidata = {}
            if wiki_id is not None:
                # get_wiki_data returns None on request errors.
                wikidata = get_wiki_data(wiki_id) or {}

            scraped_data.append({
                'Name': current_name,
                'Party': party_name,
                'Start Date': start_date,
                'End Date': end_date,
                'Wikidata_ID': wiki_id,
                **wikidata,
            })

    return scraped_data


## Call functions and create dataframes

In [396]:
## 2012 -- 2017

wiki_url_2012 = 'https://nl.wikipedia.org/wiki/Lijst_van_Tweede_Kamerleden_2012-2017'
tweede_kamerleden2012_2017 = pd.DataFrame(scrape_politicians(wiki_url_2012))
tweede_kamerleden2012_2017['period'] = "2012_2017"

## 2017 -- 2021

wiki_url_2017 = 'https://nl.wikipedia.org/wiki/Lijst_van_Tweede_Kamerleden_2017-2021'
tweede_kamerleden2017_2021 = pd.DataFrame(scrape_20172021(wiki_url_2017))
tweede_kamerleden2017_2021['period'] = "2017_2021"

## 2021 -- 2023

wiki_url = "https://nl.wikipedia.org/w/index.php?title=Lijst_van_Tweede_Kamerleden_2021-2023"
tweede_kamerleden2021_2023 = pd.DataFrame(scrape_politicians(wiki_url))
tweede_kamerleden2021_2023['period'] = "2021_2023"

## 2024 (current)

tk_url = "https://www.tweedekamer.nl/kamerleden_en_commissies/alle_kamerleden"
tweede_kamerledencurrent = scrape_tweedekamer(tk_url)
tweede_kamerledencurrent['period'] = '2024'

# Stack all periods into one table; a member serving in several periods
# appears once per period.
combined_data = pd.concat(
    [
        tweede_kamerleden2012_2017,
        tweede_kamerleden2017_2021,
        tweede_kamerleden2021_2023,
        tweede_kamerledencurrent,
    ],
    axis=0,
)
combined_data.to_csv('../data/tweedekamerleden.csv')

# Keep only the rows that have a Twitter username
non_null_twitter_usernames = combined_data[combined_data['twitter_username'].notna()]

print(len(non_null_twitter_usernames))
# Extract unique Twitter usernames; sorted so the output file is identical
# from run to run (plain set iteration order is not stable for strings).
unique_usernames = sorted(set(non_null_twitter_usernames['twitter_username']))

print(len(unique_usernames))
# Define the path for the output text file
output_file_path = "../data/MPs_twitter_usernames.txt"

# Write one username per line; explicit encoding avoids depending on the
# platform default.
with open(output_file_path, "w", encoding="utf-8") as file:
    file.writelines(username + "\n" for username in unique_usernames)

print("Unique Twitter usernames have been written to:", output_file_path)