# Scraping Texas Houe of Representative Website for Rep info

## Logic and Helper Function Definition

In [3]:
### Libraries Import ###
################################################################################
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import pandas as pd


### Helper Function definitions ###
################################################################################
## Capture district info
## Notes: ResultSets are converted to NavigableString in Unicode when printed
def capture_district_info(tag_object):
    """Capture and return the district info.
    
    Tag object is used as an input instead of the soup object because we are 
    iterating through each of the table data cell element of the html page.
    
    Args:
        - tag_object (Tag): Each of the BeautifulSoup Tag object
    Returns:
        - (NavigableString): The District of the representative
    """
    result_set = tag_object(text = True, recursive = False)
    district_num = re.search("\d+", result_set[0]).group(0)
    district_num_int = int(district_num)
    return district_num_int


## Capture Rep name
def capture_rep_name(tag_object):
    """Capture and return the representative's name from the tag object.
        
    Tag object is used as an input instead of the soup object because we are 
    iterating through each of the table data cell element of the html page.
    """
    result_set = tag_object.find_all("strong")
    return result_set[0].text

def capture_last_name(tag_object):
    """Capture and return the representative's last name from the tag object.
    
    Tag object is used as an input instead of the soup object because we are 
    iterating through each of the table data cell element of the html page.
    """
    full_rep_name = capture_rep_name(tag_object)
    
    ## Replace non-breakable space
    full_rep_name = bytes(full_rep_name, 'utf8')
    full_rep_name = re.sub(b"\\xc2\\xa0", b" ", full_rep_name)
    full_rep_name = full_rep_name.decode('utf8')
    
    result_list = full_rep_name.split(sep = " ")
    last_name = result_list[1].strip(", ")
    return last_name

def capture_first_name(tag_object):
    """Capture and return the representative's first name from the tag object.
        
    Tag object is used as an input instead of the soup object because we are 
    iterating through each of the table data cell element of the html page.
    """
    full_rep_name = capture_rep_name(tag_object)
    
    ## Replace non-breakable space
    full_rep_name = bytes(full_rep_name, 'utf8')
    full_rep_name = re.sub(b"\\xc2\\xa0", b" ", full_rep_name)
    full_rep_name = full_rep_name.decode('utf8')
    
    result_list = full_rep_name.split(sep = " ")
    first_name = result_list[2].strip(" ")
    return first_name

def capture_middle_name(tag_object):
    """Capture and return the representative's middle name from a tag object.
    
    Tag object is used as an input instead of the soup object because we are 
    iterating through each of the table data cell element of the html page.
    
    """
    full_rep_name = capture_rep_name(tag_object)
    
    ## Replace non-breakable space
    full_rep_name = bytes(full_rep_name, 'utf8')
    full_rep_name = re.sub(b"\\xc2\\xa0", b" ", full_rep_name)
    full_rep_name = full_rep_name.decode('utf8')
    
    result_list = full_rep_name.split(sep = " ")
    middle_name = " ".join(result_list[3:]).strip(" ")
    return middle_name


## Capture rep page sub-url
# print(first_rep.a["href"])
def house_rep_url(tag_object):
    """Return the house member homepage url.
    """
    baseUrl = "https://house.texas.gov/members/"
    subUrl = tag_object.a["href"]
    return urljoin(baseUrl, subUrl)



## Capture rep photo source
# print(first_rep.img["src"])
def house_rep_photo_url(tag_object):
    """Return the url of the house rep's photo.
    """
    baseUrl = "https://house.texas.gov/members/"
    subUrl = tag_object.img["src"]
    return urljoin(baseUrl, subUrl)


### Helper Function Definitions ###
################################################################################

def read_and_parse_url(url):
    """Read and parse url and return Beautiful Soup object.
    """
    
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    return soup

## Capture English bio
def capture_en_bio(soup):
    """Take soup object and return the English bio text.
    """
                         
    en_bio_result_set = (soup.find("div", {"id": "member-biography"})
                             .find(lambda tag: (tag.name=="div") & 
                                   (tag.get("class")==["bio_en"])))
    if (en_bio_result_set==None):
        return  ## Empty bio
    
    en_bio_result_set = en_bio_result_set(text=True, recursive=False)
    en_bio = "".join(en_bio_result_set).strip()
    return en_bio

## Capture Spanish bio
def capture_es_bio(soup):
    """Take soup object and reutrn the Spanish bio text."""
    
    es_bio_result_set = (soup.find("div", {"id": "member-biography"})
                             .find(lambda tag: (tag.name=="div") & 
                                               (tag.get("class")==["bio_es"])))
    if (es_bio_result_set==None):
        return  ## Empty bio
    
    es_bio_result_set = es_bio_result_set(text=True, recursive=False)
    es_bio = "".join(es_bio_result_set).strip()
    return es_bio

## Capture email button link
def capture_email_button_link(soup, url):
    """Take soup object and return email button link."""

    email_tag = soup.find(lambda tag: (tag.name == "h4") &
                                      (tag.get("class") == ["button-email"]))
    email_sub_url = email_tag.find("a").get("href")

    merged_email = urljoin(url, email_sub_url)
    return merged_email

## Capture Capitol Address, Phone, and Fax
def capture_capitol_office_info(soup):
    """Take soup object and output capitol office info as a dict."""

    temp = soup.find(lambda tag: (tag.name=="p") & 
                                 (tag.get("class")==["double-space"])).text

    temp = temp.splitlines()
    if len(temp) == 0:
        return
    capitol_info_dict = {"address_line_1": temp[0], 
                         "address_line_2": temp[2] if temp[2] else "",
                         "address_city": temp[4].split(" ")[0].strip(", ") if len(temp[4].split(" "))>=1 else "",
                         "address_state": temp[4].split(" ")[1].strip() if len(temp[4].split(" "))>=2 else "",
                         "address_zip": temp[4].split(" ")[2].strip() if len(temp[4].split(" "))>=3 else "",
                         "phone_1": re.sub(r"[() -]", "", temp[6]) if len(temp) >= 7 else "",
                         "fax_1": re.sub(r"[a-zA-Z() -]", "", temp[8]) if len(temp)>=9 else ""
                        }
    return capitol_info_dict


## Capture District Address, Phone, and Fax
def capture_district_office_info(soup):
    """Take soup object and return dict of district office info."""
    
    temp = soup.find(lambda tag: (tag.name=="p") & 
                                 (tag.get("class")==["double-space"])).find_next_sibling("p").text

    temp = temp.splitlines()

    if len(temp) < 7:  # Temp fix
        return
    
    district_info_dict = {"address_line_1": temp[0].split(",")[0].strip() if len(temp[0].split(","))>1 else temp[0], 
                          "address_line_2": temp[0].split(",")[1].strip() if len(temp[0].split(","))>1 else "",
                          "address_city": temp[2].split(" ")[0].strip(", ") if len(temp[2].split(" "))>=1 else "",
                          "address_state": temp[2].split(" ")[1].strip() if len(temp[2].split(" "))>=2 else "",
                          "address_zip": temp[2].split(" ")[2].strip() if len(temp[2].split(" "))>=3 else "",
                          "phone_1": re.sub(r"[() -]", "", temp[4]) if len(temp)>=5 else "",
                          "fax_1": re.sub(r"[a-zA-Z() -]", "", temp[6]) if len(temp)>=7 else ""
                         }
    return district_info_dict


## Capture Committee Affiliation
def capture_committee_affiliation(soup, url):
    """Take soup object and return dict of committee affiliation."""
    
    temp = soup.find(lambda tag: (tag.name=="h4") & 
                                 (tag.text=="Committees:")).find_next_sibling("ul").find_all("li")
    committee_dict = {}
    for i in temp:    
        committee_dict[i.text] = urljoin(url, i.a["href"])

    return committee_dict

## Testing Code

In [4]:
temp = capture_district_office_info(read_and_parse_url("https://house.texas.gov/members/member-page/?district=71"))
print(temp)

None


In [5]:
# ## Ignore: These are variables to help with testing
# rep_allen_home = "https://house.texas.gov/members/member-page/?district=131"

# response = requests.get(rep_allen_home)
# html = response.text
# soup = BeautifulSoup(html, 'html.parser')

# temp = soup.find_all(lambda tag: (tag.name == "div") &
#                           (tag.get("class") == ["member-info"]))

## Data Gathering

In [13]:
## Get all rep and their districts
texas_house_87_member = "https://house.texas.gov/members/"
html = requests.get(texas_house_87_member).text
soup = BeautifulSoup(html, 'html.parser')

df_tx_house_reps = pd.DataFrame()  # DataFrame placeholder

## Extract each rep's info
for i, row in enumerate(soup.find_all("td")):

    print(i)
    
    ## Break out the loop when the 'td' tag is empty
    if len(row(text = True)) == 0:
        break
        
    ## Capture each rep's info
    dataRow = pd.DataFrame({"District": capture_district_info(row),
                            "Rep_lastName": capture_last_name(row),
                            "Rep_firstName": capture_first_name(row),
                            "Rep_middleName": capture_middle_name(row), 
                            "Rep_homepage_url": house_rep_url(row),
                            "Rep_photo_url": house_rep_photo_url(row)
                           },
                          index = [0])
    # display(dataRow)
    # print(dataRow.loc[0, "Rep_homepage_url"])
    # print(dataRow.shape)

    ## Extract info from the rep's sub-homepage
    url = dataRow.loc[0, "Rep_homepage_url"]
    rep_subpage_soup = read_and_parse_url(url)
    dataRow["Rep_en_bio"] = capture_en_bio(rep_subpage_soup)
    dataRow["Rep_es_bio"] = capture_es_bio(rep_subpage_soup)
    dataRow["Rep_contact_form_link"] = capture_email_button_link(rep_subpage_soup, url)
    dataRow["Rep_capitol_office_info"] = str(capture_capitol_office_info(rep_subpage_soup))
    dataRow["Rep_capitol_office_address_1"] = str(capture_capitol_office_info(rep_subpage_soup)["address_line_1"])
    dataRow["Rep_capitol_office_address_2"] = str(capture_capitol_office_info(rep_subpage_soup)["address_line_2"])
    dataRow["Rep_capitol_office_city"] = str(capture_capitol_office_info(rep_subpage_soup)["address_city"])
    dataRow["Rep_capitol_office_state"] = str(capture_capitol_office_info(rep_subpage_soup)["address_state"])
    dataRow["Rep_capitol_office_zip"] = str(capture_capitol_office_info(rep_subpage_soup)["address_zip"])
    dataRow["Rep_capitol_office_phone"] = str(capture_capitol_office_info(rep_subpage_soup)["phone_1"])
    dataRow["Rep_capitol_office_fax"] = str(capture_capitol_office_info(rep_subpage_soup)["fax_1"]) 
    dataRow["Rep_district_office_info"] = str(capture_district_office_info(rep_subpage_soup))
    dataRow["Rep_committee_affiliation"] = str(capture_committee_affiliation(rep_subpage_soup, url))
    for i, (key, val) in enumerate(capture_committee_affiliation(rep_subpage_soup, url).items()):
        dataRow["Rep_committee_"+str(i+1)] = key
        dataRow["Rep_committee_"+str(i+1)+"_committee_url"] = val
    
    ## Append to existing dataframe
    df_tx_house_reps = pd.concat([df_tx_house_reps, dataRow], 
                                 ignore_index = True)
    
    ## Sort the dataframe
    df_tx_house_reps = df_tx_house_reps.sort_values(by = "District")

## Display dataframe
display(df_tx_house_reps)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150


Unnamed: 0,District,Rep_lastName,Rep_firstName,Rep_middleName,Rep_homepage_url,Rep_photo_url,Rep_en_bio,Rep_es_bio,Rep_contact_form_link,Rep_capitol_office_info,...,Rep_committee_4,Rep_committee_4_committee_url,Rep_committee_5,Rep_committee_5_committee_url,Rep_committee_6,Rep_committee_6_committee_url,Rep_committee_7,Rep_committee_7_committee_url,Rep_committee_8,Rep_committee_8_committee_url
0,1,VanDeaver,Gary,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/2540.jp...,"During the 87th Texas Legislature, Dr. Gary Va...",El Dr. Gary VanDeaver desempeña su primer perí...,https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room E1.304', 'address_lin...",...,,,,,,,,,,
1,2,Slaton,Bryan,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/4045.jp...,"Born in Mineola, Texas, Bryan Slaton is a prou...",,https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room E2.420', 'address_lin...",...,,,,,,,,,,
2,3,Bell,Cecil,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/2335.jp...,"Representative Cecil Bell, Jr. is a sixth gene...",Cecil Bell (h) es texano de sexta generación y...,https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room E2.708', 'address_lin...",...,International Relations & Economic Development,https://house.texas.gov/committees/committee/?...,,,,,,,,
3,4,Bell,Keith,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/3695.jp...,"A lifelong Texan, State Representative Keith B...","El representante estatal Keith Bell, texano de...",https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room E2.414', 'address_lin...",...,,,,,,,,,,
4,5,Hefner,Cole,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/3505.jp...,"A native East Texan, Cole Hefner lives in Moun...","Cole Hefner, nativo texano del Este, vive en M...",https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room E2.718', 'address_lin...",...,Homeland Security & Public Safety,https://house.texas.gov/committees/committee/?...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,146,Thierry,Shawn,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/3390.jp...,State Representative Shawn Thierry represents ...,La Representante del Estado Shawn Thierry repr...,https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room E1.408', 'address_lin...",...,,,,,,,,,,
145,147,Jones,Jolanda,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/4105.jp...,,,https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room 4N.10', 'address_line...",...,,,,,,,,,,
146,148,Morales,"Shaw,",Penny,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/4035.jp...,State Representative Penny Morales Shaw has de...,,https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room E1.416', 'address_lin...",...,,,,,,,,,,
147,149,Vo,Hubert,,https://house.texas.gov/members/member-page/?d...,https://house.texas.gov/photos/members/4900.jp...,State Representative Hubert Vo proudly represe...,El representante estatal Hubert Vo representa ...,https://house.texas.gov/members/member-page/em...,"{'address_line_1': 'Room 4N.8', 'address_line_...",...,,,,,,,,,,


In [14]:
## Save dataframe
from datetime import datetime
suffix = (datetime.now().strftime("%Y%M%d_%H%M%S"))

df_tx_house_reps.to_csv("./data/TX_House_of_Rep_List_"+suffix+".csv", 
                        index = False)