In [2]:
#| echo: false
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests
import time
import us

## Location Data Scraping

In [3]:
# Browser-like User-Agent sent with every request made through get_soup.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/118.0"
}

def get_soup(link, headers = HEADERS, timeout = 120):
    """
        Downloads a web page and parses it into a BeautifulSoup tree
        Arguments
        ---------
        link: str(website link)
        headers: dict {} of HTTP request headers (defaults to HEADERS)
        timeout: int, passed to requests.get as its timeout
        
        Returns
        -------
        soup: BeautifulSoup object built with the "html.parser" parser
        
        Raises
        ------
        requests.HTTPError if the server answers with a 4xx/5xx status
    """
    response = requests.get(link, headers = headers, timeout = timeout)
    # Fail loudly on an error status. Otherwise an error page would be parsed
    # like the real page and callers that slice its anchors by position would
    # silently pick up unrelated links.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")

In [11]:
# One entry per chain: the menuism locations page plus the [start:stop] window
# into that page's <a> tags that is kept for later parsing.
# NOTE(review): the offsets are tied to the page layout at scrape time --
# presumably they bracket the per-state links; re-check if the site changes.
chain_pages = {
    "starbucks": ("https://www.menuism.com/restaurant-locations/starbucks-coffee-39564", 32, 83),
    "dunkin": ("https://www.menuism.com/restaurant-locations/dunkin-donuts-181624", 31, 76),
    "peets": ("https://www.menuism.com/restaurant-locations/peets-coffee-tea-84051", 27, 36),
    "timh": ("https://www.menuism.com/restaurant-locations/tim-hortons-190025", 27, 43),
    "panera": ("https://www.menuism.com/restaurant-locations/panera-bread-4258", 34, 80),
    "caribou": ("https://www.menuism.com/restaurant-locations/caribou-coffee-164861", 27, 47),
    "abp": ("https://www.menuism.com/restaurant-locations/au-bon-pain-69342", 28, 50),
    "tcbtl": ("https://www.menuism.com/restaurant-locations/the-coffee-bean-tea-leaf-165988", 27, 35),
    "mcd": ("https://www.menuism.com/restaurant-locations/mcdonalds-21019", 40, 91),
}

chain_links = {}
for position, (chain, (page_url, start, stop)) in enumerate(chain_pages.items()):
    if position > 0:
        # Pause one second between consecutive requests.
        time.sleep(1)
    soup = get_soup(page_url)
    chain_links[chain] = soup.find_all("a")[start:stop]

# Keep the per-chain names that the following cells refer to.
starbucks = chain_links["starbucks"]
dunkin = chain_links["dunkin"]
peets = chain_links["peets"]
timh = chain_links["timh"]
panera = chain_links["panera"]
caribou = chain_links["caribou"]
abp = chain_links["abp"]
tcbtl = chain_links["tcbtl"]
mcd = chain_links["mcd"]

In [8]:
def count_restaurants(restaurant_soup):
    """
        Builds a state -> location-count mapping from scraped link elements
        Arguments
        ---------
        restaurant_soup: iterable of soup elements; each element's text is
                         read as "<state name ...>(<count>)"
        
        Returns
        -------
        {state: num_restaurants}, where num_restaurants is the text found
        after the "(" (kept as a string, not converted to a number)
    """
    # First words that signal a two-word state name.
    two_word_leads = ("North", "South", "West", "Rhode", "New")

    counts_by_state = {}
    for link in restaurant_soup:
        # Drop every ")" and cut at "(": piece 0 is the name, piece 1 the count.
        pieces = link.get_text().replace(")", "").split("(")
        words = pieces[0].split(" ")
        lead = words[0]

        if lead == "District":
            state = "District of Columbia"
        elif lead in two_word_leads:
            state = lead + " " + words[1]
        else:
            # Every other name is taken to be its first word only.
            state = lead

        counts_by_state[state] = pieces[1]
    return counts_by_state

In [20]:
# Turn each chain's scraped links into a {state: count} dictionary.
# The two tuples are positional: the n-th name on the left receives the
# result for the n-th chain on the right.
(
    num_starbucks,
    num_dunkin,
    num_peets,
    num_timh,
    num_panera,
    num_caribou,
    num_abp,
    num_tcbtl,
    num_mcd,
) = (
    count_restaurants(anchors)
    for anchors in (starbucks, dunkin, peets, timh, panera, caribou, abp, tcbtl, mcd)
)

In [23]:
def fill_df(restaurant, count, df = None):
    """
        Fills values for supplied dataframe, for each state
        Arguments
        ---------
        restaurant: str(df column name)
        count: dict {state name: value to store}
        df: pd.DataFrame with a "State" column; when omitted, the notebook's
            global num_rest is used, looked up at call time
        
        Returns
        -------
        Nothing, function acts in-place
    """
    if df is None:
        # Resolve the default when the function is called, not when it is
        # defined: the defining cell can then run before num_rest exists,
        # and a later rebinding of num_rest is picked up.
        df = num_rest
    for state, value in count.items():
        # States absent from df["State"] match no row and are skipped.
        df.loc[df["State"] == state, restaurant] = value

In [24]:
def stateabb(state_names):
    """
        Converts full state names to their abbreviations
        Arguments
        ---------
        state_names: iterable of str(state name)
        
        Returns
        -------
        list of abbreviations, in the same order as state_names
    """
    # "District of Columbia" is mapped by hand to "DC"; every other name goes
    # through us.states.lookup and the .abbr of the result is used.
    return [
        "DC" if name == "District of Columbia" else us.states.lookup(name).abbr
        for name in state_names
    ]

In [22]:
# Result table: state name, state abbreviation, then one column per chain.
rest_columns = [
    "State",
    "State_Abbreviation",
    "Starbucks",
    "Dunkin",
    "Peets",
    "Tim_Hortons",
    "Panera",
    "Caribou",
    "Au_Bon_Pain",
    "Coffee_Bean_Tea_Leaf",
    "McDonalds",
]
num_rest = pd.DataFrame(columns = rest_columns)

# The rows are the states found on the McDonald's page.
num_rest["State"] = list(num_mcd)

In [25]:
# Write every chain's per-state counts into its column of the table.
for column_name, state_counts in (
    ("Starbucks", num_starbucks),
    ("Dunkin", num_dunkin),
    ("Peets", num_peets),
    ("Tim_Hortons", num_timh),
    ("Panera", num_panera),
    ("Caribou", num_caribou),
    ("Au_Bon_Pain", num_abp),
    ("Coffee_Bean_Tea_Leaf", num_tcbtl),
    ("McDonalds", num_mcd),
):
    fill_df(column_name, state_counts)


24
73
33
279
2362
371
107
72
20
616
248
72
49
57
455
193
65
69
76
208
212
22
196
140
118
23
17
181
15
30
22
154
56
188
492
266
62
279
279
21
65
14
126
720
59
300
6
634
119
13
16
1
11
74
46
5
406
15
57
654
147
1
14
579
51
12
34
5
1101
201
102
91
6
21
3
63
9
185
477
7
26
1022
116
6
1
402
142
15
55
93
7
147
34
2
30
5
163
3
1
3
1
8
1
3
14
10
1
5
3
5
27
191
7
1
2
100
105
9
26
1
7
20
13
29
216
32
41
4
9
227
71
27
7
154
53
27
30
1
77
69
5
108
52
91
4
52
2
15
12
98
5
6
138
170
28
12
112
9
17
2
42
110
98
6
16
47
14
9
8
25
11
81
1
5
9
30
312
2
26
7
5
46
1
6
4
19
16
8
21
23
3
32
8
1
67
9
1
6
5
3
6
10
1
58
10
28
6
16
10
19
175
1
2
22
1
23
5
33
279
190
326
1623
237
173
37
46
1142
563
74
181
71
791
406
188
283
291
306
402
72
662
305
396
165
58
475
29
90
72
335
106
169
811
843
220
209
603
45
258
36
409
1303
145
473
30
326
353
107
34


In [30]:
# Cells that no chain dictionary filled are still missing; treat them as
# zero locations.
num_rest = num_rest.fillna(value = 0)
# The scraped counts are strings while the zeros above are ints. Cast every
# chain column to int so the table holds one numeric type and supports
# arithmetic.
count_columns = [col for col in num_rest.columns if col not in ("State", "State_Abbreviation")]
num_rest = num_rest.astype({col: int for col in count_columns})
num_rest["State_Abbreviation"] = stateabb(num_mcd.keys())

In [31]:
# Display the assembled state-by-chain table.
num_rest

Unnamed: 0,State,State_Abbreviation,Starbucks,Dunkin,Peets,Tim_Hortons,Panera,Caribou,Au_Bon_Pain,Coffee_Bean_Tea_Leaf,McDonalds
0,Alaska,AK,24,0,0,0,0,0,0,0,33
1,Alabama,AL,73,1,0,0,20,0,0,0,279
2,Arkansas,AR,33,11,0,0,13,0,0,0,190
3,Arizona,AZ,279,74,0,0,29,0,0,19,326
4,California,CA,2362,46,163,0,216,0,0,175,1623
5,Colorado,CO,371,5,3,0,32,9,0,0,237
6,Connecticut,CT,107,406,0,10,41,0,8,0,173
7,District of Columbia,DC,72,15,0,0,4,8,21,0,37
8,Delaware,DE,20,57,0,1,9,0,0,0,46
9,Florida,FL,616,654,0,0,227,0,23,1,1142
