## Sheet Overview
This notebook scrapes the geographic.org website for the street names in Cambridge, Massachusetts, reduces each name to its first word, stores the results in a list, and writes that list to a CSV file.

### Necessary Imports

In [3]:
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd

### Set the url for scraping

In [4]:
# geographic.org page that lists the streets of Cambridge (Middlesex County, MA).
scrape_url = 'https://geographic.org/streetview/usa/ma/middlesex/cambridge.html'
scrape_url

'https://geographic.org/streetview/usa/ma/middlesex/cambridge.html'

### Request street names from url and check status

In [5]:
# Download the street-list page.
# An explicit timeout (seconds) is passed because requests otherwise waits
# indefinitely, which would hang the notebook on a stalled connection.
res = requests.get(scrape_url, timeout=30)
# Display the HTTP status code; 200 means the page was fetched successfully.
res.status_code

200

### Use BeautifulSoup to parse the content of the Cambridge street names web page

In [6]:
# Parse the raw response body into a searchable tree using the lxml parser.
soup = BeautifulSoup(res.content, features='lxml')
soup

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="Cambridge, Middlesex, Massachusetts, United States, maps, List of Streets, Street View, Geographic.org" name="Description"/>
<meta content="Cambridge, Middlesex, Massachusetts, List of Streets, maps, United States, Street View, Geographic.org" name="keywords"/>
<title>List of Streets in Cambridge, Middlesex, Massachusetts, United States, Google Maps and Photos Streetview, United States, List of Streets, Google Street View, Geographic.org</title>
<!--Taboola Head Section-->
<script type="text/javascript">
        window._taboola = window._taboola || [];
        _taboola.push({article:'auto'});
        !function (e, f, u) {
            e.async = 1;
            e.src = u;
            f.parentNode.insertBefore(e, f);
        }(document.createElement('script'),
            document.getElementsByTagName('script')[0],
            '//cdn.taboola.com/libtrc/fotios-geographicorg/loader.js');
    </script>
<!--End Taboola Head S

### Isolate the street names table using soup.find

In [7]:
# `find` returns only the first matching tag; per the recorded output, the
# first <ul> on this page is the list of street links.
street_name_table = soup.find(name='ul')
street_name_table

<ul>
<li><a alt="1st Street" href="../../../view.php?place=1st Street, Cambridge, Middlesex, Massachusetts, 2141, United States">1st Street</a>   2141</li>
<li><a alt="1st Street" href="../../../view.php?place=1st Street, Cambridge, Middlesex, Massachusetts, 2142, United States">1st Street</a>   2142</li>
<li><a alt="2nd Street" href="../../../view.php?place=2nd Street, Cambridge, Middlesex, Massachusetts, 2141, United States">2nd Street</a>   2141</li>
<li><a alt="2nd Street" href="../../../view.php?place=2nd Street, Cambridge, Middlesex, Massachusetts, 2142, United States">2nd Street</a>   2142</li>
<li><a alt="3rd Street" href="../../../view.php?place=3rd Street, Cambridge, Middlesex, Massachusetts, 2141, United States">3rd Street</a>   2141</li>
<li><a alt="3rd Street" href="../../../view.php?place=3rd Street, Cambridge, Middlesex, Massachusetts, 2142, United States">3rd Street</a>   2142</li>
<li><a alt="4th Street Place" href="../../../view.php?place=4th Street Place, Cambridge, 

### Create a list of street names by extracting them individually from the street name table

In [8]:
# Collect the visible text of every <li> entry in the street list
# (each entry is a street name followed by a numeric code, e.g. '1st Street \xa0 2141').
cambridge_streets = [item.text for item in street_name_table.find_all('li')]

In [9]:
# Peek at the first ten scraped entries to check their format.
cambridge_streets[:10]

['1st Street \xa0 2141',
 '1st Street \xa0 2142',
 '2nd Street \xa0 2141',
 '2nd Street \xa0 2142',
 '3rd Street \xa0 2141',
 '3rd Street \xa0 2142',
 '4th Street Place \xa0 2141',
 '5th Street \xa0 2141',
 '5th Street \xa0 2142',
 '6th Street \xa0 2141']

### Clean the street names by stripping the unnecessary info and selecting only the first word of each street


In [10]:
# Break every scraped entry into its space-separated tokens.
# NOTE: this rebinds `cambridge_streets` from a list of strings to a list of
# token lists, so this cell is not safe to re-run on its own; re-run the
# extraction cell first.
cambridge_streets = [entry.split(" ") for entry in cambridge_streets]
cambridge_streets[:10]

[['1st', 'Street', '\xa0', '2141'],
 ['1st', 'Street', '\xa0', '2142'],
 ['2nd', 'Street', '\xa0', '2141'],
 ['2nd', 'Street', '\xa0', '2142'],
 ['3rd', 'Street', '\xa0', '2141'],
 ['3rd', 'Street', '\xa0', '2142'],
 ['4th', 'Street', 'Place', '\xa0', '2141'],
 ['5th', 'Street', '\xa0', '2141'],
 ['5th', 'Street', '\xa0', '2142'],
 ['6th', 'Street', '\xa0', '2141']]

In [11]:
# Reduce each entry to its leading token (e.g. '1st' from ['1st', 'Street', ...])
# and keep only those tokens that are at least 3 characters long.
cambridge_streets = [
    first_word
    for first_word in (tokens[0] for tokens in cambridge_streets)
    if len(first_word) >= 3
]
cambridge_streets[:10]

['1st', '1st', '2nd', '2nd', '3rd', '3rd', '4th', '5th', '5th', '6th']

In [12]:
# Persist the list with IPython's %store magic so it can be restored in
# another kernel/notebook via `%store -r cambridge_streets`.
%store cambridge_streets

Stored 'cambridge_streets' (list)


Above code adapted from Grant Wilson of San Francisco cohort

## Create a CSV file with the cleaned Cambridge street names (first word of each)

In [13]:
# Wrap the cleaned list in a single-column DataFrame so it can be exported.
df = pd.DataFrame(data=cambridge_streets)

In [14]:
# Write the names to disk; index=False leaves out the row-index column.
df.to_csv(path_or_buf='cambridge.csv', index=False)