# Scrapping a Wikipedia page

### <a href=https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M>View page here</a>

In [5]:
import urllib.request

In [6]:
page = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [7]:
type(page)

http.client.HTTPResponse

**Import the BeautifulSoup library so we can parse HTML and XML documents**

In [8]:
from bs4 import BeautifulSoup

In [9]:
soup = BeautifulSoup(page, "lxml")

In [10]:
#print(soup.prettify())

In [142]:
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

**To see all the tables present in the page**

In [13]:
all_tables=soup.find_all("table")
#all_tables

**We can see that there is a class named wikitable sortable from where the table begins**

In [14]:
right_table=soup.find('table', class_='wikitable sortable')
#right_table

In [15]:
A=[]
B=[]
C=[]


for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        b = cells[1].find(text=True)
        if b!='Not assigned\n': #To ignore rows which have Borough as Not assigned
            A.append(cells[0].find(text=True))
            B.append(cells[1].find(text=True))
            C.append(cells[2].find(text=True))

In [16]:
import pandas as pd

df=pd.DataFrame(A,columns=['Postal Code'])

df['Borough']=B
df['Neighborhood']=C

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [17]:
df.shape

(103, 3)

In [18]:
df['Borough'].unique()

array(['North York\n', 'Downtown Toronto\n', 'Etobicoke\n',
       'Scarborough\n', 'East York\n', 'York\n', 'East Toronto\n',
       'West Toronto\n', 'Central Toronto\n', 'Mississauga\n'],
      dtype=object)

**Now importing data with latitude and longitude**

In [20]:
df1 = pd.read_csv('B:/Projects/Coursera_capstone/Geospatial_Coordinates.csv')

In [21]:
df = df.sort_values('Postal Code')
df1 = df1.sort_values('Postal Code')

In [22]:
df['Longitude'] = df1['Longitude']
df['Latitude'] = df1['Latitude']

In [25]:
df = df.sort_index()

In [26]:
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,-79.194353,43.806686
1,M4A,North York,Victoria Village,-79.160497,43.784535
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.188711,43.763573
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.216917,43.770992
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.239476,43.773136
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",-79.239476,43.744734
6,M1B,Scarborough,"Malvern, Rouge",-79.262029,43.727929
7,M3B,North York,Don Mills,-79.284577,43.711112
8,M4B,East York,"Parkview Hill, Woodbine Gardens",-79.239476,43.716316
9,M5B,Downtown Toronto,"Garden District, Ryerson",-79.264848,43.692657


**Including map of Toronto**

In [27]:
import folium

In [28]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto