# Toronto Project

#### Downloading libraries

In [1]:
# Install the scraping dependencies into the kernel's conda environment.
# The commands go through subprocess so that a non-zero exit status is actually
# detected: a `!conda ...` shell escape never raises a Python exception, so
# wrapping it in try/except reported success even when the install failed.
import subprocess

print('Downloading needed packages...')
for package in ('beautifulsoup4', 'lxml'):  # web scraping library, and the HTML parser it uses
    result = subprocess.run(
        ['conda', 'install', '-c', 'conda-forge', package, '--yes'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    print(result.stdout)  # surface conda's log in the cell output
    if result.returncode != 0:
        print('Error')
        raise RuntimeError('conda install failed for {}'.format(package))
print('Success: All lib downloaded')

Downloading needed packages...
Solving environment: done

# All requested packages already installed.

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - progressbar2


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    pytest-runner-5.1          |             py_0           9 KB  conda-forge
    progressbar2-3.38.0        |             py_1          19 KB  conda-forge
    python-utils-2.3.0         |             py_1          11 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          40 KB

The following NEW packages will be INSTALLED:

    progressbar2:  3.38.0-py_1 conda-forge
    pytest-runner: 5.1-py_0    conda-forge
    python-utils:  2.3.0-py_1  conda-forge


Downloading and Extracting Packages
pytest-runner-5.1    | 9 K

#### Importing Libraries

In [106]:
# Import every library the notebook relies on in a single cell.
print("Importing libraries....")
try:
    from bs4 import BeautifulSoup as bts # web scraping
    import numpy as np # vectorized manner
    import pandas as pd # data analysis
    import requests # requests
    import lxml 
except ImportError:
    # Report the failure, then re-raise so the traceback names the missing
    # module and the notebook stops here instead of failing later with a
    # NameError in an unrelated cell.
    print("Error: Not all libraries were imported")
    raise
print("Success: libraries imported")

Importing libraries....
Success: libraries imported


#### Using beautifulsoup to scrape data

In [107]:
# Download the HTML of the Wikipedia page that lists the "M" postal codes.
print("Getting wiki scrape, source.text")
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Turn HTTP error statuses (4xx/5xx) into exceptions so an error page is
    # never parsed as if it were the article.
    response.raise_for_status()
except requests.RequestException:
    # Report and re-raise: continuing would leave `source` undefined.
    print("Error")
    raise
source = response.text
print("Success: wiki, source")


Getting wiki scrape, source.text
Success: wiki, source


#### Creating csv

In [328]:
# Open the CSV output file for writing; the handle `f` is used by later cells.
# The encoding is set explicitly so the scraped text is written as UTF-8 on
# every platform, which is the encoding pandas.read_csv assumes by default.
filename = "toronto_canada.csv"
f = open(filename, 'w', encoding='utf-8')

In [329]:
# Parse the downloaded HTML text with the lxml parser backend.
soup = bts(markup=source, features='lxml')

#### Using BeautifulSoup to find the main content div, then its first table, the table rows, and the header cells

In [330]:
# Locate the first table inside the article's content div, collect its rows,
# and build the comma-separated CSV header from the <th> cells of the first row.
print("Cleaning...")
article = soup.find('div', 'mw-content-ltr')
# Fail with a clear message when the page layout is not what we expect,
# instead of swallowing the error and leaving the names below undefined.
if article is None or article.table is None or article.table.tbody is None:
    print("Error")
    raise ValueError("Could not find a table inside the 'mw-content-ltr' div")
table = article.table
rows = table.tbody.find_all('tr')
if not rows:
    print("Error")
    raise ValueError("The table contains no rows")
column_names = rows[0].find_all('th')
# Join the header texts with commas; no separator is added after the last one.
headers = ",".join(column_name.text for column_name in column_names)
print("Cleaning: successful")


Cleaning...
Cleaning: successful


#### Validating column names

In [331]:
# Print the <th> tags collected from the first table row to check the column names.
print(column_names)

[<th>Postcode</th>, <th>Borough</th>, <th>Neighbourhood
</th>]


#### Writing headers

In [336]:
# Write the comma-joined header string to the open CSV file handle.
# No newline is appended here; the validation output shows the last header
# cell's text already ends with a line break, which terminates the header line.
f.write(headers)

31

#### Table data

In [333]:
# Write one comma-separated line per table row, then close the file.
for row in rows:
    cells = row.find_all('td')
    # A row without <td> cells (such as the header row) produces an empty
    # string, so nothing is written for it.
    f.write(",".join(cell.text for cell in cells))

# Close the handle so all buffered data reaches the disk before the file is
# read back; without this the CSV may be incomplete when it is loaded.
# Re-running this cell requires re-opening the file first.
f.close()

#### Reading the data back from the CSV file with pandas (already imported at the top of the notebook)

In [365]:
# Load the scraped table back from the CSV file written above.
# The extension must be ".csv" to match the file that was created
# ("toronto_canada.csv"); ".cvs" points at a file that does not exist.
df = pd.read_csv("toronto_canada.csv")

#### What the data will look like before cleaning: sorting, removing 'Not assigned'

In [348]:
# Display the first 20 rows of the dataframe as loaded, before any rows are removed.
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [364]:
# Collect the index labels of rows whose Borough / Neighbourhood is the
# placeholder 'Not assigned', then report how many of each there are.
borough_is_unassigned = df['Borough'] == 'Not assigned'
neighbourhood_is_unassigned = df['Neighbourhood'] == 'Not assigned'
borough_area = df.index[borough_is_unassigned]
neighbourhood_area = df.index[neighbourhood_is_unassigned]
for unassigned_labels in (borough_area, neighbourhood_area):
    print(len(unassigned_labels))

0
0


#### General statistics of the data

In [349]:
# Display summary statistics for the dataframe's columns.
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,288,288,288
unique,180,12,209
top,M8Y,Not assigned,Not assigned
freq,8,77,78


#### Showing the shape of the dataframe (a tuple of row and column counts)

In [350]:
# Display the dataframe's dimensions as a (rows, columns) tuple.
df.shape

(288, 3)

In [351]:
# Display the three named columns for the rows up to index label 12
# (.loc slices by label, so the end label is included).
df.loc[:12,["Postcode", "Borough", "Neighbourhood"]]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


#### Dropping data

In [356]:
# Remove every row whose Borough or Neighbourhood is the placeholder
# 'Not assigned', then renumber the index from 0.
# A single boolean mask is computed from the current dataframe, so the cell
# does not depend on index labels captured in an earlier cell. Dropping in two
# steps with such labels used them as positions in an index that had already
# shrunk, which removes the wrong rows or raises IndexError.
not_assigned = (df['Borough'] == 'Not assigned') | (df['Neighbourhood'] == 'Not assigned')
df = df.loc[~not_assigned].reset_index(drop=True)
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Malvern
8,M3B,North York,Don Mills North
9,M4B,East York,Woodbine Gardens


#### After cleaning

In [358]:
# Recompute the index labels of any rows still marked 'Not assigned' and
# print how many remain for each column.
borough_area = df.index[df['Borough'].eq('Not assigned')]
neighbourhood_area = df.index[df['Neighbourhood'].eq('Not assigned')]
print(len(borough_area))
print(len(neighbourhood_area))

0
0


In [360]:
# Report how many distinct values each column holds.
borough_count = len(df['Borough'].unique())
postcode_count = len(df['Postcode'].unique())
neighbourhood_count = len(df['Neighbourhood'].unique())
print('Boroughs: {}'.format(borough_count))
print('Postal Codes: {}'.format(postcode_count))
print('Neighbourhoods: {}'.format(neighbourhood_count))

Boroughs: 10
Postal Codes: 102
Neighbourhoods: 207


In [361]:
# Display the first 12 rows of the dataframe in its current state.
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Malvern
8,M3B,North York,Don Mills North
9,M4B,East York,Woodbine Gardens


In [362]:
# Print the dataframe's dimensions as a (rows, columns) tuple.
print(df.shape)

(209, 3)
