# Battle of the Neighbourhoods

This notebook is for the Coursera capstone project. The focus is on Atmospheric Water Generators and on using weather data to find the best locations to install them.

#### Import packages

In [1]:
import numpy as np 
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # this module helps in web scrapping.
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
# pip install lxml
# !conda install -c conda-forge lxml --yes


#### Scrape data for the list of South African provinces

Data on the South African provinces is scraped from Wikipedia and loaded into a DataFrame.

In [2]:
# Wikipedia page that lists South Africa's provinces
url = 'https://en.wikipedia.org/wiki/Provinces_of_South_Africa'
# read_html returns one DataFrame per HTML table found on the page
tables = pd.read_html(url)
# NOTE(review): the provinces table is picked by position (third table);
# this breaks silently if the page layout changes — confirm the index on re-run
df = tables[2]
df.head()

Unnamed: 0,Province,Name in the most spoken native language[9],Capital,Largest city,Area[10]:9,Population(2011)[10]:18,Population density(2011),Human Devel.Index (2003) [11],Location
0,Eastern Cape,iMpuma-Koloni (Xhosa),Bhisho (Bisho),Gqeberha,"168,966 km2 (65,238 sq mi)",6562053,38.8/km2 (100/sq mi),0.62,
1,Free State,Freistata (Sotho),Bloemfontein,Bloemfontein,"129,825 km2 (50,126 sq mi)",2745590,21.1/km2 (55/sq mi),0.67,
2,Gauteng,eGoli (Zulu),Johannesburg,Johannesburg,"18,178 km2 (7,019 sq mi)",12272263,"675.1/km2 (1,749/sq mi)",0.74,
3,KwaZulu-Natal,iKwaZulu-Natali (Zulu),Pietermaritzburg [n 1],Durban,"94,361 km2 (36,433 sq mi)",10267300,108.8/km2 (282/sq mi),0.63,
4,Limpopo,Limpopo (Northern Sotho),Polokwane (Pietersburg),Polokwane,"125,754 km2 (48,554 sq mi)",5404868,43.0/km2 (111/sq mi),0.59,


Edit dataframe columns 

In [3]:
# Columns to keep from the scraped provinces table
clmns = ['Province', 'Population(2011)[10]:18', 'Population density(2011)']

# Shorter, footnote-free names for the two population columns
new_names = {
    'Population(2011)[10]:18': 'Population',
    'Population density(2011)': 'Population Density',
}

# .loc slices by label and includes both ends, so rows 0 through 8 are kept
df_prov = df.loc[0:8, clmns].rename(columns=new_names)

# Overwrite the scraped label of row 8 with the plain province name
df_prov.loc[8, 'Province'] = 'Western Cape'
df_prov

Unnamed: 0,Province,Population,Population Density
0,Eastern Cape,6562053,38.8/km2 (100/sq mi)
1,Free State,2745590,21.1/km2 (55/sq mi)
2,Gauteng,12272263,"675.1/km2 (1,749/sq mi)"
3,KwaZulu-Natal,10267300,108.8/km2 (282/sq mi)
4,Limpopo,5404868,43.0/km2 (111/sq mi)
5,Mpumalanga,4039939,52.8/km2 (137/sq mi)
6,North West,3509953,33.5/km2 (87/sq mi)
7,Northern Cape,1145861,3.1/km2 (8.0/sq mi)
8,Western Cape,5822734,45.0/km2 (117/sq mi)


Create a list of provinces 

Loop through the list of provinces to get location coordinates and add to province dataframe

In [4]:
#  Instantiate column
df_loc = pd.DataFrame(columns = ['Province', 'Latitude', 'Longitude'])

for prov in list(df_prov['Province']):
    address = prov + ', South Africa'
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)

    df_loc = df_loc.append({'Latitude': location.latitude,
                    'Longitude': location.longitude,
                           'Province': prov}, ignore_index=True)
#     print('The coordinates of {} is {} and {}'.format(address, longitude, latitude))
   

In [5]:
# Display the geocoded coordinates of each province
df_loc

Unnamed: 0,Province,Latitude,Longitude
0,Eastern Cape,-32.217183,26.63864
1,Free State,-28.785362,26.497893
2,Gauteng,-25.936344,28.081311
3,KwaZulu-Natal,-28.503833,30.887501
4,Limpopo,-23.473529,29.39592
5,Mpumalanga,-26.276849,30.150015
6,North West,-26.134782,25.654673
7,Northern Cape,-29.573402,21.205136
8,Western Cape,-33.546977,20.72753


In [6]:
# Geocode the country itself; its coordinates are used to centre the map
address = 'South Africa'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

In [7]:
# create map of South Africa using latitude and longitude values
map_SA = folium.Map(location=[latitude, longitude], zoom_start=6)

# add markers to map
for lat, lng, prov in zip(df_loc['Latitude'], df_loc['Longitude'], df_loc['Province']):
    label = prov
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_SA)  
    
map_SA

##  Cape Town

Scrape data for all suburbs in Cape Town

In [51]:
# Wikipedia page listing the suburbs of Cape Town
url = 'https://en.wikipedia.org/wiki/List_of_Cape_Town_suburbs'
# Keep only the HTML tables that carry the 'wikitable' CSS class
tables = pd.read_html(url, attrs={'class': 'wikitable'})


0              Bakoven
1           Bantry Bay
2            Camps Bay
3              Clifton
4             Fresnaye
5          Green Point
6             Hout Bay
7        Imizamo Yethu
8            Llandudno
9        Mouille Point
10           Sea Point
11    Three Anchor Bay
Name: Suburb, dtype: object

In [179]:
# Flatten the 'Suburb' column of every scraped table into one list
subs = [name for tbl in tables for name in tbl.loc[:, 'Suburb']]

# Single-column frame holding all suburb names
surbs = pd.DataFrame(columns=['Suburb'])
surbs['Suburb'] = pd.Series(subs).values
list(surbs['Suburb'])

['Bakoven',
 'Bantry Bay',
 'Camps Bay',
 'Clifton',
 'Fresnaye',
 'Green Point',
 'Hout Bay',
 'Imizamo Yethu',
 'Llandudno',
 'Mouille Point',
 'Sea Point',
 'Three Anchor Bay',
 'Athlone',
 'Bishop Lavis',
 'Bonteheuwel',
 'Crawford',
 'Crossroads',
 'Epping',
 'Gugulethu',
 'Hanover Park',
 'Kalksteenfontein',
 'Khayelitsha',
 'Langa',
 'Lavender Hill',
 'Manenberg',
 'Mitchells Plain',
 'Nyanga',
 'Ottery',
 'Pelican Park',
 'Philippi',
 'Samora Machel',
 'Strandfontein',
 'Valhalla Park',
 'Vrygrond',
 'Bo-Kaap (Malay Quarter)',
 "Devil's Peak Estate",
 'De Waterkant',
 'Foreshore',
 'Gardens',
 'Higgovale',
 'Lower Vrede (District Six)',
 'Oranjezicht',
 'Salt River',
 'Schotsche Kloof',
 'Tamboerskloof',
 'University Estate',
 'Vredehoek',
 'Walmer Estate (District Six)',
 'Woodstock (including Upper Woodstock)',
 'Zonnebloem (District Six)',
 'Firgrove',
 "Gordon's Bay",
 'Macassar',
 "Sir Lowry's Pass Village",
 'Somerset West',
 'Strand',
 'Belhar',
 'Bellville',
 'Brackenfe

In [201]:
#  Instantiate column
df_surbs = pd.DataFrame(columns = ['Suburbs', 'Latitude', 'Longitude'])
geolocator = Nominatim(user_agent="ny_explorer")
i = 1
for sub in list(surbs['Suburb']) :
#     address = 'Woodstock (including Upper Woodstock)' + ', South Africa'
# address = 'Salt River' + ', South Africa'
#     geolocator = Nominatim(user_agent="ny_explorer")
    try:
        location = geolocator.geocode(sub + ', Cape Town, South Africa')
        longitude = location.longitude
        latitude =location.latitude
    except:
#         location = geolocator.geocode(sub.split(' (',)[0] + ', South Africa')
#         longitude = location.longitude
#         latitude =location.latitude
        None
        
#     print( '{}. The coordinates of {} is {} and {}'.format(i, sub, longitude, latitude))
    i +=1 

    df_surbs = df_surbs.append({'Latitude': latitude,
                    'Longitude': longitude,
                           'Suburbs': sub}, ignore_index=True)
    print( '{}. The coordinates of {} is {} and {}'.format(i, sub, longitude, latitude))

2. The coordinates of Bakoven is 18.3827778 and -33.96
3. The coordinates of Bantry Bay is 18.3789695 and -33.9281512
4. The coordinates of Camps Bay is 18.38185236529182 and -33.954773599999996
5. The coordinates of Clifton is 18.3790703 and -33.9352848
6. The coordinates of Fresnaye is 18.3877426 and -33.9251944
7. The coordinates of Green Point is 18.4059682 and -33.9042571
8. The coordinates of Hout Bay is 18.355645 and -34.043093
9. The coordinates of Imizamo Yethu is 18.3619151 and -34.0288149
10. The coordinates of Llandudno is 18.3442322 and -34.008588
11. The coordinates of Mouille Point is 18.4038842 and -33.9000229
12. The coordinates of Sea Point is 18.3922222 and -33.9172222
13. The coordinates of Three Anchor Bay is 18.3955556 and -33.9091667
14. The coordinates of Athlone is 18.505 and -33.9666667
15. The coordinates of Bishop Lavis is 18.5758333 and -33.9486111
16. The coordinates of Bonteheuwel is 18.552794065056574 and -33.951612
17. The coordinates of Crawford is 18.

126. The coordinates of Tokai is 18.4416667 and -34.0608333
127. The coordinates of Wynberg is 18.4672507 and -34.0036249
128. The coordinates of Wetton is 18.5186111 and -34.0047222
129. The coordinates of Capri Village is 18.3866667 and -34.1352778
130. The coordinates of Clovelly is 18.429309632304506 and -34.1244047
131. The coordinates of Fish Hoek is 18.42467228851848 and -34.137645
132. The coordinates of Glencairn is 18.4284987 and -34.1595402
133. The coordinates of Kalk Bay is 18.44683161699767 and -34.12717245
134. The coordinates of Kommetjie is 18.3269939 and -34.1398647
135. The coordinates of Lakeside is 18.4569444 and -34.0872222
136. The coordinates of Marina da Gama is 18.4771588 and -34.0910877
137. The coordinates of Masiphumelele is 18.375791 and -34.129372
138. The coordinates of Muizenberg is 18.484570761750607 and -34.0978389
139. The coordinates of Noordhoek (PO boxes only) is 18.484570761750607 and -34.0978389
140. The coordinates of Ocean View is 18.352985 an

In [205]:
# Geocode the city; its coordinates are used to centre the map
address = 'Cape Town'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# Base map of Cape Town
map_CT = folium.Map(location=[latitude, longitude], zoom_start=15)

# One small circle marker per suburb; clicking it shows the suburb name
suburb_points = zip(df_surbs['Suburbs'], df_surbs['Latitude'], df_surbs['Longitude'])
for name, lat, lng in suburb_points:
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=folium.Popup(name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_CT)

map_CT

## FourSquare

In [219]:
url = 'https://en.wikipedia.org/wiki/List_of_Cape_Town_suburbs'
# Download the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops an HTTP error page from being parsed.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
# Parse the HTML text into a BeautifulSoup tree
soup = BeautifulSoup(data, "html5lib")
# Collect the id attribute of every <h2> heading (None when a heading has no id)
Link = [heading.get('id') for heading in soup.find_all('h2')]
Link

['mw-toc-heading', None, None, None, None, None, None, None, None, None, None]

In [12]:
# Grab the first <table> element of the parsed page
table = soup.find('table') # in html table is represented by the tag <table>

In [17]:
# Re-parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(data, "html.parser")

# Preview only the start of the document; printing the whole page floods the cell output
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Cape Town suburbs - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YD4dmkZbQYfHK3-W7ieUuwAAAUw","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Cape_Town_suburbs","wgTitle":"List of Cape Town suburbs","wgCurRevisionId":999704300,"wgRevisionId":999704300,"wgArticleId":1505361,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["EngvarB from May 2013","Use dmy dates from May 2013","Suburbs of Cape Town","Lists of suburbs in South Africa","Cape Town-re

In [46]:
# Every table on the page that carries the 'wikitable' class
table = soup.find_all('table', attrs={'class': 'wikitable'})
# The result of this lookup is neither stored nor displayed
table[0].find_all('tbody')

# Print the rows of every <tbody> in the whole document (not only the wikitables)
for tbody in soup.find_all('tbody'):
    rows = tbody.find_all('tr')
    print(rows)


[<tr>
<th>Suburb</th>
<th>Street Code</th>
<th>Postal Code</th>
<th>Dialing prefix
</th></tr>, <tr>
<td><a href="/wiki/Bakoven" title="Bakoven">Bakoven</a></td>
<td>8005</td>
<td>
</td></tr>, <tr>
<td><a href="/wiki/Bantry_Bay,_Cape_Town" title="Bantry Bay, Cape Town">Bantry Bay</a></td>
<td>8005</td>
<td>
</td></tr>, <tr>
<td><a href="/wiki/Camps_Bay" title="Camps Bay">Camps Bay</a></td>
<td>8005</td>
<td>8040</td>
<td>+27 21 438
</td></tr>, <tr>
<td><a href="/wiki/Clifton,_Cape_Town" title="Clifton, Cape Town">Clifton</a></td>
<td>8005</td>
<td>
</td></tr>, <tr>
<td><a href="/wiki/Fresnaye,_Cape_Town" title="Fresnaye, Cape Town">Fresnaye</a></td>
<td>8005</td>
<td>
</td></tr>, <tr>
<td><a href="/wiki/Green_Point,_Cape_Town" title="Green Point, Cape Town">Green Point</a></td>
<td>8005</td>
<td>8051
</td></tr>, <tr>
<td><a href="/wiki/Hout_Bay" title="Hout Bay">Hout Bay</a></td>
<td>7806</td>
<td>7872
</td></tr>, <tr>
<td><a href="/wiki/Imizamo_Yethu" title="Imizamo Yethu">Imizamo Yeth

In [None]:
# All <h2> headings of the parsed page
header = soup.find_all('h2')
# NOTE(review): pd.read_html only parses <table> elements; 'mw-headline' looks like
# a heading class rather than a table class, so this call presumably raises
# "No tables found" — confirm, and read the heading text from `header` instead if so
heads = pd.read_html(url, attrs = {'class':'mw-headline'} )
type(header)