In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lxml
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim 

In [2]:
data = requests.get("http://worldpopulationreview.com/countries/kenya-population/")
data.text

'<!DOCTYPE html><html><head><meta charSet="utf-8" class="next-head next-head"/><meta name="viewport" content="width=device-width, initial-scale=1" class="next-head"/><link href="http://fonts.googleapis.com/css?family=Arvo" rel="stylesheet" type="text/css" class="next-head"/><link href="http://fonts.googleapis.com/css?family=PT+Sans" rel="stylesheet" type="text/css" class="next-head"/><link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" class="next-head"/><link href="//netdna.bootstrapcdn.com/bootstrap/3.0.0/css/bootstrap-glyphicons.css" rel="stylesheet" class="next-head"/><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/css/bootstrap.min.css" class="next-head"/><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.0.0-beta/css/bootstrap-grid.css" class="next-head"/><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/flexboxgrid/6.3.1/flexboxg

In [3]:
soup = BeautifulSoup(data.text)
print(soup.title)

<title class="next-head">Kenya Population 2019 (Demographics, Maps, Graphs)</title>


In [4]:
soup.title.get_text()

'Kenya Population 2019 (Demographics, Maps, Graphs)'

In [5]:
# pd.read_html("http://worldpopulationreview.com/countries/kenya-population/")

In [6]:
soup.find("table").get("class")

['table', 'table-striped']

In [7]:
range(1, len(soup.find_all("table")[3].find_all('td')))
range

range

In [8]:
city_population = {"Name":[], "Population":[]}
try:
    for i in range(2, len(soup.find_all("table")[3].find_all('td'))):
        if i % 2 == 0:
            city_population["Name"].append(soup.find_all("table")[3].find_all('td')[i].get_text())
        else:
            city_population["Population"].append(soup.find_all("table")[3].find_all('td')[i].get_text())
except:
    print("Some Error Somewhere")
        
city_population_df = pd.DataFrame(city_population)
city_population_df

Unnamed: 0,Name,Population
0,Nairobi,2750547
1,Mombasa,799668
2,Nakuru,259903
3,Eldoret,218446
4,Kisumu,216479
5,Thika,200000
6,Malindi,118265
7,Kitale,75123
8,Garissa,67861
9,Kakamega,63426


In [9]:
nairobi_df = pd.read_html("https://en.wikipedia.org/wiki/List_of_constituencies_of_Kenya", header=0)
nairobi_df[1]

Unnamed: 0,Constituency,Wards (85)
0,274. Westlands,Kitisuru • Parklands/Highridge • Karura • Kang...
1,275. Dagoretti North,Kilimani • Kawangware • Gatina • Kileleshwa • ...
2,276. Dagoretti South,Mutu-ini • Ngand'o • Riruta • Uthiru/Ruthimitu...
3,277. Lang'ata,Karen • Nairobi West • Mugumo-ini • South C • ...
4,278. Kibra,Laini Saba • Lindi • Makina • Woodley/Kenyatta...
5,279. Roysambu,Githurai • Kahawa West • Zimmerman • Roysambu ...
6,280. Kasarani,Clay City • Mwiki • Kasarani • Njiru • Ruai
7,281. Ruaraka,Babadogo • Utalii • Mathare North • Lucky Summ...
8,282. Embakasi South,Imara Daima • Kwa Njenga • Kwa Reuben • Pipeli...
9,283. Embakasi North,Kariobangi North • Dandora Area I • Dandora Ar...


In [10]:
def split_data_frame_list(df, 
                       target_column,
                      output_type=float):
    ''' 
    Accepts a column with multiple types and splits list variables to several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: type of all outputs
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    row_accumulator = []
    def split_list_to_rows(row):
        split_row = row[target_column]
        if isinstance(split_row, list):
          for s in split_row:
              new_row = row.to_dict()
              new_row[target_column] = s
              row_accumulator.append(new_row)
          if split_row == []:
              new_row = row.to_dict()
              new_row[target_column] = None
              row_accumulator.append(new_row)
        else:
          new_row = row.to_dict()
          new_row[target_column] = split_row
          row_accumulator.append(new_row)
    df.apply(split_list_to_rows, axis=1)
    new_df = pd.DataFrame(row_accumulator)
    return new_df

In [11]:
nairobi_df = nairobi_df[1]

In [12]:
# Clean the Column Names
nairobi_df.columns = ["Constituency", "Wards"]
nairobi_df

Unnamed: 0,Constituency,Wards
0,274. Westlands,Kitisuru • Parklands/Highridge • Karura • Kang...
1,275. Dagoretti North,Kilimani • Kawangware • Gatina • Kileleshwa • ...
2,276. Dagoretti South,Mutu-ini • Ngand'o • Riruta • Uthiru/Ruthimitu...
3,277. Lang'ata,Karen • Nairobi West • Mugumo-ini • South C • ...
4,278. Kibra,Laini Saba • Lindi • Makina • Woodley/Kenyatta...
5,279. Roysambu,Githurai • Kahawa West • Zimmerman • Roysambu ...
6,280. Kasarani,Clay City • Mwiki • Kasarani • Njiru • Ruai
7,281. Ruaraka,Babadogo • Utalii • Mathare North • Lucky Summ...
8,282. Embakasi South,Imara Daima • Kwa Njenga • Kwa Reuben • Pipeli...
9,283. Embakasi North,Kariobangi North • Dandora Area I • Dandora Ar...


In [13]:
nairobi_df.Wards = nairobi_df.Wards.apply(lambda s: s.split('•'))

In [14]:
nairobi_df.head()

Unnamed: 0,Constituency,Wards
0,274. Westlands,"[Kitisuru , Parklands/Highridge , Karura , ..."
1,275. Dagoretti North,"[Kilimani , Kawangware , Gatina , Kileleshw..."
2,276. Dagoretti South,"[Mutu-ini , Ngand'o , Riruta , Uthiru/Ruthi..."
3,277. Lang'ata,"[Karen , Nairobi West , Mugumo-ini , South ..."
4,278. Kibra,"[Laini Saba , Lindi , Makina , Woodley/Keny..."


In [15]:
nairobi_df = split_data_frame_list(nairobi_df, target_column="Wards")

In [16]:
nairobi_df.head()

Unnamed: 0,Constituency,Wards
0,274. Westlands,Kitisuru
1,274. Westlands,Parklands/Highridge
2,274. Westlands,Karura
3,274. Westlands,Kangemi
4,274. Westlands,Mountain View


In [17]:
# Remove the Digits from the Characters of the Constituency
nairobi_df.Constituency = nairobi_df.Constituency.apply(lambda x: x[4:])
nairobi_df.head()

Unnamed: 0,Constituency,Wards
0,Westlands,Kitisuru
1,Westlands,Parklands/Highridge
2,Westlands,Karura
3,Westlands,Kangemi
4,Westlands,Mountain View


In [34]:
nairobi_df.Wards.values[:5]

array(['Kitisuru\xa0', ' Parklands/Highridge\xa0', ' Karura\xa0',
       ' Kangemi\xa0', ' Mountain View'], dtype=object)

In [35]:
nairobi_df.Constituency.values[:5]

array([' Westlands', ' Westlands', ' Westlands', ' Westlands',
       ' Westlands'], dtype=object)

In [36]:
# string = string.replace(u'\xa0', u' ')
nairobi_wards=[]
for i in nairobi_df.Wards.values:
    nairobi_wards.append(i.replace(u'\xa0', u''))
    
nairobi_wards[:5]

['Kitisuru', ' Parklands/Highridge', ' Karura', ' Kangemi', ' Mountain View']

In [39]:
latitude_array = []
longitude_array = []

In [40]:
for ward, constituency in zip(nairobi_wards, nairobi_df.Constituency.values):
    try:
        location = geolocator.geocode(ward + ", Nairobi, Kenya")
        latitude_array.append(location.latitude)
        longitude_array.append(location.longitude)
    except:
        latitude_array.append(np.nan)
        longitude_array.append(np.nan)

-1.2694542
36.8355718558333


In [41]:
latitude = pd.Series(latitude_array)
longitude = pd.Series(longitude_array)

In [42]:
nairobi_df["Latitude"] = latitude
nairobi_df["Longitude"] = longitude

In [43]:
nairobi_df.isna().sum()

Constituency     0
Wards            0
Latitude        21
Longitude       21
dtype: int64

In [44]:
nairobi_df[(nairobi_df.Longitude < 37) & (nairobi_df.Longitude > 36)]

Unnamed: 0,Constituency,Wards,Latitude,Longitude
0,Westlands,Kitisuru,-1.239554,36.783965
2,Westlands,Karura,-1.236988,36.832559
3,Westlands,Kangemi,-1.265426,36.752518
4,Westlands,Mountain View,-1.271131,36.743003
5,Dagoretti North,Kilimani,-1.287442,36.784523
6,Dagoretti North,Kawangware,-1.278463,36.751643
7,Dagoretti North,Gatina,-1.165572,36.855264
8,Dagoretti North,Kileleshwa,-1.272327,36.799688
9,Dagoretti North,Kabiro,-1.287812,36.751501
12,Dagoretti South,Riruta,-1.291704,36.734693


In [46]:
missing_latlong = nairobi_df[nairobi_df.Longitude.isna()]
missing_latlong

Unnamed: 0,Constituency,Wards,Latitude,Longitude
1,Westlands,Parklands/Highridge,,
10,Dagoretti South,Mutu-ini,,
11,Dagoretti South,Ngand'o,,
13,Dagoretti South,Uthiru/Ruthimitu,,
23,Kibra,Woodley/Kenyatta Golf Course,,
30,Kasarani,Clay City,,
46,Embakasi North,Dandora Area I,,
47,Embakasi North,Dandora Area II,,
48,Embakasi North,Dandora Area III,,
49,Embakasi North,Dandora Area IV,,


In [52]:
missing_latlong.index.values

array([ 1, 10, 11, 13, 23, 30, 46, 47, 48, 49, 54, 55, 56, 62, 63, 64, 65,
       71, 72, 75, 77], dtype=int64)

In [68]:
wards_array = ["Parklands", "Mutuini", "Ngando", "Uthiru", "Woodley", "Kasarani", "Dandora", "Dandora", "Dandora", "Dandora", "Spring Valley",
"Embakasi East", "Embakasi East", "Embakasi West", "Embakasi West", "Makadara", "Maringo", "Moi Airbase", "Kamkunji", "Kariokor", "Starehe"]

In [69]:
len(wards_array)

21

In [70]:
lat_long_wards = [[],[]]

for ward in wards_array:
    try:
        location = geolocator.geocode(ward + ", Nairobi, Kenya")
        lat_long_wards[0].append(location.latitude)
        lat_long_wards[1].append(location.longitude)
    except:
        lat_long_wards[0].append(np.nan)
        lat_long_wards[1].append(np.nan)
        
print(lat_long_wards)

[[-1.2630616, -1.3, nan, -1.2631329, -1.3056567, -1.2208721, -1.2484935, -1.2484935, -1.2484935, -1.2484935, -1.2477029, -1.324728, -1.324728, -1.324728, -1.324728, -1.2873101, -1.2920714, nan, nan, -1.2788772, -1.2694542], [36.8106288, 36.7, nan, 36.7188955, 36.7760046486656, 36.9012224, 36.8973498039052, 36.8973498039052, 36.8973498039052, 36.8973498039052, 36.7892356, 36.8877240016107, 36.8877240016107, 36.8877240016107, 36.8877240016107, 36.872107163246, 36.8657259236079, nan, nan, 36.8360883, 36.8355718558333]]


In [71]:
len(lat_long_wards[0])

21

In [72]:
for df_index, array_index in zip(missing_latlong.index.values, range(len(lat_long_wards[0]))):
    nairobi_df.loc[df_index, "Latitude"] = lat_long_wards[0][array_index]
    nairobi_df.loc[df_index, "Longitude"] = lat_long_wards[1][array_index]

In [73]:
nairobi_df[nairobi_df.Longitude.isna()]

Unnamed: 0,Constituency,Wards,Latitude,Longitude
11,Dagoretti South,Ngand'o,,
71,Kamukunji,Airbase,,
72,Kamukunji,California,,


In [76]:
nairobi_df.dropna(axis=0, inplace=True)
nairobi_df[nairobi_df.Longitude.isna()]

Unnamed: 0,Constituency,Wards,Latitude,Longitude


In [29]:
# nairobi_df.loc[77, "Latitude"], nairobi_df.loc[77, "Longitude"] = -1.2694542, 36.8355718558333

In [30]:
# address = 'Starehe, Nairobi, Kenya'

# geolocator = Nominatim(user_agent="ke")
# location = geolocator.geocode(address)
# latitude = location.latitude
# longitude = location.longitude
# print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Starehe, Nairobi, Kenya are -1.2694542, 36.8355718558333.


In [77]:
nairobi_df[nairobi_df.Longitude.isna()]

Unnamed: 0,Constituency,Wards,Latitude,Longitude


In [81]:
nairobi_df.count()

Constituency    82
Wards           82
Latitude        82
Longitude       82
dtype: int64

In [82]:
nairobi_df[(nairobi_df.Longitude < 37) & (nairobi_df.Longitude > 36)].count()

Constituency    82
Wards           82
Latitude        82
Longitude       82
dtype: int64

In [85]:
nairobi_df[(nairobi_df.Latitude < -1) & (nairobi_df.Latitude > -2)].count()

Constituency    82
Wards           82
Latitude        82
Longitude       82
dtype: int64

In [88]:
nairobi_df.to_csv("constituency_ward_lat_long.csv")

In [89]:
!ls

constituency_ward_lat_long.csv
Extracting - Latitude and Longitude from Wards and Constituencies.ipynb
Extracting Nairobi Constituencies and Wards.ipynb
README.md
