## Analysis of business opportunities near Metro stations in Delhi

Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [1]:
import requests # library to handle requests
from lxml import html # library to read html page
from bs4 import BeautifulSoup

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  52.48 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  30.78 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  36.59 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  46.58 MB/s
Libraries imported.


## Download and Explore Dataset

In [47]:
#Fetch metro stations list from wikipedia page.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Delhi_Metro_stations")
soup = BeautifulSoup(r.content, 'html.parser')
data = []
table = soup.find('table', class_="wikitable sortable")
table_body = table.find('tbody')
rows = table_body.find_all('tr')

for row in rows:    
    cols = row.find_all(['td','th'])
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele]) # Get rid of empty values

# Create dataframe
df = pd.DataFrame(data)
df = df[1:]

# Define column header
df.columns = ['Sr','StationName','Hindi Name','Line','Opened','Layout','Notes','Refs','Platform Layout']

# Ignore additional columns
df=df[['StationName','Line','Opened','Layout']]

# Ignore cells with metro lines name as station name
df=df[~df.StationName.str.contains("Line")]

# Ignore cells with station layout 'At Grade', as these stations are for interchange only and have no exit points.
df=df[~df.Layout.str.contains("Grade")]

# Remove special symbols from column names.
df['StationName']=df['StationName'].str.replace('*','')
df['StationName']=df['StationName'].str.replace('†','')
df['StationName']=df['StationName'].str.replace('¤','')
df.reset_index(drop=True)

Unnamed: 0,StationName,Line,Opened,Layout
0,Adarsh Nagar,Yellow Line,4 February 2009,Elevated
1,AIIMS,Yellow Line,3 September 2010,Underground
2,Akshardham,Blue Line,12 November 2009,Elevated
3,Anand Vihar ISBT,Blue Line branch,6 January 2010,Elevated
4,Arjan Garh,Yellow Line,21 June 2010,Elevated
5,Arthala,Red Line,8 March 2019,Elevated
6,Ashok Park Main,Green Line,2 April 2010,Elevated
7,Ashram,Pink Line,31 December 2018,Underground
8,Azadpur,Yellow Line,4 February 2009,Elevated
9,Badarpur Border,Violet Line,14 January 2011,Elevated


#### Let's create a function to get latitude, longitude of each cell

In [49]:
def getNearbyVenues(row):
    address=row['StationName']
    add=''
    latitude = 0
    longitude = 0
    geolocator = Nominatim(user_agent="ny_explorer")
    try:
        add = address+', National Capital Territory of Delhi, IN'
        location = geolocator.geocode(add)
        latitude = location.latitude
        longitude = location.longitude
    except:
        if '–' in address:
            add = address.split('–')[0]
            add+=', National Capital Territory of Delhi, IN'
        else:  
            add = address+', IN'
        try:
            location = geolocator.geocode(add)
            latitude = location.latitude
            longitude = location.longitude 
        except:
            latitude = 0
            longitude = 0
            
    return pd.Series([latitude,longitude])

In [50]:
df[['Latitude','Longitude']] = df.apply(getNearbyVenues, axis=1)

In [51]:
df = df[df.Latitude != 0.00]

Unnamed: 0,StationName,Line,Opened,Layout,Latitude,Longitude
1,Adarsh Nagar,Yellow Line,4 February 2009,Elevated,28.614193,77.071541
2,AIIMS,Yellow Line,3 September 2010,Underground,28.566855,77.207833
3,Akshardham,Blue Line,12 November 2009,Elevated,28.612517,77.277318
4,Anand Vihar ISBT,Blue Line branch,6 January 2010,Elevated,28.646702,77.315509
6,Arjan Garh,Yellow Line,21 June 2010,Elevated,28.480716,77.125784
7,Arthala,Red Line,8 March 2019,Elevated,0.0,0.0
8,Ashok Park Main,Green Line,2 April 2010,Elevated,28.671633,77.155301
9,Ashram,Pink Line,31 December 2018,Underground,28.575177,77.256932
10,Azadpur,Yellow Line,4 February 2009,Elevated,28.707069,77.180383
12,Badarpur Border,Violet Line,14 January 2011,Elevated,28.493416,77.303334


In [38]:
df=df[~df.Latitude.str.contains("0.000000")]

Unnamed: 0,StationName,Line,Opened,Layout,latitude,longitude
1,Adarsh Nagar,Yellow Line,4 February 2009,Elevated,28.614193,77.071541
2,AIIMS,Yellow Line,3 September 2010,Underground,28.566855,77.207833
3,Akshardham,Blue Line,12 November 2009,Elevated,28.612517,77.277318
4,Anand Vihar ISBT,Blue Line branch,6 January 2010,Elevated,28.646702,77.315509
6,Arjan Garh,Yellow Line,21 June 2010,Elevated,28.480716,77.125784
7,Arthala,Red Line,8 March 2019,Elevated,0.0,0.0
8,Ashok Park Main,Green Line,2 April 2010,Elevated,28.671633,77.155301
9,Ashram,Pink Line,31 December 2018,Underground,28.575177,77.256932
10,Azadpur,Yellow Line,4 February 2009,Elevated,28.707069,77.180383
12,Badarpur Border,Violet Line,14 January 2011,Elevated,28.493416,77.303334


In [None]:
# The code was removed by Watson Studio for sharing.