# Best place to open a new Movie Theater as per business competition in Hyderabad, India

##### Importing the necessary libraries.

In [1]:
import os
import time
import webbrowser
from pathlib import Path

import folium
import geopy.distance
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
from IPython.display import display
from sklearn.cluster import KMeans

# Display settings: never truncate rows or columns, and allow very wide text output.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 2000),
):
    pd.set_option(option_name, option_value)

##### Scraping neighborhood data of Hyderabad from this Wikipedia [page](https://en.wikipedia.org/wiki/Category:Neighbourhoods_in_Hyderabad,_India)

Function to find latitude and longitude of a place.

In [None]:
# Shared Nominatim (OpenStreetMap) geocoder used by every lookup in this notebook.
# geopy's default request timeout is 1 second, which the public Nominatim server
# frequently exceeds; a longer timeout avoids most of the failures that the
# retry loops below have to absorb.
geolocator = Nominatim(user_agent="ny_explorer", timeout=10)

def latlng(place, max_attempts=9, retry_delay=1.0):
    """Geocode a Hyderabad neighbourhood name with the module-level ``geolocator``.

    Parameters
    ----------
    place : str
        Neighbourhood name; it is looked up as ``'Hyderabad, <place>'``.
    max_attempts : int, optional
        Maximum number of geocoding requests to make when requests keep
        failing (default 9, the number the original loop performed).
    retry_delay : float, optional
        Seconds to wait before each retry, so that a failing public
        geocoding service is not hit in a tight loop. ``0`` disables waiting.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` on success, or the sentinel
        ``('Not Found', 'Not Found')`` when the place cannot be geocoded.
        The sentinel is kept because the scraping cell filters on it.
    """
    address = 'Hyderabad, '+place

    for attempt in range(max_attempts):
        if attempt and retry_delay > 0:
            time.sleep(retry_delay)
        try:
            location = geolocator.geocode(address)
        except Exception:
            # Request failed (timeout, service error, ...): try again.
            # `Exception` rather than a bare `except` so Ctrl-C still works.
            continue
        if location is None:
            # The service answered but has no match; repeating the same
            # query would only produce the same answer.
            break
        return (location.latitude, location.longitude)

    return ('Not Found', 'Not Found')

Scraping and generating a dataframe.

In [None]:
# Download the Wikipedia category page that lists Hyderabad's neighbourhoods.
# A timeout and an explicit status check make a failed download raise here
# instead of silently producing an empty dataframe further down.
response = requests.get(
    'https://en.wikipedia.org/wiki/Category:Neighbourhoods_in_Hyderabad,_India',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# Each "mw-category-group" div holds the links for one letter of the alphabet.
divs = soup.find_all('div', class_="mw-category-group")

neighborhood_records = []
for div in divs:
    for link in div.find_all('a'):
        title = link.text
        # Strip disambiguation suffixes: "Name (something)" or "Name, City".
        if title.endswith(')'):
            name = title.split(' (')[0]
        else:
            name = title.split(',')[0]
        # The full link text (not the stripped name) is what gets geocoded.
        lat, lng = latlng(title)
        print(title, lat, lng)
        neighborhood_records.append(
            {'Neighborhood': name, 'Latitude': lat, 'Longitude': lng}
        )

hyd_neighborhood = pd.DataFrame(
    neighborhood_records, columns=['Neighborhood', 'Latitude', 'Longitude']
)
# Drop places the geocoder could not resolve, then store coordinates as floats
# (the sentinel strings otherwise leave both columns with object dtype).
hyd_neighborhood = hyd_neighborhood[hyd_neighborhood['Latitude'] != 'Not Found']
hyd_neighborhood = hyd_neighborhood.astype({'Latitude': float, 'Longitude': float})
display(hyd_neighborhood.head(10))
print('Shape of the resulting dataframe is '+str(hyd_neighborhood.shape))

Plotting the neighborhood points on map.

In [None]:
# Geocode the city centre with a bounded number of attempts. If the service
# keeps failing or returns no match, fall back to the city-centre coordinates
# that the theatre map cell of this notebook hard-codes.
location = None
for attempt in range(5):
    try:
        location = geolocator.geocode('Hyderabad, India')
    except Exception as error:
        # `Exception` rather than a bare `except` so Ctrl-C still works.
        print('Geocoding attempt {} failed: {}'.format(attempt + 1, error))
        time.sleep(1)
        continue
    break

if location is not None:
    latitude = location.latitude
    longitude = location.longitude
else:
    latitude = 17.3850
    longitude = 78.4867

# NOTE: the variable is called `map_toronto` although it shows Hyderabad;
# the name is kept in case other cells refer to it.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per neighbourhood, with the neighbourhood name as popup.
for lat, lng, neighborhood in zip(hyd_neighborhood['Latitude'], hyd_neighborhood['Longitude'], hyd_neighborhood['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

##### Using the Foursquare API to get venue details.

Define API credentials.

In [None]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in (or printed by) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be revoked in the Foursquare developer console.
clientID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
clientSecret = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
version = '20180605'

# Report only whether each credential is present, never its value.
print('Credentials:')
print('CLIENT ID: ' + ('set' if clientID else 'NOT SET'))
print('CLIENT SECRET: ' + ('set' if clientSecret else 'NOT SET'))

Using the API to get the venue data.

In [2]:
# def getNearbyVenues(names, latitudes, longitudes, radius=2000, limit=100):
#     venues_list=[]
    
#     for name, lat, lng in zip(names, latitudes, longitudes):
#         url = 'https://api.foursquare.com/v2/venues/explore?&client_id='+clientID+'&client_secret='+clientSecret+'&v='+version+'&ll='+str(lat)+','+str(lng)+'&radius='+str(radius)+'&limit='+str(limit)     
#         results = requests.get(url).json()["response"]['groups'][0]['items']
        
#         venues_list.append([(
#             name, 
#             lat, 
#             lng, 
#             v['venue']['name'], 
#             v['venue']['location']['lat'], 
#             v['venue']['location']['lng'],  
#             v['venue']['categories'][0]['name']) for v in results])

#     nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
#     nearby_venues.columns = ['Neighborhood', 
#                   'Neighborhood Latitude', 
#                   'Neighborhood Longitude', 
#                   'Venue', 
#                   'Venue Latitude', 
#                   'Venue Longitude', 
#                   'Venue Category']
    
#     return(nearby_venues)

# venues = getNearbyVenues(names=hyd_neighborhood['Neighborhood'], latitudes=hyd_neighborhood['Latitude'], longitudes=hyd_neighborhood['Longitude'])
# Load the venue data cached from an earlier run of the Foursquare query above.
venues = pd.read_csv('hyderabad.csv')
display(venues.head(10))
print('Shape of the resulting dataframe is'+str(venues.shape))
# index=False: writing the row index back would add one more "Unnamed: 0..."
# column to the file every time this cell is executed.
venues.to_csv('hyderabad.csv', index=False)

Unnamed: 0.1,Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,A. S. Rao Nagar,17.47995,78.556834,The Coffee Cup,17.48318,78.552104,Café
1,1,A. S. Rao Nagar,17.47995,78.556834,Cafe Coffee Day,17.481262,78.555077,Café
2,2,A. S. Rao Nagar,17.47995,78.556834,Fifth Avenue Bakers,17.487673,78.542793,Bakery
3,3,A. S. Rao Nagar,17.47995,78.556834,Domino's Pizza,17.475035,78.553141,Pizza Place
4,4,A. S. Rao Nagar,17.47995,78.556834,Woodland Restaurant,17.476646,78.566404,Snack Place
5,5,A. S. Rao Nagar,17.47995,78.556834,Parivaar Restaurant,17.47685,78.563525,Indian Restaurant
6,6,A. S. Rao Nagar,17.47995,78.556834,KFC,17.47504,78.553137,Fast Food Restaurant
7,7,A. S. Rao Nagar,17.47995,78.556834,Mama Mia Italia,17.487395,78.540078,Pizza Place
8,8,A. S. Rao Nagar,17.47995,78.556834,McDonald's,17.476961,78.564754,Fast Food Restaurant
9,9,A. S. Rao Nagar,17.47995,78.556834,Swagath Grand,17.482022,78.553261,Indian Restaurant


Shape of the resulting dataframe is(5735, 8)


###### Let's take the closest neighborhoods of each place and create a new dataframe. We use the latitude and longitude to calculate the distance between places, set a distance threshold, and treat a place as a neighbor of another if the distance between them falls below the threshold.

In [None]:
# For every neighbourhood, list the neighbourhoods whose geodesic distance is
# below `threshold` kilometres. A place is at distance 0 from itself, so it
# appears in its own list. Names are stored as one comma-terminated string
# ("A,B,C,").
threshold = 4.0

neighborhood_points = list(zip(hyd_neighborhood['Latitude'], hyd_neighborhood['Longitude'], hyd_neighborhood['Neighborhood']))

neighbour_records = []
for lat, lng, place in neighborhood_points:
    # Collect the neighbours for this row directly. Looking the row up by
    # name (list.index) would return the first match and so merge the
    # neighbours of any duplicated neighbourhood name into one row.
    nearby = [
        place2
        for lat2, lng2, place2 in neighborhood_points
        if geopy.distance.distance((lat, lng), (lat2, lng2)).km < threshold
    ]
    neighbour_records.append({
        'Place': place,
        'Neighbors': ''.join(name + ',' for name in nearby),
    })

neighbours = pd.DataFrame(neighbour_records, columns=['Place', 'Neighbors'])
display(neighbours.head(10))

#### THE MAIN IDEA OF SOLVING THE PROBLEM
Now let's separate only the movie theatre venue categories from the venue list. We will divide all the neighborhoods in Hyderabad into 3 clusters depending on the number of movie theatres: say cluster-1 has many theatres, cluster-2 has a few theatres, and cluster-3 has very few or no theatres. Clearly we would want to set up our new theatre in cluster-3. <br/>The purpose of the neighboring places (in the neighbours dataframe) is to further filter the neighborhoods. If a particular neighborhood in cluster-3 is close to another neighborhood that is in cluster-1, there can still be some competition. On the whole, the best place to set up a new theatre is one where there are not many theatres both in that particular place and in its neighboring places, thus ensuring the least competition.

###### Creating a new dataframe with only the venue categories of movie theatres

In [3]:
# Keep only the venues whose category is one of the two cinema categories.
theatre_categories = ['Indie Movie Theater', 'Multiplex']
is_theatre = venues['Venue Category'].isin(theatre_categories)
theatres = venues.loc[is_theatre].reset_index(drop=True)
display(theatres.head(10))
print(theatres.shape)

Unnamed: 0.1,Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,24,A.C. Guards,17.402804,78.459487,IMAX Screen,17.413041,78.465618,Multiplex
1,35,A.C. Guards,17.402804,78.459487,Prasad's IMAX,17.413054,78.465578,Multiplex
2,45,A.C. Guards,17.402804,78.459487,Prasads Screen 2,17.412897,78.465712,Multiplex
3,78,A.C. Guards,17.402804,78.459487,Prasads imax,17.413141,78.465634,Multiplex
4,95,A.C. Guards,17.402804,78.459487,Prasad's Screen 5,17.413044,78.465607,Multiplex
5,100,A.C. Guards,17.402804,78.459487,Prasads screen 1,17.412967,78.465659,Multiplex
6,167,Abids,17.389478,78.477182,Inox Maheshwari Paremeshwari,17.390728,78.488352,Multiplex
7,192,Abids,17.389478,78.477182,Tarakarama Cineplex,17.390854,78.488539,Indie Movie Theater
8,355,Ameerpet,17.437501,78.448251,PVR Cinemas,17.426516,78.453261,Multiplex
9,398,Ameerpet,17.437501,78.448251,Pvr Irrum Manzil,17.420914,78.455236,Multiplex


(151, 8)


##### Let's plot these theatres on the map.

In [7]:
# location = None
# while location is None:
#     try:
#         location = geolocator.geocode('Hyderabad, India')
#     except:
#         pass
# print('Latitude and Longitude of Hyderabad is: {}, {}'.format(latitude , longitude))
# Hard-coded map centre (replaces the commented-out geocoder lookup above).
latitude, longitude = 17.3850, 78.4867

map_theatres = folium.Map(location=[latitude, longitude], zoom_start=11)
marker_cluster = MarkerCluster().add_to(map_theatres)

# Add one clustered circle marker per theatre, with the venue name as popup.
theatre_points = zip(
    theatres['Venue Latitude'],
    theatres['Venue Longitude'],
    theatres['Venue'],
)
for lat, lng, venue in theatre_points:
    popup = folium.Popup('{}'.format(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(marker_cluster)

map_theatres

In [None]:
# Plot the theatre locations with gmplot and write the map to an HTML file.
lat_list = theatres['Venue Latitude']
lng_list = theatres['Venue Longitude']

# NOTE(review): `gmplot` is not imported anywhere in the visible part of this
# notebook; add `import gmplot` to the imports cell before running this cell.
gmap = gmplot.GoogleMapPlotter(17.3850, 78.4867, 13)
gmap.scatter(lat_list, lng_list)

gmap.draw("theatres.html")
# Open the generated page in the default browser. Passing the file name to the
# shell (os.system) only opens it on Windows; on other systems the shell tries
# to execute it as a command.
webbrowser.open(Path("theatres.html").resolve().as_uri())

##### Creating a one-hot encoding of the venues dataframe to use in clustering