# Best place to open a new Movie Theater as per business competition in Hyderbad, India

##### Importing the necessary libraries.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from IPython.display import display
import numpy as np
import os

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

from geopy.geocoders import Nominatim
import geopy.distance
import folium
from folium.plugins import MarkerCluster

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)

##### Scraping neighborhood data of hyderabad from wikipedia [page](https://en.wikipedia.org/wiki/Category:Neighbourhoods_in_Hyderabad,_India) 

Function to find latitude and longitude of a place.

In [2]:
geolocator = Nominatim(user_agent="ny_explorer")

def latlng(place):
    address = 'Hyderabad, '+place
    location = None
    i = 0
    while location is None:
        i += 1
        if(i == 10):
            return ('Not Found', 'Not Found')
        try:
            location = geolocator.geocode(address)
        except:
            pass
    
    return (location.latitude, location.longitude)

Scarapping and generating a dataframe.

In [11]:
source = requests.get('https://en.wikipedia.org/wiki/Category:Neighbourhoods_in_Hyderabad,_India').text
soup = BeautifulSoup(source, 'lxml')

hyd_neighborhood = {'Neighborhood':[],'Latitude':[],'Longitude':[]}
divs = soup.find_all('div', class_="mw-category-group")

for div in divs:
    ulist = div.find_all('a')
    for li in ulist:
        if li.text.endswith(')'):
            hyd_neighborhood['Neighborhood'].append(li.text.split(' (')[0])
        else:
            hyd_neighborhood['Neighborhood'].append(li.text.split(',')[0])
        lat, lng = latlng(li.text)
        # print(li.text, lat, lng)
        hyd_neighborhood['Latitude'].append(lat)
        hyd_neighborhood['Longitude'].append(lng)

hyd_neighborhood = pd.DataFrame(hyd_neighborhood)
hyd_neighborhood = hyd_neighborhood[hyd_neighborhood['Latitude'] != 'Not Found']
display(hyd_neighborhood.head(10))
print('Shape of the resulting dataframe is '+str(hyd_neighborhood.shape))

Unnamed: 0,Neighborhood,Latitude,Longitude
0,A. S. Rao Nagar,17.4799,78.5568
1,A.C. Guards,17.4028,78.4595
2,Abhyudaya Nagar,17.3377,78.5647
3,Abids,17.3895,78.4772
4,Adikmet,17.4095,78.5131
6,Aghapura,17.3892,78.4653
8,Alijah Kotla,17.3605,78.4801
9,Allwyn Colony,17.5044,78.415
10,Alwal,17.5022,78.5089
11,Amberpet,17.3903,78.5165


Shape of the resulting dataframe is (146, 3)


Plotting the neighborhood points on map.

In [14]:
location = None
while location is None:
    try:
        location = geolocator.geocode('Hyderabad, India')
    except:
        pass

latitude = location.latitude
longitude = location.longitude
    
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, neighborhood in zip(hyd_neighborhood['Latitude'], hyd_neighborhood['Longitude'], hyd_neighborhood['Neighborhood']):
    label = '{}'.format(neighborhood)
    # print(lat, lng, neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

17.4799497 78.5568336 A. S. Rao Nagar
17.4028042 78.4594873 A.C. Guards
17.3376608 78.564716 Abhyudaya Nagar
17.3894783 78.477182 Abids
17.4095495 78.5130943 Adikmet
17.389178 78.4652731 Aghapura
17.3605451 78.4801015 Alijah Kotla
17.5043618 78.4149849 Allwyn Colony
17.5022292 78.5088584 Alwal
17.390263150000003 78.516481175 Amberpet
17.4375012 78.4482505 Ameerpet
17.4263524 78.4349398685041 Ashok Nagar
17.38362315 78.4461438818854 Asif Nagar
17.3672244 78.4307278 Attapur
17.3761935 78.490874 Azampura
17.3883755 78.4877854 Badichowdi
17.3974359 78.4979706 Bagh Lingampally
17.3410616 78.5423473 Bairamalguda
17.4469233 78.4504513 Balkampet
17.4177464 78.4399014 Banjara Hills
17.4263524 78.4349398685041 Bank Street
17.3235693 78.4791594 Barkas
17.3926121 78.4969683 Barkatpura
17.4050023 78.4769104 Basheerbagh
17.4027867 78.4604489 Bazarghat
17.3730788 78.4705549 Begum Bazaar
17.4440199 78.4624821 Begumpet
17.4629092 78.4305364 Bharat Nagar
17.45167 78.415421 Borabanda
17.4755371 78.479228

##### Using the foursquare api to get venue details.

Define API credentials.

In [4]:
clientID = 'YOUR CLIENT ID'
clientSecret = 'YOUR CLIENT SECRET'
version = '20180605'

print('Credentails:')
print('CLIENT ID: ' + clientID)
print('CLIENT SECRET:' + clientSecret)

Credentails:
CLIENT ID: LKYB2D43HOP4CK5MYWZ0YGHHJFLCKMDVIJVFS2WRWZJVVF03
CLIENT SECRET:IE4HFIDTJHUSLMSNIBJ40UGURYBBRGXFD4UIPKX5YUKC0RO4


Using the API to to get the venue data.

In [5]:
# It might take a lot of time to run this block of code, so the csv file of the same is also put in the repository,so you can
# comment the line where the below function is called and uncooment the line where the csv file is read directly.

def getNearbyVenues(names, latitudes, longitudes, radius=2000, limit=100):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id='+clientID+'&client_secret='+clientSecret+'&v='+version+'&ll='+str(lat)+','+str(lng)+'&radius='+str(radius)+'&limit='+str(limit)     
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

venues = getNearbyVenues(names=hyd_neighborhood['Neighborhood'], latitudes=hyd_neighborhood['Latitude'], longitudes=hyd_neighborhood['Longitude'])
# venues = pd.read_csv('hyderabad.csv')
display(venues.head(10))
print('Shape of the resulting dataframe is'+str(venues.shape))
#venues.to_csv('hyderabad.csv')

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,A. S. Rao Nagar,17.47995,78.556834,The Coffee Cup,17.48318,78.552104,Café
1,A. S. Rao Nagar,17.47995,78.556834,Cafe Coffee Day,17.481262,78.555077,Café
2,A. S. Rao Nagar,17.47995,78.556834,Fifth Avenue Bakers,17.487673,78.542793,Bakery
3,A. S. Rao Nagar,17.47995,78.556834,Domino's Pizza,17.475035,78.553141,Pizza Place
4,A. S. Rao Nagar,17.47995,78.556834,Woodland Restaurant,17.476646,78.566404,Snack Place
5,A. S. Rao Nagar,17.47995,78.556834,Parivaar Restaurant,17.47685,78.563525,Indian Restaurant
6,A. S. Rao Nagar,17.47995,78.556834,KFC,17.47504,78.553137,Fast Food Restaurant
7,A. S. Rao Nagar,17.47995,78.556834,Mama Mia Italia,17.487395,78.540078,Pizza Place
8,A. S. Rao Nagar,17.47995,78.556834,McDonald's,17.476961,78.564754,Fast Food Restaurant
9,A. S. Rao Nagar,17.47995,78.556834,Swagath Grand,17.482022,78.553261,Indian Restaurant


Shape of the resulting dataframe is(5735, 7)


Lets the type of venues available in the city, and their count.

In [6]:
venueCount = {}

for venue in venues['Venue Category']:
    if venue in venueCount.keys():
        venueCount[venue] += 1
    else:
        venueCount[venue] = 1
        
display(venueCount)

{'Café': 340,
 'Bakery': 186,
 'Pizza Place': 173,
 'Snack Place': 64,
 'Indian Restaurant': 753,
 'Fast Food Restaurant': 273,
 'Shopping Mall': 91,
 'Breakfast Spot': 70,
 'Falafel Restaurant': 12,
 'Hyderabadi Restaurant': 41,
 'Lounge': 62,
 'Middle Eastern Restaurant': 44,
 'Bistro': 22,
 'South Indian Restaurant': 78,
 'Multiplex': 127,
 'Science Museum': 15,
 'Ice Cream Shop': 174,
 'BBQ Joint': 44,
 'Restaurant': 132,
 'Vegetarian / Vegan Restaurant': 86,
 'Chaat Place': 27,
 'Scenic Lookout': 8,
 'Food Truck': 27,
 'Hotel': 235,
 'Juice Bar': 64,
 'Park': 41,
 'Hotel Bar': 45,
 'Chinese Restaurant': 145,
 'Donut Shop': 16,
 'Performing Arts Venue': 25,
 'Bookstore': 50,
 'Pub': 43,
 'Smoke Shop': 19,
 'Italian Restaurant': 62,
 'Stadium': 32,
 'Coffee Shop': 237,
 'Garden': 13,
 'Sandwich Place': 102,
 'Department Store': 114,
 'Bowling Alley': 23,
 'Electronics Store': 22,
 'Dessert Shop': 82,
 'Fruit & Vegetable Store': 4,
 'Arcade': 13,
 'Movie Theater': 86,
 'Shoe Store': 

The movie theatres are shown in 3 category names which are - 'Movie Theater', 'Multiplex', 'Indie Movie Theater'.

###### Lets take the closest neighborhoods of each place and create a new dataframe. We use the latitude and longitude for this and calculate the distance, set a threshold for distance and consider it a neighbor for the respective place if the distance falls below the thresold.

In [30]:
neighbours = {'Place':[], 'Neighbors':[]}
threshold = 3.0

for lat, lng, place in zip(hyd_neighborhood['Latitude'], hyd_neighborhood['Longitude'], hyd_neighborhood['Neighborhood']):
    neighbours['Place'].append(place)
    neighbours['Neighbors'].append('')
    for lat2, lng2, place2 in zip(hyd_neighborhood['Latitude'], hyd_neighborhood['Longitude'], hyd_neighborhood['Neighborhood']):
        if geopy.distance.distance((lat, lng), (lat2, lng2)) < threshold and place != place2:
            neighbours['Neighbors'][neighbours['Place'].index(place)] += place2+','
neighbours = pd.DataFrame(neighbours)
display(neighbours.head(10))

Unnamed: 0,Place,Neighbors
0,A. S. Rao Nagar,"Kapra,Kushaiguda,Moula-Ali,Neredmet,Neredmet K..."
1,A.C. Guards,"Abids,Aghapura,Asif Nagar,Banjara Hills,Bashee..."
2,Abhyudaya Nagar,"Bairamalguda,Hastinapuram,"
3,Abids,"A.C. Guards,Aghapura,Azampura,Badichowdi,Bagh ..."
4,Adikmet,"Amberpet,Bagh Lingampally,Barkatpura,Chikkadpa..."
5,Aghapura,"A.C. Guards,Abids,Asif Nagar,Badichowdi,Bashee..."
6,Alijah Kotla,"Azampura,Begum Bazaar,Chaderghat,Chanchalguda,..."
7,Allwyn Colony,"Kukatpally,Nizampet,"
8,Alwal,"Karkhana,"
9,Amberpet,"Adikmet,Bagh Lingampally,Barkatpura,Chaderghat..."


#### THE MAIN IDEA OF SOLVING THE PROBLEM
Now lets seperate only the Movie theatres venue category from the venue list. We will divide all the neighborhoods in Hyderabad into 3 clusters depending on the number of movie theatres. Lets take cluster-1 has many theathres, cluster-2 with a few theatres and cluster-3 with very less or no theatres. Clearly we would want to set up our new theatre in the cluster-3. <br/>Now the purpose of the neighboring places of all the places(in the neighbors dataframe) is to further filter the neighborhoods. If the particular neighborhood in cluster-3 is close to another neighborhood which is in cluster-1, there can be a slight competition. On a whole the best place to set up a new theatre would be a place where there are not many theatre both in that particular place and the neighboring places, thus ensuring least competition. 

###### Creating a new dataframe with only the venue categories of movie theatres

In [7]:
theatres = venues[(venues['Venue Category'] == 'Indie Movie Theater') | (venues['Venue Category'] == 'Multiplex') | (venues['Venue Category'] == 'Movie Theater')].reset_index(drop=True)
display(theatres.head(10))
print('Thus the number of theatres are {}'.format(theatres.shape[0]))

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,A.C. Guards,17.402804,78.459487,IMAX Screen,17.413041,78.465618,Multiplex
1,A.C. Guards,17.402804,78.459487,Prasad's IMAX,17.413054,78.465578,Multiplex
2,A.C. Guards,17.402804,78.459487,Prasads Screen 2,17.412897,78.465712,Multiplex
3,A.C. Guards,17.402804,78.459487,Prasads imax,17.413141,78.465634,Multiplex
4,A.C. Guards,17.402804,78.459487,Prasad's Screen 5,17.413044,78.465607,Multiplex
5,A.C. Guards,17.402804,78.459487,Prasads screen 1,17.412967,78.465659,Multiplex
6,Abhyudaya Nagar,17.337661,78.564716,Vijayalakshmi Theatre,17.343858,78.554361,Movie Theater
7,Abhyudaya Nagar,17.337661,78.564716,vanasthalipuram,17.329282,78.575298,Movie Theater
8,Abids,17.389478,78.477182,Inox Maheshwari Paremeshwari,17.390728,78.488352,Multiplex
9,Abids,17.389478,78.477182,Tarakarama Cineplex,17.390854,78.488539,Indie Movie Theater


Thus the number of theatres are 237


##### Lets plot these theatres on the map.

In [8]:
# Latitude and Longitude of Hyderabad
latitude = 17.3850
longitude = 78.4867
    
map_theatres = folium.Map(location=[latitude, longitude], zoom_start=12)
marker_cluster = MarkerCluster().add_to(map_theatres)

for lat, lng, venue in zip(theatres['Venue Latitude'], theatres['Venue Longitude'], theatres['Venue']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(marker_cluster)

map_theatres

##### Making a hot encoding of venues dataframe to use in clustering

In [17]:
# Multiplexes Indie theatres and Movie Theater categories need to be considered as theatres, so giving them a common name.

changes = {'Indie Movie Theater':'Theatre', 'Multiplex':'Theatre', 'Movie Theater':'Theatre'}
venues = venues.replace(changes)
# display(venues)        

venue_encoding = pd.get_dummies(venues['Venue Category'])
#display(venue_encoding)
venue_encoding.insert(loc=0, column='Neighbourhood', value=venues['Neighborhood'])
groups = venue_encoding.groupby('Neighbourhood').mean().reset_index()

# Considering only the theatres
groups = groups[['Neighbourhood','Theatre']]

display(groups.head(10))

Unnamed: 0,Neighbourhood,Theatre
0,A. S. Rao Nagar,0.0
1,A.C. Guards,0.06
2,Abhyudaya Nagar,0.181818
3,Abids,0.025
4,Adikmet,0.15
5,Aghapura,0.0
6,Alijah Kotla,0.0
7,Allwyn Colony,0.0
8,Alwal,0.0
9,Amberpet,0.0


###### Applying KMean Clustering on the data and divinding into 3 clusters. 

In [18]:
clusters = 5

kmeans = KMeans(n_clusters=clusters, random_state=0).fit(groups.iloc[:,1:])
kmeans.labels_[0:10]

groups.insert(loc=1, column='Cluster Labels', value=kmeans.labels_)
groups.rename(columns = {'Neighbourhood':'Neighborhood'}, inplace = True) 
groups_clustered = pd.merge(hyd_neighborhood, groups, on='Neighborhood')
groups_clustered.head(10)

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,Theatre
0,A. S. Rao Nagar,17.4799,78.5568,1,0.0
1,A.C. Guards,17.4028,78.4595,4,0.06
2,Abhyudaya Nagar,17.3377,78.5647,3,0.181818
3,Abids,17.3895,78.4772,1,0.025
4,Adikmet,17.4095,78.5131,0,0.15
5,Aghapura,17.3892,78.4653,1,0.0
6,Alijah Kotla,17.3605,78.4801,1,0.0
7,Allwyn Colony,17.5044,78.415,1,0.0
8,Alwal,17.5022,78.5089,1,0.0
9,Amberpet,17.3903,78.5165,1,0.0


##### Plotting the clusters on the map

In [19]:
# Latitude and Longitude of Hyderabad
latitude = 17.3850
longitude = 78.4867

map_theatres = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lng, venue, cluster, mean in zip(groups_clustered['Latitude'], groups_clustered['Longitude'], groups_clustered['Neighborhood'], groups_clustered['Cluster Labels'], groups_clustered['Theatre']):
    label = '{}, Mean: {}, Cluster:{}'.format(venue, mean, cluster)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7,
        parse_html=False).add_to(map_theatres)

map_theatres

##### As seen, there is a higher concentration of theatres in the central part of the city. Places of cluster  1 have very less or no theaters, so seperating them and then compare with neighbors.
We can see from the map that many belong to cluster-1 with most of their neighbours in cluster-1 too. So lets strictly fix that no place need to have a neighborhood not belonging to cluster-1.

In [34]:
cluster1 = groups_clustered[groups_clustered['Cluster Labels'] == 1]
filtered = list(cluster1['Neighborhood'][:])

for neighborhood in cluster1['Neighborhood']:
    ind = list(neighbours['Place']).index(neighborhood)
    places = neighbours['Neighbors'][ind].split(',')[:-1]
    for place in places:
        if place not in cluster1['Neighborhood']:
            filtered.remove(neighborhood)
            break

final_list = cluster1[cluster1['Neighborhood'].isin(filtered)].reset_index(drop=True)
print("The following places have very less or no theatres, with their neighboring places also having no ")
display(final_list)

The following places have very less or no theatres, with their neighboring places also having no 


Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,Theatre
0,Cherlapally,17.4687,78.6025,1,0.0
1,Langar Houz,17.3828,78.3925,1,0.0
2,Malkajgiri mandal,17.5317,78.5243,1,0.0
3,Patancheru,17.5286,78.2674,1,0.0


#### Therefore the above dataframe has the places where there is a good chance for a new theater to be successfull, as there are no theatres within the place as well as in nearby places.

In [37]:
print('Thus theatres in the following places can be sucessfull')
display(list(final_list['Neighborhood']))

Thus theatres in the following places can be sucessfull


['Cherlapally', 'Langar Houz', 'Malkajgiri mandal', 'Patancheru']