# Helps GUI in clustering the venues and returning the most visited venues

In [1]:
#importing libraries
import requests # library to handle requests
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
# transforming json file into a pandas dataframe library
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans 
import folium # plotting library
import geocoder

#function to get the co-ordinates of the location
def get_location(location):
    """Geocode a place name in the United Kingdom via the ArcGIS geocoder.

    Args:
        location: Place name as a string; ", United Kingdom" is appended to
            it before the lookup.

    Returns:
        The ``latlng`` attribute of the geocoder result.
        NOTE(review): presumably a ``[latitude, longitude]`` pair, and
        empty/None when the lookup fails -- confirm against the geocoder
        package documentation.
    """
    # The previous version also built an unused Nominatim geolocator and
    # overwrote the ``location`` parameter with the geocoder result; both
    # are gone, the lookup itself is unchanged.
    address = str(location + ", United Kingdom")
    result = geocoder.arcgis(address)
    return result.latlng

#function to import data of neighborhoods
def import_dataset(location,
                   path=r'C:/Users/hp/dissertation/Datasets/top25.xlsx',
                   sheet_name='Sheet2'):
    """Load the neighborhoods listed for one city from the Excel workbook.

    Args:
        location: City name; only rows whose ``city`` column equals this
            value are kept.
        path: Workbook to read. Defaults to the original hard-coded file so
            existing callers behave as before.
        sheet_name: Sheet to read from the workbook.

    Returns:
        A new DataFrame with the matching rows and the ``city`` column
        removed. The original row index is preserved.
    """
    all_neighborhoods = pd.read_excel(path, sheet_name=sheet_name)
    city_rows = all_neighborhoods[all_neighborhoods.city == location]
    # Non in-place drop: mutating a boolean-mask slice in place can trigger
    # pandas' SettingWithCopyWarning.
    return city_rows.drop(columns='city')

#function to get co-ordinates of the neighborhoods
def getNeighborhoodLatLong(Neighborhoods,loc):
    """Geocode every neighborhood name within the given city.

    Args:
        Neighborhoods: Iterable of neighborhood names (strings).
        loc: City name appended to each neighborhood as ", <loc>" to build
            the address that is looked up.

    Returns:
        A list with one entry per neighborhood, in input order, holding the
        ``latlng`` attribute of the ArcGIS geocoder result.
        NOTE(review): an entry is presumably empty/None when a lookup
        fails -- confirm against the geocoder package documentation.
    """
    # The Nominatim geolocator that used to be created here was never used;
    # every lookup goes through geocoder.arcgis.
    return [
        geocoder.arcgis(str(neigh + ", {}".format(loc))).latlng
        for neigh in Neighborhoods
    ]

#function to add coordinates to the dataframe
def add_coordinates(loc_neigh,loc_neighborhoods):
    """Attach latitude/longitude columns to the neighborhoods DataFrame.

    Args:
        loc_neigh: Iterable of two-element (latitude, longitude) pairs, one
            per row of ``loc_neighborhoods`` and in the same order.
        loc_neighborhoods: DataFrame to extend; it is modified in place.

    Returns:
        The same ``loc_neighborhoods`` object, now carrying ``Latitude``
        and ``Longitude`` columns.
    """
    # Unpack once so each entry is checked to be an exact (lat, lng) pair.
    pairs = [(lat, lng) for lat, lng in loc_neigh]
    loc_neighborhoods["Latitude"] = [lat for lat, _ in pairs]
    loc_neighborhoods["Longitude"] = [lng for _, lng in pairs]
    return loc_neighborhoods

#function to call API to get most visited venues
def api_call(names, latitudes, longitudes, radius=500, limit=50, api_key=None, timeout=10):
    """Query the Foursquare Places search API around each neighborhood.

    Args:
        names: Iterable of neighborhood names.
        latitudes: Iterable of neighborhood latitudes, aligned with ``names``.
        longitudes: Iterable of neighborhood longitudes, aligned with ``names``.
        radius: Value sent as the ``radius`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        api_key: Value for the ``Authorization`` header. When omitted, the
            key that was historically embedded in this function is used.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with one row per returned venue that has at least one
        category, with columns ``Neighborhood``, ``Neighborhood Latitude``,
        ``Neighborhood Longitude``, ``Venue``, ``Venue Latitude``,
        ``Venue Longitude`` and ``Venue Category``. The frame is empty, but
        still has these columns, when no venue is returned.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    if api_key is None:
        # SECURITY(review): credential embedded in source. It is kept only so
        # existing callers keep working; rotate it and pass ``api_key``
        # explicitly (e.g. loaded from configuration) instead.
        api_key = "fsq3CC6RlfzjF1d+c0TGaiHO5YpQFJa7aZDgdKUzdoldVYc="
    headers = {"Accept": "application/json", "Authorization": api_key}
    columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
               'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # ``params`` URL-encodes the comma in ``ll`` as %2C, producing the
        # same query string the hand-built URL used to have.
        response = requests.get(
            "https://api.foursquare.com/v3/places/search",
            params={"ll": "{},{}".format(lat, lng), "limit": limit, "radius": radius},
            headers=headers,
            timeout=timeout,
        )
        # Surface authentication/quota problems as an HTTP error instead of
        # an unrelated KeyError on a missing 'results' key.
        response.raise_for_status()

        for venue in response.json().get('results', []):
            categories = venue.get('categories') or []
            if not categories:
                # venues without any category cannot be one-hot encoded later
                continue
            main_geocode = venue['geocodes']['main']
            rows.append((name, lat, lng, venue['name'],
                         main_geocode['latitude'], main_geocode['longitude'],
                         categories[0]['name']))

    # Passing ``columns`` to the constructor keeps the schema even when
    # ``rows`` is empty; assigning ``.columns`` afterwards raised ValueError
    # in that case.
    return pd.DataFrame(rows, columns=columns)

#function to check if the neighborhoods are listed in the neighborhood data
def venues_check(loc_neighborhoods,loc_venues):
    """Keep only the neighborhoods that appear in the venues table.

    Args:
        loc_neighborhoods: DataFrame with a ``neighborhood`` column.
        loc_venues: DataFrame with a ``Neighborhood`` column.

    Returns:
        The rows of ``loc_neighborhoods`` whose ``neighborhood`` value
        occurs in ``loc_venues["Neighborhood"]``; the row index is kept.
    """
    has_venues = loc_neighborhoods["neighborhood"].isin(loc_venues["Neighborhood"])
    return loc_neighborhoods.loc[has_venues]

#function to apply one hot encoding to the dataset 
def onehot(loc_venues):
    """One-hot encode venue categories and average them per neighborhood.

    Args:
        loc_venues: DataFrame with ``Venue Category`` and ``Neighborhood``
            columns, one row per venue.

    Returns:
        A DataFrame with one row per neighborhood: a leading
        ``Neighborhood`` column followed by one column per venue category
        holding the mean of that category's indicator over the
        neighborhood's venues.
    """
    category_flags = pd.get_dummies(
        loc_venues[['Venue Category']], prefix="", prefix_sep=""
    )
    # put the neighborhood label back next to the indicator columns
    labelled = category_flags.assign(Neighborhood=loc_venues['Neighborhood'])
    return labelled.groupby('Neighborhood').mean().reset_index()

#function to return most common venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    Args:
        row: pandas Series whose first entry is skipped and whose remaining
            entries are ranked by value.
        num_top_venues: Number of labels to return.

    Returns:
        An array with the index labels of the ``num_top_venues`` largest
        remaining entries, in descending order of value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#function to append the data into the dataframe
def most_common(loc_grouped, num_top_venues=10):
    """Build a table of the most common venue categories per neighborhood.

    Args:
        loc_grouped: DataFrame whose first column is ``Neighborhood`` and
            whose remaining columns hold one score per venue category.
        num_top_venues: Number of top categories to report per neighborhood.

    Returns:
        A DataFrame indexed like ``loc_grouped`` with a ``Neighborhood``
        column followed by ``1st Most Common Venue``, ``2nd Most Common
        Venue``, ... columns. When fewer than ``num_top_venues`` categories
        exist, the trailing columns are filled with NaN.
    """
    def _ordinal(n):
        # 11th-13th (and 111th...) are irregular; otherwise use the last digit.
        if 10 <= n % 100 <= 20:
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
        return '{}{}'.format(n, suffix)

    columns = ['Neighborhood'] + [
        '{} Most Common Venue'.format(_ordinal(rank))
        for rank in range(1, num_top_venues + 1)
    ]

    rows = []
    for ind in range(loc_grouped.shape[0]):
        row = loc_grouped.iloc[ind, :]
        top = list(return_most_common_venues(row, num_top_venues))
        # Pad short rows: assigning fewer values than columns used to raise.
        top.extend([np.nan] * (num_top_venues - len(top)))
        rows.append([row['Neighborhood']] + top)

    return pd.DataFrame(rows, columns=columns, index=loc_grouped.index)


#function to apply kmeans clustering on the dataset
def model(loc_grouped,neighborhoods_venues_sorted,loc_neighborhoods,kclusters=4):
    """Cluster neighborhoods with k-means and attach the labels.

    Args:
        loc_grouped: DataFrame with a ``Neighborhood`` column plus one
            numeric feature column per venue category.
        neighborhoods_venues_sorted: DataFrame with a ``Neighborhood``
            column. A ``Cluster Labels`` column is written into it IN PLACE.
            NOTE(review): its rows are assumed to be in the same order as
            those of ``loc_grouped`` -- labels are assigned positionally.
        loc_neighborhoods: DataFrame with a ``neighborhood`` column.
        kclusters: Requested number of clusters; capped at the number of
            rows in ``loc_grouped``.

    Returns:
        ``loc_neighborhoods`` joined (on ``neighborhood``) with the cluster
        labels and the columns of ``neighborhoods_venues_sorted``.

    Raises:
        ValueError: If ``loc_grouped`` has no rows.
    """
    # ``axis`` may no longer be passed positionally to drop() in pandas 2.x.
    features = loc_grouped.drop(columns='Neighborhood')
    if features.shape[0] == 0:
        raise ValueError("cannot cluster: no neighborhoods with venue data")

    # k-means cannot fit more clusters than there are samples
    n_clusters = min(kclusters, features.shape[0])
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(features)

    # Overwrite rather than insert when the column already exists, so a
    # second call with the same frame does not fail on a duplicate column.
    if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
        neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
    else:
        neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

    # add cluster label and top venues to each neighborhood row
    return loc_neighborhoods.join(
        neighborhoods_venues_sorted.set_index('Neighborhood'), on='neighborhood'
    )

#function to call all previous functions together
def main_function(location):
    """Run the full pipeline for one city and return clustered neighborhoods.

    Steps: load the city's neighborhoods, geocode them, fetch nearby venues,
    one-hot encode venue categories, rank the most common categories and
    cluster the neighborhoods.

    Args:
        location: City name, matched against the dataset's ``city`` column.

    Returns:
        The DataFrame produced by ``model``: one row per neighborhood with
        coordinates, cluster label and most-common-venue columns.

    Raises:
        ValueError: If the dataset has no neighborhoods for ``location``.
    """
    # The city itself used to be geocoded here via get_location(), but the
    # result was never used; that extra network round-trip is dropped.
    loc_neighborhoods = import_dataset(location)
    if loc_neighborhoods.empty:
        # Fail early with a clear message instead of an obscure error later.
        raise ValueError("no neighborhoods found for location: {}".format(location))

    coordinates = getNeighborhoodLatLong(Neighborhoods=loc_neighborhoods["neighborhood"], loc=location)
    loc_neighborhoods = add_coordinates(coordinates, loc_neighborhoods)
    loc_venues = api_call(names=loc_neighborhoods['neighborhood'],
                          latitudes=loc_neighborhoods['Latitude'],
                          longitudes=loc_neighborhoods['Longitude'])
    # drop neighborhoods for which the API returned no venues
    loc_neighborhoods = venues_check(loc_neighborhoods, loc_venues)
    loc_grouped = onehot(loc_venues)
    neighborhoods_venues_sorted = most_common(loc_grouped)
    return model(loc_grouped, neighborhoods_venues_sorted, loc_neighborhoods)

#function to return the most common venue of particular cluster
def most_venues(df):
    """Return the most frequent "1st Most Common Venue" across neighborhoods.

    Each neighborhood contributes one entry: its distinct first-ranked
    venue categories joined with ",". Rows with a missing venue are ignored.

    Args:
        df: DataFrame with ``neighborhood`` and ``1st Most Common Venue``
            columns.

    Returns:
        The entry that occurs for the most neighborhoods (on a tie, the one
        met first in neighborhood-sorted order), or None when there is
        nothing to count.
    """
    from collections import Counter  # local import keeps the block self-contained

    column = "1st Most Common Venue"
    # NaN venues would make ','.join fail with a TypeError
    usable = df.dropna(subset=[column])
    if usable.empty:
        return None

    per_neighborhood = usable.groupby("neighborhood")[column].apply(
        lambda venues: ','.join(venues.unique())
    )
    if per_neighborhood.empty:
        return None

    # Counter replaces the hand-rolled tally, which started each count at 0
    # and was preceded by a rename() whose result was thrown away.
    return Counter(per_neighborhood).most_common(1)[0][0]

