## Task: Choose 20 optimal locations around the world for placing advertising banners near the company's offices.

In [1]:
import pandas as pd
from sklearn.cluster import MeanShift
import numpy as np

Data: [Foursquare Dataset](https://archive.org/details/201309_foursquare_dataset_umn)

In [2]:
# reading the dataset
#
# The file is pipe-delimited. skipinitialspace=True drops the padding that
# follows each delimiter, and skiprows=[1] skips the line directly below the
# header (presumably a separator rule -- confirm against the raw file).
checkins0 = pd.read_csv('checkins.dat', sep='|', skipinitialspace=True, low_memory=False, skiprows=[1])

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() -- doing so emits a stray "None" line after the summary.
checkins0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1021967 entries, 0 to 1021966
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1021967 non-null  object 
 1   user_id           1021966 non-null  float64
 2   venue_id          1021966 non-null  float64
 3   latitude          396634 non-null   float64
 4   longitude         396634 non-null   float64
 5   created_at        1021966 non-null  object 
dtypes: float64(4), object(2)
memory usage: 46.8+ MB
None


In [3]:
# Preview the first ten raw rows, before any cleaning is applied.
checkins0.head(10)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984301,2041916.0,5222.0,,,2012-04-21 17:39:01
1,984222,15824.0,5222.0,38.895112,-77.036366,2012-04-21 17:43:47
2,984315,1764391.0,5222.0,,,2012-04-21 17:37:18
3,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
4,984249,2146840.0,5222.0,,,2012-04-21 17:42:58
5,984268,2146843.0,5222.0,,,2012-04-21 17:42:38
6,984281,2146846.0,5222.0,,,2012-04-21 17:39:40
7,984291,105054.0,5222.0,45.523452,-122.676207,2012-04-21 17:39:22
8,6651,1338710.0,219703.0,,,2011-12-08 23:11:23
9,984318,2146539.0,5222.0,40.764462,-111.904565,2012-04-21 17:35:46


In [4]:
# Size of the raw table (rows with missing values are still included here).
print('Number of rows:', len(checkins0))

Number of rows: 1021967


In [5]:
# Keep only complete records: a row is discarded as soon as any one of its
# fields is NaN.
checkins = checkins0.dropna(axis=0, how='any')
checkins.head(10)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
1,984222,15824.0,5222.0,38.895112,-77.036366,2012-04-21 17:43:47
3,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
7,984291,105054.0,5222.0,45.523452,-122.676207,2012-04-21 17:39:22
9,984318,2146539.0,5222.0,40.764462,-111.904565,2012-04-21 17:35:46
10,984232,93870.0,380645.0,33.448377,-112.074037,2012-04-21 17:38:18
11,984483,1030290.0,955969.0,32.221743,-110.926479,2012-04-21 17:58:54
12,984685,304253.0,23558.0,40.65,-73.95,2012-04-21 18:19:34
13,984470,720850.0,749715.0,33.448377,-112.074037,2012-04-21 17:02:47
15,984610,1639666.0,442605.0,33.414768,-111.909309,2012-04-21 18:04:58
18,984653,1647192.0,23558.0,42.358431,-71.059773,2012-04-21 18:23:22


In [6]:
# Remove leading/trailing whitespace from every column label so the columns
# can be addressed by their plain names.
checkins = checkins.rename(columns=str.strip)


print('Number of rows:', len(checkins))

Number of rows: 396634


#### Finding tourist congestion centers using coordinates (clustering problem)

In [7]:
# Clustering only needs the two coordinate columns.
data = checkins.loc[:, ['latitude', 'longitude']]

print(data)

          latitude   longitude
1        38.895112  -77.036366
3        33.800745  -84.410520
7        45.523452 -122.676207
9        40.764462 -111.904565
10       33.448377 -112.074037
...            ...         ...
1021959  40.850100  -73.866246
1021960  33.748995  -84.387982
1021961  42.765366  -71.467566
1021962  42.439479  -83.743830
1021964  42.331427  -83.045754

[396634 rows x 2 columns]


In [8]:
# Restrict clustering to the first 100000 coordinate rows (a positional
# slice, not a random sample).
data_sample = data.iloc[:100000]
print(data_sample)

         latitude   longitude
1       38.895112  -77.036366
3       33.800745  -84.410520
7       45.523452 -122.676207
9       40.764462 -111.904565
10      33.448377 -112.074037
...           ...         ...
233788  33.575000 -117.725556
233789  37.629349 -122.400087
233793  29.762884  -95.383061
233797  32.802955  -96.769923
233798  37.774929 -122.419415

[100000 rows x 2 columns]


In [9]:
%%time

# clustering using MeanShift algorithm
#
# The bandwidth is expressed in the same units as the input features, i.e.
# the raw latitude/longitude values of `data_sample`. All other estimator
# options are left at their defaults.
# NOTE: fitting is the expensive step of this notebook (the recorded run
# took several minutes).

ms = MeanShift(bandwidth = 0.1)
ms.fit(data_sample)

CPU times: user 6min 37s, sys: 864 ms, total: 6min 37s
Wall time: 6min 39s


MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [10]:
# Coordinates of the cluster centers found by the fitted MeanShift model;
# each row holds one center in the same (latitude, longitude) column order
# as the data it was fitted on.
cluster_centers = ms.cluster_centers_

print(cluster_centers)

[[  40.7177164   -73.99183542]
 [  33.44943805 -112.00213969]
 [  33.44638027 -111.90188756]
 ...
 [ -37.8229826   145.1811902 ]
 [ -41.2924945   174.7732353 ]
 [ -45.0311622   168.6626435 ]]


In [11]:
# How many clusters MeanShift produced (one row per center).
print(cluster_centers.shape[0])

3231


In [63]:
# counting items per cluster
#
# np.unique returns the distinct labels in ascending label order together
# with the number of samples assigned to each one. Nothing is sorted by
# count here.
labels_unique, counts = np.unique(ms.labels_, return_counts=True)

print(labels_unique)
print(counts)
print()

# Preview the first (at most) ten clusters. Iterating over slices instead of
# indexing with range(10) keeps the cell from raising IndexError when fewer
# than ten clusters exist.
for label, n_items in zip(labels_unique[:10], counts[:10]):
    print('Cluster ', label, ': ', n_items, ' items', sep='')

[   0    1    2 ... 3228 3229 3230]
[12506  4692  3994 ...     1     1     1]

Cluster 0: 12506 items
Cluster 1: 4692 items
Cluster 2: 3994 items
Cluster 3: 3363 items
Cluster 4: 3526 items
Cluster 5: 2409 items
Cluster 6: 2297 items
Cluster 7: 1601 items
Cluster 8: 1526 items
Cluster 9: 1378 items


etc.

In [64]:
# Discard sparsely populated clusters: a label is kept only when strictly
# more than 15 samples were assigned to it.
is_populated = counts > 15
labels_unique_morethan = list(labels_unique[is_populated])

print('Number of useful clasters:', len(labels_unique_morethan))
print()
print(labels_unique_morethan)

Number of useful clasters: 592

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 21

In [67]:
# Collect the center coordinates of the retained clusters, one row per kept
# label, in the same order as `labels_unique_morethan`.
kept_labels = np.asarray(labels_unique_morethan, dtype=int)
centers_useful = cluster_centers[kept_labels].astype(float)

print(centers_useful)

[[  40.7177164   -73.99183542]
 [  33.44943805 -112.00213969]
 [  33.44638027 -111.90188756]
 ...
 [  41.61853175  -88.44556818]
 [  39.2494686   -77.1821271 ]
 [  38.65877915  -76.8856871 ]]


#### Coordinates of offices:


33.751277, -118.188740 (Los Angeles)

25.867736, -80.324116 (Miami)

51.503016, -0.075479 (London)

52.378894, 4.885084 (Amsterdam)

39.366487, 117.036146 (Beijing)

-33.868457, 151.205134 (Sydney)

![offices_on_map.png](https://raw.githubusercontent.com/Lenferdetroud/ipython-notebooks/master/banner%20location/offices_on_map.png)

#### Search for the 20 cluster centers closest to an office (for each cluster center, compute the distance to its nearest office)

In [68]:
# Load the office locations and keep just the two coordinate columns, in
# (latitude, longitude) order to match the cluster centers.
offices = pd.read_csv('offices_coordinates.txt', skipinitialspace=True)
office_coordinates = offices[['latitude', 'longitude']]

office_coordinates

Unnamed: 0,latitude,longitude
0,33.751277,-118.18874
1,25.867736,-80.324116
2,51.503016,-0.075479
3,52.378894,4.885084
4,39.366487,117.036146
5,-33.868457,151.205134


In [75]:
def distance(x, y):
    """Return the Euclidean (L2) distance between two coordinate vectors.

    Parameters
    ----------
    x, y : array-like supporting element-wise subtraction
        Points with the same number of components (e.g. NumPy arrays or
        pandas Series).

    Returns
    -------
    float
        Square root of the sum of squared component differences. The value
        is in the same units as the inputs; no geodesic correction is made.
    """
    delta = x - y
    squared_length = np.sum(delta * delta)
    return np.sqrt(squared_length)


# calculating distances
#
# For every useful cluster center, find the Euclidean distance (in raw
# latitude/longitude units) to its nearest office. The work is done with
# NumPy broadcasting instead of a nested Python loop with a pandas row lookup
# per (center, office) pair. Offices are addressed by position, so the result
# does not depend on the DataFrame having a default 0..n-1 index.

office_xy = office_coordinates.to_numpy(dtype=float)  # (n_offices, 2)

# offsets[i, j] is the coordinate difference between center i and office j.
offsets = centers_useful[:, np.newaxis, :] - office_xy[np.newaxis, :, :]
center_to_office_d = np.sqrt((offsets ** 2).sum(axis=2))  # (n_centers, n_offices)

# Distance from each center to its closest office.
nearest_office_d = center_to_office_d.min(axis=1)


# sorting
#
# Order the centers by ascending distance to their nearest office.

dist_sorted_inds = np.argsort(nearest_office_d)
nearest_office_d_sorted = nearest_office_d[dist_sorted_inds]
centers_useful_sorted = centers_useful[dist_sorted_inds]


print('Minimal distance:', nearest_office_d_sorted[0])
print('The closest center:', centers_useful_sorted[0])

Minimal distance: 0.007834758163109155
The closest center: [-33.86063043 151.20477593]


In [76]:
# Print the 20 cluster centers closest to an office. Slicing (rather than
# indexing with range(20)) avoids an IndexError when fewer than 20 centers
# survive the filtering step.
for center in centers_useful_sorted[:20]:
    print(center)

[-33.86063043 151.20477593]
[52.37296399  4.89231722]
[ 25.84567226 -80.3188906 ]
[51.50299126 -0.12553729]
[  33.80987796 -118.14892381]
[ 25.78581242 -80.21793804]
[ 25.70534972 -80.28342874]
[ 26.01009825 -80.19999059]
[  33.88832534 -118.04892817]
[  33.87298601 -118.36209115]
[  33.97257482 -118.16837067]
[ 26.13884379 -80.33434684]
[  33.98393587 -118.00740497]
[ 26.12086266 -80.15890668]
[  33.81730643 -117.89124917]
[  34.06039755 -118.24870903]
[  33.67430266 -117.85878927]
[ 26.20058464 -80.25071613]
[  34.03548695 -118.43899772]
[  34.13146015 -118.11801181]


![world_clusters.png](https://raw.githubusercontent.com/Lenferdetroud/ipython-notebooks/master/banner%20location/world_clusters.png)
![miami_clusters.png](https://raw.githubusercontent.com/Lenferdetroud/ipython-notebooks/master/banner%20location/miami_clusters.png)
![LA_clusters.png](https://raw.githubusercontent.com/Lenferdetroud/ipython-notebooks/master/banner%20location/LA_clusters.png)