In [1]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
from scipy.spatial import distance

In [2]:
def write_answer(ans, file_name):
    """Save the string form of ``ans`` to ``file_name``.

    The file is opened in write mode, so any previous content is replaced.
    No trailing newline is added.
    """
    with open(file_name, "w") as answer_file:
        # print() applies str() to ``ans``; end="" keeps the file free of a newline.
        print(ans, end="", file=answer_file)

In [3]:
# Load the pipe-separated check-in table and keep only rows without missing values.
data = (
    pd.read_csv('checkins.dat', sep='|', header=0, skipinitialspace=True)
    .dropna()
)

  interactivity=interactivity, compiler=compiler, result=result)


### Чистим данные:

1. Оставляем только координаты
2. Оставляем первые 100000 объектов (иначе долго)

In [4]:
# Show the raw column names; they keep the padding spaces from the file header.
data.columns.tolist()

['id    ',
 'user_id ',
 'venue_id ',
 'latitude      ',
 'longitude     ',
 'created_at      ']

In [5]:
# Columns to discard: everything except the two coordinate columns
# (names include the trailing padding present in the file header).
trash = set(data.columns) - {'latitude      ', 'longitude     '}
trash

{'created_at      ', 'id    ', 'user_id ', 'venue_id '}

In [6]:
# Drop the non-coordinate columns, then keep only the first 100000 rows.
shorted_data = data.drop(columns=list(trash)).head(100000)
shorted_data

Unnamed: 0,latitude,longitude
2,38.895112,-77.036366
4,33.800745,-84.410520
8,45.523452,-122.676207
10,40.764462,-111.904565
11,33.448377,-112.074037
...,...,...
233789,33.575000,-117.725556
233790,37.629349,-122.400087
233794,29.762884,-95.383061
233798,32.802955,-96.769923


### Используем MeanShift для кластеризации

In [7]:
from sklearn.cluster import MeanShift, KMeans

In [8]:
# Cluster the coordinates with MeanShift (bandwidth 0.1, all CPU cores).
# NOTE(review): this fit appears to be slow on 100000 rows — the recorded run
# of this cell ended in a KeyboardInterrupt.
estimator = MeanShift(bandwidth=0.1, n_jobs=-1).fit(shorted_data)
estimator

KeyboardInterrupt: 

In [None]:
# One cluster label per row of the fitted data.
result = estimator.labels_
print ('Amount of clusters:')
# Labels are 0-based, so the number of clusters is the largest label plus one.
print (result.max() + 1)

### Просто координаты

In [None]:
# Raw points: latitude on the x axis, longitude on the y axis.
plt.scatter(x=shorted_data['latitude      '], y=shorted_data['longitude     '])

### После кластеризации

In [None]:
# Same scatter as before, with each point coloured by its cluster label.
plt.scatter(
    x=shorted_data['latitude      '],
    y=shorted_data['longitude     '],
    c=result,
)

### Исключим кластеры, в которых объектов <= 15

In [None]:
# Keep `result` as a plain list, as later cells may rely on that form.
result = list(result)

# Count the members of every cluster in a single pass. np.unique returns the
# distinct labels in ascending order together with their counts, so the
# highest-numbered cluster is included (range(max(result)) would skip it) and
# there is no repeated list.count() scan per label.
cluster_labels, cluster_sizes = np.unique(result, return_counts=True)

# Mapping: cluster label -> number of points, only for clusters with more than 15 points.
amount = {
    int(label): int(size)
    for label, size in zip(cluster_labels, cluster_sizes)
    if size > 15
}

### Берем центры кластеров, удовлетворяющих верхнему условию

In [None]:
# Select the center rows of the clusters that passed the size filter
# (iterating a dict yields its keys, i.e. the cluster labels).
centers = estimator.cluster_centers_[list(amount)]

In [None]:
# Office coordinates, one [first, second] pair per office.
# NOTE(review): presumably [latitude, longitude], matching the column order of
# the clustered data — confirm.
offices = [
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
]

In [None]:
# Mapping: distance from a cluster center to its nearest office -> that center's coordinates.
# The distance is plain Euclidean distance on the raw coordinate values.
# NOTE(review): two centers with exactly the same nearest-office distance would
# share a key, and the later one would overwrite the earlier one.
distances = {
    min(distance.euclidean(center, office) for office in offices): center
    for center in centers
}

In [None]:
# Sorting a dict sorts its keys (the distances); the first one is the smallest,
# so this picks the center that lies closest to any office.
best_center = distances[sorted(distances)[0]]

In [None]:
# Print the caption and the chosen center on separate lines.
print("лучший центр:", best_center, sep="\n")

In [None]:
# Build the answer from the computed center rather than a pasted literal, so
# the saved file always matches the analysis above. The two coordinates are
# written space-separated with 8 decimal places.
write_answer("%.8f %.8f" % tuple(best_center), "clusters_ans")