In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

### Определим географические границы для модели

In [81]:
with open('./data/adm.geojson', 'r') as f:
    adm = json.load(f)
    
adm = pd.DataFrame(adm['features']).drop('type', axis = 1)
boundries = np.array(adm.geometry[0]['coordinates']).reshape(-1,2)

In [32]:
boundries

array([[131.840454,  43.07632 ],
       [131.842182,  43.0817  ],
       [131.846808,  43.084828],
       ...,
       [131.842308,  43.075099],
       [131.843305,  43.072986],
       [131.840454,  43.07632 ]])

In [4]:
lat_bounries = (boundries[:,1].min(), boundries[:,1].max())
long_bounries = (boundries[:,0].min(), boundries[:,0].max())

## Формируем Датасет Buildings из geojson

In [45]:
with open('./data/buildings.geojson', 'r') as f:
    bui = json.load(f)
    
bui = pd.DataFrame(bui['features']).drop('type', axis = 1)
bui.head()

Unnamed: 0,properties,geometry
0,"{'fid': 1, 'osm_id': '81205362', 'addr:housenu...","{'type': 'MultiPolygon', 'coordinates': [[[[13..."
1,"{'fid': 2, 'osm_id': '81368179', 'addr:housenu...","{'type': 'MultiPolygon', 'coordinates': [[[[13..."
2,"{'fid': 3, 'osm_id': '81370390', 'addr:housenu...","{'type': 'MultiPolygon', 'coordinates': [[[[13..."
3,"{'fid': 4, 'osm_id': '82082664', 'addr:housenu...","{'type': 'MultiPolygon', 'coordinates': [[[[13..."
4,"{'fid': 5, 'osm_id': '103242384', 'addr:housen...","{'type': 'MultiPolygon', 'coordinates': [[[[13..."


In [46]:
# достаем данные из properties
for key in bui.properties[0].keys():
    bui[key] = bui.properties.apply(lambda x: x[key])
    
# достаем данные из geometry
for key in bui.geometry[0].keys():
    bui[key] = bui.geometry.apply(lambda x: x[key])
    
# объединим адресные признаки в один адрес
bui['address'] = bui['addr:postcode'] + [' '] + bui['addr:street'] + [' '] + bui['addr:housenumber']

In [47]:
bui.head(3)

Unnamed: 0,properties,geometry,fid,osm_id,addr:housenumber,addr:postcode,addr:street,building,addr:housename,amenity_place,floors,population,type,coordinates,address
0,"{'fid': 1, 'osm_id': '81205362', 'addr:housenu...","{'type': 'MultiPolygon', 'coordinates': [[[[13...",1,81205362,37А,690037.0,улица Адмирала Юмашева,yes,,marketplace,2,,MultiPolygon,"[[[[131.9576618, 43.1252596], [131.9576572, 43...",690037 улица Адмирала Юмашева 37А
1,"{'fid': 2, 'osm_id': '81368179', 'addr:housenu...","{'type': 'MultiPolygon', 'coordinates': [[[[13...",2,81368179,,,,yes,,marketplace,1,,MultiPolygon,"[[[[131.9581768, 43.1254291], [131.9581886, 43...",
2,"{'fid': 3, 'osm_id': '81370390', 'addr:housenu...","{'type': 'MultiPolygon', 'coordinates': [[[[13...",3,81370390,125,,улица Нейбута,office,,marketplace,3,,MultiPolygon,"[[[[131.9637168, 43.1231396], [131.9638559, 43...",улица Нейбута 125


In [48]:
# подправим 3 изначально кривые записи

for i in [2278,8410,8539]:
    bui.coordinates[i] = np.vstack(bui.coordinates[i][0])
    
# преобразуем многомерный список координат в удобный для работы 2мерный

bui.coordinates = bui.coordinates.apply(lambda x: np.array(list(zip(np.array(x).flatten()[1::2],
np.array(x).flatten()[0::2]))))

# определим 'средние' координаты каждого строения по списку координат полигона

bui['lat'] = bui.coordinates.apply(lambda x: x[:,0].astype(float).mean())
bui['long'] = bui.coordinates.apply(lambda x: x[:,1].astype(float).mean())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bui.coordinates[i] = np.vstack(bui.coordinates[i][0])


In [53]:
# убираем строения выходящие за границы

bui = bui[(bui.lat >= lat_bounries[0]) & (bui.lat <= lat_bounries[1]) &\
    (bui.long >= long_bounries[0]) & (bui.long <= long_bounries[1])]

In [54]:
# итоговый датасет с домами и населением Владивостока

bui = bui[['amenity_place', 'floors', 'population', 'address', 'lat', 'long']]
bui

Unnamed: 0,amenity_place,floors,population,address,lat,long
0,marketplace,2,,690037 улица Адмирала Юмашева 37А,43.125287,131.957594
1,marketplace,1,,,43.125417,131.958202
2,marketplace,3,,улица Нейбута 125,43.123237,131.963757
3,marketplace,1,,,43.125295,131.957713
4,marketplace,2,,улица Сабанеева 22А,43.128502,131.956351
...,...,...,...,...,...,...
60897,,,,улица Ключ 4-й 17,43.281678,132.077552
60898,,,,улица Ключ 4-й 10А,43.280893,132.078094
60899,,2,,улица Ключ 4-й 2,43.281579,132.075125
60900,,,,улица Ключ 4-й 5А,43.281906,132.075485


In [337]:
bui.to_csv('_buildings.csv', index = False)

## Формируем Датасет Points of Interest из geojson

In [56]:
with open('./data/POI.geojson', 'r') as f:
    data = json.load(f)
    
poi = pd.DataFrame(data['features']).drop('type', axis = 1)
poi.head()

Unnamed: 0,properties,geometry
0,"{'osm_id': '261715276', 'amenity': 'bus_statio...","{'type': 'Point', 'coordinates': [131.9056239,..."
1,"{'osm_id': '349352422', 'amenity': 'customs', ...","{'type': 'Point', 'coordinates': [131.867301, ..."
2,"{'osm_id': '440605593', 'amenity': 'atm', 'nam...","{'type': 'Point', 'coordinates': [131.8792176,..."
3,"{'osm_id': '441700945', 'amenity': 'cinema', '...","{'type': 'Point', 'coordinates': [131.8987251,..."
4,"{'osm_id': '441729872', 'amenity': 'ferry_term...","{'type': 'Point', 'coordinates': [131.8841144,..."


In [57]:
# достаем данные из properties

for key in poi.properties[0].keys():
    poi[key] = poi.properties.apply(lambda x: x[key])
    
# достаем данные из geometry

for key in poi.geometry[0].keys():
    poi[key] = poi.geometry.apply(lambda x: x[key])
    
poi = poi.drop(['type', 'properties', 'geometry'], axis =1)
poi['lat'] = poi.coordinates.apply(lambda x: x[1])
poi['long'] = poi.coordinates.apply(lambda x: x[0])

In [58]:
poi = poi[['lat', 'long', 'amenity', 'name', 'description', 'operator', 'brand', 'brand:wikipedia']]
poi.head()

Unnamed: 0,lat,long,amenity,name,description,operator,brand,brand:wikipedia
0,43.164242,131.905624,bus_station,Автовокзал,,,,
1,43.096901,131.867301,customs,Vladivostok customs,Владивостокский таможенный пост,,,
2,43.116883,131.879218,atm,,,МДМ Банк,,
3,43.133078,131.898725,cinema,Москва,,,,
4,43.112196,131.884114,ferry_terminal,Вокзал прибрежных сообщений,,,,


## Посмотрим, что можно использовать и добавить в данные

In [59]:
poi.amenity.value_counts()

atm                   250
cafe                  224
bench                 222
pharmacy              186
waste_disposal        140
bank                  131
fast_food             126
waste_basket          126
car_wash               82
fuel                   79
toilets                76
restaurant             66
dentist                65
post_office            54
doctors                41
bar                    34
hospital               29
clinic                 28
library                20
community_centre       18
post_box               16
police                 15
vending_machine        14
kindergarten           14
pub                    13
veterinary             13
fountain               11
driving_school          9
school                  9
nightclub               8
car_rental              8
place_of_worship        8
parking_entrance        8
drinking_water          8
vehicle_ramp            8
recycling               7
bbq                     6
telephone               6
shelter     

In [60]:
# объединим категории объектов по смыслу

medics = ['doctors', 'dentist', 'hospital', 'clinic']
education = ['library', 'kindergarten', 'school', 'driving_school', 'music_school']
facility = ['waste_basket', 'bench', 'waste_disposal', 'toilets', 'bbq', 'drinking_water', 'recycling', 'shelter']
heduction = ['university', 'college']
alco = ['bar', 'pub']
goverment = ['customs', 'embassy', 'townhall', 'fire_station', 'courthouse', 'public_building', 'police']
culture = ['cinema', 'arts_centre', 'theatre']

In [61]:
def amenity(value):
    if value in medics: return 'medics'
    elif value in education: return 'education'
    elif value in facility: return 'facility'
    elif value in heduction: return 'heduction'
    elif value in alco: return 'alco'
    elif value in goverment: return 'goverment'
    elif value in culture: return 'culture'
    else: return value

In [62]:
poi.amenity = poi.amenity.apply(amenity)

In [63]:
# уберем из POI редкие типы точек, влияние которых мы никак не сможем оценить из-за малого количества

top_poi = poi.amenity.value_counts()[poi.amenity.value_counts() > 3].index.tolist()
top_poi

['facility',
 'atm',
 'cafe',
 'pharmacy',
 'medics',
 'bank',
 'fast_food',
 'car_wash',
 'fuel',
 'restaurant',
 'post_office',
 'education',
 'alco',
 'goverment',
 'community_centre',
 'post_box',
 'vending_machine',
 'veterinary',
 'fountain',
 'nightclub',
 'culture',
 'place_of_worship',
 'car_rental',
 'vehicle_ramp',
 'parking_entrance',
 'telephone',
 'heduction',
 'boat_storage']

In [65]:
poi = poi[poi['amenity'].isin(top_poi)]
poi.shape

(2217, 8)

## Добавим спарсенные данные по службам доставки, постоматам и магазинам

In [66]:
# SDEK

sdek_adr = pd.read_csv('./data/parsed_sdek_adr.csv')
sdek_adr = sdek_adr[['displayLatitude','displayLongitude']]

In [67]:
sdek_adr.columns = ['lat', 'long']

sdek_adr['amenity'] = 'sdek'
sdek_adr['name'] = 'sdek'
sdek_adr

Unnamed: 0,lat,long,amenity,name
0,43.14638,131.93123,sdek,sdek
1,43.12824,131.89668,sdek,sdek
2,43.10026,131.93005,sdek,sdek
3,43.13618,131.92991,sdek,sdek
4,43.09581,131.97493,sdek,sdek
5,43.12748,131.92319,sdek,sdek
6,43.15151,131.90988,sdek,sdek
7,43.10333,131.89748,sdek,sdek
8,43.17431,131.91054,sdek,sdek
9,43.12896,131.93323,sdek,sdek


In [68]:
# Постаматы и ПВЗ

postamats = pd.read_csv('./data/parsed_postamats.csv')

postamats['amenity'] = postamats['PT_Name'].apply(lambda x: x.split(': ')[0])
postamats['name'] = postamats['PT_Name'].apply(lambda x: x.split(': ')[1])

postamats = postamats[['latitude','longitude','amenity','name']]
postamats.columns = ['lat','long','amenity','name']

In [69]:
postamats.amenity = postamats.amenity.replace({'Постамат': 'postamat', 'ПВЗ': 'pvz'})
postamats.lat = postamats.lat.apply(lambda x: x.replace(',', '.'))
postamats.long = postamats.long.apply(lambda x: x.replace(',', '.'))
postamats

Unnamed: 0,lat,long,amenity,name
0,43.119228,131.921238,postamat,QIWI
1,43.129519,131.893219,postamat,QIWI
2,43.149202,131.909916,postamat,QIWI
3,43.158051,131.918594,postamat,PickPoint
4,43.126597,131.949569,postamat,QIWI
5,43.089305,131.860862,postamat,QIWI
6,43.212708,131.951079,postamat,QIWI
7,43.114214,131.959787,postamat,QIWI
8,43.124171,131.905115,postamat,QIWI
9,43.217021,131.956138,postamat,QIWI


In [76]:
# Магазины Владивостока

grocery = pd.read_csv('./data/grocery.csv')

In [77]:
grocery = grocery[['lat', 'long', 'amenity', 'name', 'description', 'operator', 'brand', 'brand:wikipedia']]
grocery.amenity = grocery.amenity.apply(lambda x: 'grocery')
grocery

Unnamed: 0,lat,long,amenity,name,description,operator,brand,brand:wikipedia
0,43.151530,131.955804,grocery,Светофор,Светофор,,магазин,
1,43.127140,131.906364,grocery,Реми,Реми,,сеть продовольственных супермаркетов,
2,43.097485,131.958781,grocery,Реми,Реми,,сеть продовольственных супермаркетов,
3,43.164010,131.920743,grocery,Реми,Реми,,сеть продовольственных супермаркетов,
4,43.089582,131.860868,grocery,Самбери,Самбери,,сеть гипермаркетов,
...,...,...,...,...,...,...,...,...
226,43.083021,131.853045,grocery,Минимаркет,,Минимаркет,,
227,43.176588,131.917087,grocery,Моя мечта,Моя мечта,,мини-маркет,
228,43.178099,131.931135,grocery,Мини-маркет,,Мини-маркет,,
229,43.188284,131.914210,grocery,Эклетика,Эклетика,,мини-маркет,


### Объединим датасеты в один

In [78]:
total_poi = pd.concat((poi, grocery, sdek_adr, postamats), axis = 0, ignore_index = True)

In [79]:
total_poi.lat = total_poi.lat.astype(float)
total_poi.long = total_poi.long.astype(float)

In [80]:
# границы 

total_poi = total_poi[(total_poi.lat >= lat_bounries[0]) & (total_poi.lat <= lat_bounries[1]) &\
    (total_poi.long >= long_bounries[0]) & (total_poi.long <= long_bounries[1])]

In [364]:
total_poi.to_csv('_total_poi.csv', index = False)