Places
===
This is a script to extract, gather, and analyze data from visited places.

In [None]:
%load_ext autoreload
%autoreload 2

import logging
# Configure the root logger: each record is rendered as
# "<timestamp>|<level name, left-aligned, padded to 5 chars>| <message>",
# and records below INFO are dropped.
logging.basicConfig(format='%(asctime)s|%(levelname)-5s| %(message)s', level=logging.INFO)

import pandas as pd
import datetime
import glob
from geopy import distance
from geopy.geocoders import Nominatim

import places_api

Read in the KML file
--

In [None]:
# Load the raw KML export through the project's places_api.read_file helper
# and bind the result to `df` (presumably a pandas DataFrame with one row per
# placemark -- confirm against places_api).
df = places_api.read_file('data/geodata.kml')

Read in the pickle file
--
(already processed DataFrame)

In [None]:
# Restore the previously saved DataFrame. `df_save` and `df` are bound to the
# same object (no copy is made), so in-place changes to one show up in the other.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
df_save = df = pd.read_pickle('data/places.pkl')

Process the DataFrame
--

In [None]:
# Run the project's processing step on the loaded data and rebind `df` to its
# result (presumably this derives the extra columns used further down, such as
# 'location_Nominatim' -- confirm against places_api.process).
df = places_api.process(df)
# Bare expression: makes the notebook render the resulting DataFrame.
df

Fix the data-reading part
--

In [None]:
from lxml import html

file_name = 'data/geodata.kml'

# Read the file as UTF-8 text. The built-in open() is used instead of
# codecs.open(), which is deprecated in recent Python versions; newline=''
# turns off newline translation so the text is returned exactly as stored.
logging.info('Reading file %s', file_name)
with open(file_name, encoding='utf-8', newline='') as f:
    kml = f.read()

# Re-encode the text to UTF-8 bytes before parsing. No replacements are applied
# in this cell yet; this is the place to add them (on the str, before encoding).
# NOTE(review): bytes are presumably passed because lxml rejects str input that
# carries an XML encoding declaration -- confirm.
logging.info('Cleaning KML')
kml = kml.encode('utf-8')

# Parse the document with lxml's HTML parser.
logging.info('Converting to HTML')
doc = html.fromstring(kml)

# Create the (empty) result DataFrame and the row counter.
df = pd.DataFrame(columns=['name', 'timestamp', 'color', 'coords_long', 'coords_lat',
    'category', 'icon'])
i = 0  # row counter; not used yet in this cell

# Walk over every placemark and pull out the relevant fields.
logging.info('Parsing out placemarks')
for placemark in doc.cssselect('Document Placemark'):
    # A placemark may lack a <name> element; use None instead of failing
    # with an IndexError on the empty match list.
    name_nodes = placemark.cssselect('name')
    name = name_nodes[0].text if name_nodes else None

Fix the reverse geocoding part
--

In [None]:
# Show the rows whose category contains 'place-city'.
# na=False makes rows with a missing category count as "no match"; without it
# the mask contains NaN for those rows and the boolean indexing raises.
df[df['category'].str.contains('place-city', na=False)]

In [None]:
def _first_address_value(location, keys):
    """Return the first address component of *location* found among *keys*.

    *location* is expected to expose a ``raw`` dict holding an ``'address'``
    dict (as the values in ``df['location_Nominatim']`` are used here). The keys
    are tried in order and the first value that is not None wins.

    Returns None when *location* is missing (None/NaN), has no ``raw`` data,
    has no ``'address'`` entry, or none of *keys* is present -- instead of
    raising AttributeError on such rows.
    """
    raw = getattr(location, 'raw', None) or {}
    address = raw.get('address') or {}
    for key in keys:
        value = address.get(key)
        if value is not None:
            return value
    return None


# Address keys tried in priority order: the first one present in a row's
# address supplies the value of the corresponding column.
city_keys = ['city', 'town', 'village', 'hamlet', 'suburb']
state_keys = ['county', 'city_district', 'state_district', 'state', 'attraction']

# One pass per column; rows without any matching key end up as None.
df['city_Nominatim'] = df['location_Nominatim'].apply(
    _first_address_value, args=(city_keys,))
df['state_Nominatim'] = df['location_Nominatim'].apply(
    _first_address_value, args=(state_keys,))

In [None]:
# Count places per city, smallest groups first. The original line was a syntax
# error: a stray 'city_Nominatim' string literal followed the .ffill() call.
# The grouping key is the city column with gaps forward-filled, i.e. a row
# without a resolved city is counted under the city of the preceding row.
# NOTE(review): this is only meaningful if the row order makes consecutive
# places likely to share a city; otherwise use df.groupby('city_Nominatim'),
# which leaves rows without a city out of the counts.
df.groupby(df['city_Nominatim'].ffill())['name'].count().sort_values()

In [None]:
# Number of rows (places) currently in the DataFrame.
df.shape[0]