In [48]:
# Import any dependencies 
import os, os.path, json 

In [50]:
# Relative path to the data file; later cells read it line by line as JSON
path = 'example.txt'

In [52]:
# Preview the data: read only the first line. The `with` block closes the
# file handle right away instead of leaving it open until garbage collection.
with open(path) as preview_file:
    first_line = preview_file.readline()
first_line

'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

In [54]:
# Parse the file one line at a time: each line is passed to json.loads, which
# decodes it into a Python object (a dictionary for the records previewed
# here), giving a list with one entry per line. The `with` block closes the
# file handle as soon as parsing finishes.
with open(path) as data_file:
    records = [json.loads(line) for line in data_file]

In [56]:
# Preview the first parsed record (a Python dictionary)
records[0]

{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'c': 'US',
 'nk': 1,
 'tz': 'America/New_York',
 'gr': 'MA',
 'g': 'A6qOVH',
 'h': 'wfLQtf',
 'l': 'orofrog',
 'al': 'en-US,en;q=0.8',
 'hh': '1.usa.gov',
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
 't': 1331923247,
 'hc': 1331822918,
 'cy': 'Danvers',
 'll': [42.576698, -70.954903]}

In [58]:
# Continue the endeavour of counting time zones in pure Python. As a first
# step towards finding the most common time zones in the dataset, collect the
# 'tz' value of every record that has one.
time_zones = [
    entry['tz']
    for entry in records
    if 'tz' in entry
]

In [60]:
# Preview the first ten time zone values
time_zones[:10]

['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']

In [62]:
# Continue by counting how often each time zone occurs. Note that empty
# strings are not filtered out here, so they are counted like any other value.
# This is by far the hard way; the more idiomatic approach would be pandas.
# Here we use only Python's standard library: we create a dictionary to store
# the counts as we iterate through the sequence.
def get_counts(sequence):
    """Count how many times each distinct item occurs in ``sequence``.

    Returns a plain dict mapping each item to its number of occurrences.
    Items must be hashable, since they are used as dictionary keys.
    """
    tally = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen
        tally[item] = tally.get(item, 0) + 1
    return tally

In [64]:
# Tally how often each time zone value occurs, using the hand-written helper
counts = get_counts(time_zones)

In [66]:
# Look up the count for a single time zone
counts['America/New_York']

1251

In [68]:
# Number of collected 'tz' values (empty strings included)
len(time_zones)

3440

In [70]:
# Find the most popular timezones and their usage frequency. We create a
# list of tuples where each tuple holds (count, tz), then sort the list of
# tuples by the first element (count). The function returns the last n items
# of the sorted list (the highest counts).
def top_counts(count_dict, n=10):
    """Return the ``n`` entries of ``count_dict`` with the highest counts.

    Parameters
    ----------
    count_dict : dict
        Mapping of key -> count (for example the result of ``get_counts``).
    n : int, default 10
        Maximum number of entries to return.

    Returns
    -------
    list of (count, key) tuples
        Sorted in ascending order, so the highest count comes last. Entries
        with equal counts are ordered by key. The list is empty when
        ``n <= 0`` or ``count_dict`` is empty.
    """
    if n <= 0:
        # Without this guard, n == 0 would slice [-0:], i.e. the whole list,
        # and a negative n would drop the smallest items instead of selecting
        # the largest ones.
        return []
    # Put the count first so tuple comparison sorts by count, then by key.
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]

In [72]:
# Preview the ten highest counts (default n=10), in ascending order
top_counts(counts)

[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]

In [74]:
# Conversely an easier way to perform the same action is to use the 
# collections.Counter class
from collections import Counter 

In [76]:
# NOTE: this rebinds `counts` from the plain dict built by get_counts to a
# collections.Counter
counts = Counter(time_zones)

In [78]:
# Ten most common time zones as (tz, count) pairs, most frequent first
counts.most_common(10)

[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]