In 2011, URL shortening service Bitly partnered with the US government website
USA.gov to provide a feed of anonymous data gathered from users who shorten links
ending with .gov or .mil.

![](bitly_short_urls.jpg)

In [None]:
# Notebook-wide setup: imports plus plotting/display defaults.
# Run this cell first -- later cells reference `np` and `pd`.
from numpy.random import randn  # NOTE(review): `randn` is not referenced in the visible cells
import numpy as np
np.random.seed(123)  # fix NumPy's global random seed
import os  # NOTE(review): `os` is not referenced in the visible cells
import matplotlib.pyplot as plt
import pandas as pd
plt.rc('figure', figsize=(10, 6))  # default figure size for matplotlib figures
np.set_printoptions(precision=4)  # NumPy print precision
pd.options.display.max_rows = 20  # pandas display row limit

In [1]:
# Parse the file as newline-delimited JSON: each non-blank line becomes one dict
import json
path = 'dataset/example.txt'
# A context manager closes the file handle deterministically; the explicit
# encoding keeps parsing independent of the platform's default codec.
with open(path, encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads
    records = [json.loads(line) for line in f if line.strip()]

In [2]:
len(records)

3560

In [3]:
records[0] # Looking at an example

{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'c': 'US',
 'nk': 1,
 'tz': 'America/New_York',
 'gr': 'MA',
 'g': 'A6qOVH',
 'h': 'wfLQtf',
 'l': 'orofrog',
 'al': 'en-US,en;q=0.8',
 'hh': '1.usa.gov',
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
 't': 1331923247,
 'hc': 1331822918,
 'cy': 'Danvers',
 'll': [42.576698, -70.954903]}

### 1. What is the most frequently occurring time zone in the dataset?

In [4]:
# Collect the 'tz' (time zone) value of every record that has that key
time_zones = []
for rec in records:
    if 'tz' in rec:
        time_zones.append(rec['tz'])

In [5]:
time_zones[:10] 

['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']

In [6]:
# 1. Method: Using standard Python
def get_counts(sequence):
    """Tally how many times each item occurs in ``sequence``.

    Returns a plain ``dict`` mapping each distinct item to its count,
    with keys in order of first appearance.
    """
    tally = {}
    for item in sequence:
        # .get() supplies 0 the first time an item is seen
        tally[item] = tally.get(item, 0) + 1
    return tally

In [7]:
from collections import defaultdict

def get_counts2(sequence):
    """Tally item occurrences in ``sequence`` using a ``defaultdict(int)``.

    Returns the ``defaultdict`` itself, so looking up an unseen key yields 0.
    """
    tally = defaultdict(int)
    for item in sequence:
        # Missing keys start at int() == 0
        tally[item] = tally[item] + 1
    return tally

In [8]:
counts = get_counts(time_zones)

In [9]:
len(time_zones)

3440

In [10]:
counts['America/New_York']

1251

In [11]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest ``(count, key)`` pairs of ``count_dict``.

    Parameters
    ----------
    count_dict : mapping of key -> count.
    n : maximum number of pairs to return (default 10).

    Returns
    -------
    A list of ``(count, key)`` tuples sorted in ascending order, so the
    most frequent key is last. Ties on count are ordered by key. An empty
    list is returned when ``n`` is zero or negative.
    """
    # Guard: pairs[-0:] is pairs[0:], i.e. the whole list, not "no items".
    if n <= 0:
        return []
    pairs = sorted((count, key) for key, count in count_dict.items())
    return pairs[-n:]

In [12]:
top_counts(counts)

[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]

In [13]:
from collections import Counter

In [14]:
counts = Counter(time_zones)

In [15]:
counts.most_common(10)

[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]

In [16]:
# 2. Method: Using Pandas
# Build a DataFrame from the list of record dicts, then summarise its columns
df = pd.DataFrame(data=records)
df.info()

NameError: name 'pd' is not defined

In [None]:
# First ten time zone values
df['tz'].head(10)

In [None]:
# Frequency of each distinct value in the 'tz' column
tz_counts = df['tz'].value_counts()

In [None]:
# Ten most frequent time zones
tz_counts.head(10)

In [None]:
# Label absent (NaN) time zones as 'Missing'
clean_tz = df['tz'].fillna('Missing')

In [None]:
# Empty-string time zones get their own 'Unknown' label
clean_tz.loc[clean_tz == ''] = 'Unknown'

In [None]:
# Recount using the cleaned labels ('Missing' / 'Unknown' now appear as categories)
tz_counts = clean_tz.value_counts()

In [None]:
# Ten most frequent cleaned time zone labels
tz_counts.head(10)

In [None]:
import seaborn as sns
# Keep only the ten most frequent time zones for plotting
subset = tz_counts.head(10)

In [None]:
# Horizontal bars: counts on the x axis, time zone labels on the y axis
sns.barplot(x=subset.values, y=subset.index)

The most frequently occurring time zone is America/New_York. The second most frequent category is 'Unknown' (records whose time zone is an empty string). If the 'Unknown' and 'Missing' categories are set aside, US time zones take the top four places (New York, Chicago, Los Angeles, Denver), followed by Europe/London.

### 2. Which application was used most in this dataset?

In [None]:
# The 'a' column holds the browser, device or application used to perform the URL shortening
df.loc[0, 'a']

In [None]:
# First whitespace-separated token of each non-null agent string
agents = df['a'].dropna()
first_tokens = [agent.split()[0] for agent in agents]
results = pd.Series(first_tokens)

In [None]:
# First ten extracted agent tokens
results.head(10)

In [None]:
# Ten most frequent agent tokens
results.value_counts().head(10)

Mozilla is the most used application for shortening URLs in the dataset. Three different versions of Mozilla appear, and two of them occupy the top two places in the list, followed by Google Maps. In fourth place is another popular browser, Opera.

### 3. How many people use Windows in the top time zones?

In [None]:
# Keep only rows that have an agent string. Take an explicit copy so that
# adding columns to clean_df later neither triggers SettingWithCopyWarning
# nor depends on whether the filtered frame is a view of df.
clean_df = df[df.a.notnull()].copy()

In [None]:
# Flag each row by whether its agent string mentions 'Windows'
is_windows = clean_df['a'].str.contains('Windows')
clean_df['os'] = np.where(is_windows, 'Windows', 'Not Windows')

In [None]:
# First five OS labels
clean_df['os'].head(5)

In [None]:
# Group rows by (time zone, OS label) pairs
by_tz_os = clean_df.groupby(['tz', 'os'])

In [None]:
# Rows per (tz, os) group, pivoted so each OS label is a column;
# combinations that never occur are filled with 0
group_sizes = by_tz_os.size()
agg_counts = group_sizes.unstack().fillna(0)

In [None]:
# First ten rows of the tz-by-OS count table
agg_counts.head(10)

In [None]:
# Positions that would sort the time zones by their total count, ascending
indexer = agg_counts.sum(axis=1).argsort()

In [None]:
# First ten sort positions
indexer.head(10)

In [None]:
# The last ten sort positions are the ten time zones with the largest totals
count_subset = agg_counts.take(indexer.tail(10))

In [None]:
count_subset

In [None]:
# Pandas has nlargest() for this task: ten largest per-time-zone totals
agg_counts.sum(axis=1).nlargest(10)

In [None]:
# Rearrange the data for plotting: stack() turns the per-OS columns into an
# inner index level.
# NOTE(review): this rebinds count_subset, so re-running this cell without
# first re-running the cell that builds count_subset will not give the same result.
count_subset = count_subset.stack()

In [None]:
# Name the stacked values so the column is called 'total' after reset_index()
count_subset = count_subset.rename('total')

In [None]:
# Move the index levels into ordinary columns
count_subset = count_subset.reset_index()

In [None]:
# First ten rows of the long-format table
count_subset.head(10)

In [None]:
# One bar per (time zone, OS) pair, coloured by OS
sns.barplot(data=count_subset, x='total', y='tz', hue='os')

            Top time zones by Windows and non-Windows users

The plot doesn’t make it easy to see the relative percentage of Windows users in the
smaller groups, so let’s normalize the group percentages to sum to 1:

In [None]:
def norm_total(group):
    """Add each row's share of the group's summed ``total``.

    Parameters
    ----------
    group : DataFrame with a numeric ``total`` column (one 'tz' group when
        used with ``groupby('tz').apply``).

    Returns
    -------
    A new DataFrame holding the rows of ``group`` plus a ``normed_total``
    column; its values sum to 1 when the group's total is non-zero.
    """
    # Return a new frame rather than assigning into `group`: pandas does not
    # support functions that mutate the object handed to groupby.apply.
    return group.assign(normed_total=group.total / group.total.sum())

In [None]:
# Each row's share of its time zone's total -- the same values as applying
# norm_total to every 'tz' group. transform('sum') broadcasts each group's sum
# back onto the original rows, so the result keeps count_subset's columns and
# index regardless of how the installed pandas version's groupby.apply treats
# group keys and grouping columns.
tz_totals = count_subset.groupby('tz')['total'].transform('sum')
results = count_subset.assign(normed_total=count_subset['total'] / tz_totals)

In [None]:
# Same chart as before, using the within-time-zone share instead of raw counts
sns.barplot(data=results, x='normed_total', y='tz', hue='os')