## 1. usa.gov data from bit.ly

In [54]:
import json

path = '/home/flexai/git/pydata-book/datasets/bitly_usagov/example.txt'
# Each line of the file is parsed as one JSON document. The context manager
# closes the file handle as soon as the list has been built.
with open(path) as lines:
    records = [json.loads(line) for line in lines]

FileNotFoundError: [Errno 2] No such file or directory: '/home/flexai/git/pydata-book/datasets/bitly_usagov/example.txt'

In [55]:
records[0]

NameError: name 'records' is not defined

In [None]:
records[0]['tz']

In [None]:
print(records[0]['tz'])

### Counting Time Zones with pure python

In [None]:
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

In [None]:
time_zones[:10]

In [None]:
def get_counts(sequence):
    """Tally how many times each item occurs in ``sequence``.

    Returns a plain dict mapping item -> number of occurrences, with keys in
    order of first appearance.
    """
    tally = {}
    for item in sequence:
        # .get supplies 0 the first time an item is seen
        tally[item] = tally.get(item, 0) + 1
    return tally

In [None]:
from collections import defaultdict

def get_counts2(sequence):
    """Count occurrences of each item in ``sequence`` using a defaultdict.

    Returns a ``defaultdict(int)`` mapping item -> number of occurrences.
    """
    # Missing keys start at int() == 0, so no membership test is required
    tally = defaultdict(int)
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally

In [None]:
counts = get_counts(time_zones)
counts['America/New_York']

In [None]:
len(time_zones)

In [None]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest ``(count, key)`` pairs from ``count_dict``.

    Parameters
    ----------
    count_dict : mapping of key -> count
    n : int, default 10
        How many of the highest-count pairs to return.

    Returns
    -------
    list of (count, key) tuples sorted in ascending order, so the most
    frequent key comes last. Empty when ``n`` is not positive.
    """
    if n <= 0:
        # -0 == 0, so the slice below would otherwise hand back every pair
        return []
    # Tuples sort by count first; ties fall back to comparing the keys
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

In [None]:
top_counts(counts)

In [None]:
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)

### Counting Time Zones with pandas


In [None]:
from pandas import DataFrame, Series
import pandas as pd

frame = DataFrame(records)
frame

In [None]:
frame['tz'][:10]

In [None]:
tz_counts = frame['tz'].value_counts()

In [None]:
tz_counts[:10]

In [None]:
# Fill missing (NA) values with 'Missing'
clean_tz = frame['tz'].fillna('Missing')

# Replace unknown (empty string) values by boolean array indexing
clean_tz[clean_tz == ''] = 'Unknown'

In [None]:
tz_counts = clean_tz.value_counts()

tz_counts[:10]

In [None]:
%matplotlib inline 
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

tz_counts[:10].plot(kind='barh', rot=0)

In [None]:
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]

In [None]:
results.value_counts()[:8]

In [None]:
cframe = frame[frame.a.notnull()]

In [None]:
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
operating_system[:5]

In [None]:
by_tz_os = cframe.groupby(['tz', operating_system])

In [None]:
agg_counts = by_tz_os.size().unstack().fillna(0)

agg_counts[:10]

In [None]:
# Use to sort in ascending order
indexer = agg_counts.sum(1).argsort()

indexer[:10]

In [None]:
count_subset = agg_counts.take(indexer)[-10:]

count_subset

In [None]:
count_subset.plot(kind='barh', stacked=True)

In [None]:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)

normed_subset.plot(kind='barh', stacked=True)

## 2. MovieLens 1M Data Set

In [None]:
import pandas as pd

# The MovieLens .dat files have no header row and use '::' as the delimiter.
# A separator longer than one character is only supported by the Python
# parsing engine, so request it explicitly instead of relying on the
# fallback (which emits a ParserWarning).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('/home/flexai/git/pydata-book/datasets/movielens/users.dat', sep='::',
                      header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('/home/flexai/git/pydata-book/datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('/home/flexai/git/pydata-book/datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

In [None]:
users[:5]

In [None]:
ratings[:5]

In [None]:
movies[:5]

In [None]:
ratings.info()

In [None]:
data = pd.merge(pd.merge(ratings, users), movies)
data.info()

In [None]:
# .ix was removed from pandas; the merged frame has a default integer index,
# so label 0 is the first row
data.loc[0]

In [None]:
mean_ratings = data.pivot_table(values='rating', index="title", columns='gender', aggfunc='mean')
mean_ratings[:5]

In [None]:
ratings_by_title = data.groupby('title').size()

ratings_by_title[:10]

In [None]:
active_titles = ratings_by_title.index[ratings_by_title >= 250]

active_titles

In [None]:
# Keep only the titles that passed the rating-count threshold (label-based
# selection; .ix was removed from pandas)
mean_ratings = mean_ratings.loc[active_titles]

mean_ratings

In [None]:
# Sort rows by the 'F' column, highest mean rating first
# (sort_index(by=...) was removed in favor of sort_values)
top_females_ratings = mean_ratings.sort_values(by='F', ascending=False)

top_females_ratings[:10]

### Measuring rating disagreement

In [None]:
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

# Ascending sort on 'diff' (M minus F): the most negative differences come first
sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff[:15]

In [None]:
sorted_by_diff[::-1][:15]

In [None]:
# Standard deviation of rating grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()

In [None]:
# Filter down to active_titles (label-based selection; .ix was removed from pandas)
rating_std_by_title = rating_std_by_title.loc[active_titles]

In [None]:
# Order Series by value in descending order
rating_std_by_title.sort_values(ascending=False)[:10]

## 3. US Baby Names 1880-2010

In [None]:
import pandas as pd

yob1880 = '/home/flexai/git/pydata-book/datasets/babynames/yob1880.txt'
names1880 = pd.read_csv(yob1880, names=['name', 'sex', 'births'])

names1880.info()

In [None]:
names1880.groupby('sex').births.sum()

In [None]:
# 2010 is the last available year right now
years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    path = '/home/flexai/git/pydata-book/datasets/babynames/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    
    frame['year'] = year
    pieces.append(frame)
    
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)

names.info()

In [None]:
# Total births per year, one column per sex. The string 'sum' selects pandas'
# own aggregation rather than passing the Python builtin.
total_births = names.pivot_table(values='births', index='year', columns='sex', aggfunc='sum')
total_births.tail()

In [None]:
total_births.plot(title='Total births by sex and year')

In [None]:
def add_prop(group):
    """Return ``group`` with an added ``prop`` column.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group. The frame passed in is not modified; a new frame with
    the extra column is returned.
    """
    # Cast to float so the ratio is a true division whatever the births dtype
    births = group.births.astype(float)

    return group.assign(prop=births / births.sum())

# group_keys=False keeps the original row index on the result, so 'year' and
# 'sex' stay ordinary columns and do not also become index levels (which
# would make later groupby(['year', 'sex']) calls ambiguous)
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
names.info()

In [None]:
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

In [None]:
def get_top1000(group, n=1000):
    """Return the ``n`` rows of ``group`` with the largest ``births``.

    Parameters
    ----------
    group : DataFrame with a ``births`` column
    n : int, default 1000
        Maximum number of rows to keep.

    Returns
    -------
    DataFrame sorted by ``births`` in descending order, truncated to ``n`` rows.
    """
    # sort_index(by=...) was removed from pandas; sort_values is its replacement
    return group.sort_values(by='births', ascending=False)[:n]

grouped = names.groupby(['year', 'sex'])
# Drop the group-key index that apply() adds: 'year' and 'sex' are already
# columns, and keeping them as index levels too makes later
# pivot_table/groupby calls on those names ambiguous
top1000 = grouped.apply(get_top1000).reset_index(drop=True)

In [None]:
# Same top-1000 selection done by hand: iterating a two-key groupby yields
# ((year, sex), group) pairs
pieces = []
for key, group in names.groupby(['year', 'sex']):
    # sort_index(by=...) was removed from pandas; sort_values is its replacement
    pieces.append(group.sort_values(by='births', ascending=False)[:1000])
newtop1000 = pd.concat(pieces, ignore_index=True)

In [None]:
top1000.info()

In [None]:
newtop1000.info()

### Analyzing Naming Trends

In [None]:
boys = top1000[top1000.sex == 'M']

In [None]:
girls = top1000[top1000.sex == 'F']

In [None]:
# Births per year with one column per name; 'sum' selects pandas' own
# aggregation rather than passing the Python builtin
total_births = top1000.pivot_table(values='births', index='year', columns='name',
                                  aggfunc='sum')

In [None]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]

In [None]:
subset.plot(subplots=True, figsize=(12,10), grid=False, title='Number of births per year')

### Measuring the increase in naming diversity

In [None]:
# Sum of the 'prop' column per year and sex, restricted to the top-1000 rows
table = top1000.pivot_table(values='prop', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of top1000.prop by year and sex',
          yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

In [None]:
df = boys[boys.year == 2010]
df.info()

In [None]:
# Running total of 'prop' with rows ordered from largest to smallest
# (sort_index(by=...) was removed from pandas; use sort_values)
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]

In [None]:
prop_cumsum.searchsorted(0.5)

In [None]:
df = boys[boys.year == 1900]
# Running total of 'prop' for 1900, rows ordered from largest to smallest
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()

In [None]:
in1900.searchsorted(0.5) + 1

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    Parameters
    ----------
    group : DataFrame with a ``prop`` column
    q : float, default 0.5
        Cumulative ``prop`` threshold to reach.

    Returns
    -------
    int : 1-based position, in descending ``prop`` order, at which the
    running total first reaches ``q``.
    """
    # sort_index(by=...) was removed from pandas; sort_values is its replacement
    ranked = group.sort_values(by='prop', ascending=False)
    # searchsorted gives a 0-based insertion point, hence the + 1
    return int(ranked.prop.cumsum().searchsorted(q) + 1)

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')

In [None]:
diversity.head()

In [None]:
diversity.plot(title='Number of popular names in top 50%')

### The "Last letter" Revolution

In [None]:
# extract last letter from name column
def get_last_letter(name):
    """Return the final character of ``name``."""
    return name[-1]

last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births summed per last letter (rows) for every sex/year pair (columns)
table = names.pivot_table(values='births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')

In [None]:
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

In [None]:
subtable.sum()

In [None]:
letter_prop = subtable / subtable.sum().astype(float)

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize=(10, 10))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)

In [None]:
letter_prop = table / table.sum().astype(float)

In [None]:
# Rows 'd', 'n', 'y' of the 'M' columns, transposed so the letters become
# the columns (.ix was removed from pandas; .loc is the label-based replacement)
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()

In [None]:
dny_ts.plot()

### Boy names that became girl names (and vice versa)

In [None]:
all_names = top1000.name.unique()

In [None]:
mask = np.array(['lesl' in x.lower() for x in all_names])

In [None]:
lesley_like = all_names[mask]
lesley_like

In [None]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()

In [None]:
table = filtered.pivot_table(values='births', index='year', columns='sex', aggfunc='sum')
table = table.div(table.sum(1), axis=0)
table.tail()

In [None]:
table.plot(style={'M': 'k-', 'F': 'k--'})

## NumPy Basics: Arrays and Vectorized Computation
### The NumPy ndarray: A Multidimensional Array Object
### Creating ndarrays

In [56]:
import numpy as np

data1 = [6, 7.5, 9, 0, 1]

arr1 = np.array(data1)

arr1

array([6. , 7.5, 9. , 0. , 1. ])

In [57]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]

arr2 = np.array(data2)

arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [58]:
arr2.ndim

2

In [59]:
arr2.shape

(2, 4)

In [60]:
arr1.dtype

dtype('float64')

In [61]:
arr2.dtype

dtype('int64')

In [62]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [63]:
np.zeros((3, 6))

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [64]:
np.empty((2, 3, 2))

array([[[0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000]],

       [[0.00000000e+000, 0.00000000e+000],
        [0.00000000e+000, 2.14321575e-312],
        [3.11108884e+231, 1.49166815e-154]]])

In [65]:
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [66]:
np.eye(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [67]:
np.identity(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

### Data Types for ndarrays

In [68]:
arr1 = np.array([1, 2,3], dtype=np.float64)

In [69]:
arr2 = np.array([1, 2, 3], dtype=np.int32)

In [70]:
arr1.dtype

dtype('float64')

In [71]:
arr2.dtype

dtype('int32')

In [72]:
arr = np.array([1, 2, 3, 4, 5])

In [73]:
arr.dtype

dtype('int64')

In [74]:
float_arr = arr.astype(np.float64)

In [75]:
float_arr.dtype

dtype('float64')

In [76]:
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])

In [77]:
arr

array([ 3.7, -1.2, -2.6,  0.5, 12.9, 10.1])

In [78]:
arr.astype(np.int32)

array([ 3, -1, -2,  0, 12, 10], dtype=int32)

In [79]:
# np.string_ was removed in NumPy 2.0; np.bytes_ is the same byte-string dtype
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_)

In [80]:
numeric_strings.astype(float)

array([ 1.25, -9.6 , 42.  ])

In [81]:
int_array = np.arange(10)

In [82]:
calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)

In [83]:
int_array.astype(calibers.dtype)

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [84]:
empty_uint32 = np.empty(8, dtype='u4')

In [85]:
empty_uint32

array([         0, 1075314688,          0, 1075707904,          0,
       1075970048,          0, 1072693248], dtype=uint32)

### Operations between Arrays and Scalars
Arrays enable us to express batch operations on data without writing any for loops (i.e. vectorization). 

In [86]:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])

In [87]:
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [88]:
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [89]:
arr - arr

array([[0., 0., 0.],
       [0., 0., 0.]])

In [90]:
1 / arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [91]:
arr ** 0.5

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

### Basic Indexing and Slicing

In [92]:
arr = np.arange(10)

In [93]:
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [94]:
arr[5]

5

In [95]:
arr[5:8]

array([5, 6, 7])

In [96]:
# broadcast value to a slice
arr[5:8] = 12

In [97]:
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

A distinction from lists is that array slices are views on the original array. Hence data is not copied, and any modifications to the view will be reflected in the source array.

In [98]:
arr_slice = arr[5:8]

In [99]:
arr_slice[1] = 12345

In [100]:
arr

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,
           9])

In [101]:
arr_slice[:] = 64

In [102]:
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

If you want a copy of a slice of an ndarray instead of a view, you need to copy the array explicitly, for example with `arr[5:8].copy()`.

In [103]:
arr_copy = arr[5:8].copy()

In [104]:
arr_copy

array([64, 64, 64])

In [105]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

In [106]:
arr2d[2]

array([7, 8, 9])

In [107]:
arr2d[0][2]

3

In [108]:
arr2d[0, 2]

3

In [109]:
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])

In [110]:
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [111]:
arr3d[0]

array([[1, 2, 3],
       [4, 5, 6]])

In [113]:
old_values = arr3d[0].copy()

In [115]:
arr3d[0] = 42

In [116]:
arr3d

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [118]:
arr3d[0] = old_values

In [119]:
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [120]:
arr3d[1, 0]

array([7, 8, 9])

### Indexing with slices

In [121]:
arr[1:6]

array([ 1,  2,  3,  4, 64])

In [122]:
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [123]:
arr2d[:2]

array([[1, 2, 3],
       [4, 5, 6]])

In [124]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [125]:
arr2d[1, :2]

array([4, 5])

In [126]:
arr2d[2, :1]

array([7])

In [127]:
arr2d[:, :1]

array([[1],
       [4],
       [7]])

In [129]:
arr2d[:2, 1:] = 0

In [132]:
arr2d

array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

### Boolean Indexing

In [131]:
from numpy.random import randn

names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])

In [133]:
data = randn(7, 4)

In [134]:
names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')

In [135]:
data

array([[ 1.36816542, -2.04510293, -0.56973559,  0.48482535],
       [ 0.01670871,  1.02761913, -0.3751779 , -0.37707249],
       [ 1.16167358, -0.71551104, -0.58925861, -0.08893398],
       [-0.60103238, -1.01889984,  0.45217737, -0.98327556],
       [ 0.964278  ,  0.41941654,  0.65318846,  0.74094907],
       [ 1.90438504,  0.94566017, -1.41388898,  1.50888196],
       [ 3.01249206,  1.46134296,  1.33014175, -0.63798973]])