In [68]:
import pandas as pd

# Pandas Series Object

In [69]:
# Pandas Series wraps a sequence of values and indices into a 1d array of indexed data
# creating series from a list:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [70]:
# values are a numpy array
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [71]:
# index is an array-like object of type pd.Index
data.index

RangeIndex(start=0, stop=4, step=1)

In [72]:
print(data[1])
print(data[1:3])

0.5
1    0.50
2    0.75
dtype: float64


In [73]:
# index doesn't need to be int, we can use strings
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [74]:
data['b']

0.5

In [75]:
# using nonsequential indices
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=[1, 3, 5, 7])
data[5]

0.75

In [76]:
# series is like a dictionary, but with typed indices and values for efficiency
population_dict = {'CA': 38332521,
                   'TX': 26448193,
                   'NY': 19651127,
                   'FL': 19552860,
                   'IL': 12882135}
population = pd.Series(population_dict)
population

CA    38332521
FL    19552860
IL    12882135
NY    19651127
TX    26448193
dtype: int64

In [77]:
population['CA']

38332521

In [78]:
# unlike dict, Series supports array-style operations like slicing
population['CA':'IL']

CA    38332521
FL    19552860
IL    12882135
dtype: int64

In [79]:
# constructing Series objects

In [80]:
# from a list or NP array, where it defaults to int
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [81]:
## from a scalar, which repeats to fill index
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [82]:
#from a dict, where index defaults to sorted dict keys
pd.Series({2:'a', 1:'b', 3:'c'})

1    b
2    a
3    c
dtype: object

In [83]:
# index can be explicitly set
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

# Pandas DataFrame Object

In [84]:
# DataFrame as generalized NumPy array

# dataframe like a 2-d array with flexible row indices and column names
# df is like a sequence of aligned Series objects in that they share the same index

area_dict = {'CA': 423967,
             'TX': 170312,
             'NY': 149995,
             'FL': 141297,
             'IL': 695662}
area = pd.Series(area_dict)

states = pd.DataFrame({'population': population,
                        'area': area})
states

Unnamed: 0,area,population
CA,423967,38332521
FL,141297,19552860
IL,695662,12882135
NY,149995,19651127
TX,170312,26448193


In [85]:
states.index

Index(['CA', 'FL', 'IL', 'NY', 'TX'], dtype='object')

In [86]:
states.columns

Index(['area', 'population'], dtype='object')

In [87]:
# DataFrame as a specialized dictionary

# dict maps a key to a value, df maps a column name to a Series of column data

states['area']

CA    423967
FL    141297
IL    695662
NY    149995
TX    170312
Name: area, dtype: int64

In [88]:
# note:
# in a NumPy array, data[0] returns the first row
# in a Pandas DF, data['col0'] would return the first column
# so better to think of a DF as a generalized dict than an array

In [89]:
# constructing DataFrame objects

In [90]:
# from a single Series object
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
CA,38332521
FL,19552860
IL,12882135
NY,19651127
TX,26448193


In [91]:
# from a list of dicts
data = [{'a': i, 'b': 2 * i}
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [92]:
# missing keys will be filled with NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 1, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,1,4.0


In [93]:
# from a dictionary of Series objects
pd.DataFrame({'population': population,
              'area': area})

Unnamed: 0,area,population
CA,423967,38332521
FL,141297,19552860
IL,695662,12882135
NY,149995,19651127
TX,170312,26448193


In [94]:
# from a 2 dimensional NumPy array
import numpy as np
pd.DataFrame(np.random.rand(3,2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.492952,0.976662
b,0.913008,0.955432
c,0.042872,0.238983


In [95]:
# from a numpy structured array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(A)
pd.DataFrame(A)

[(0,  0.) (0,  0.) (0,  0.)]


Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# The Pandas Index Object

In [96]:
# index object can be thought of as either
# an immutable array or as an ordered set

ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [97]:
# index as immutable array
# we can use indexing notation to retrieve values/slices

print(ind[1])
print(ind[::2])

3
Int64Index([2, 5, 11], dtype='int64')


In [98]:
# index attributes
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [99]:
# index is immutable, can't be modified normally
# this will return error:
# ind[1] = 0

In [100]:
# index as ordered set

# Index object follows many conventions used by "set" data structure
# including unions, intersections, differences, and other computations

In [101]:
# intersection
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indA & indB

Int64Index([3, 5, 7], dtype='int64')

In [102]:
# union
indA | indB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [103]:
# symmetric difference
indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

# Data Indexing and Selection

In [104]:
# Data Selection in Series

In [105]:
# Series as dictionary

In [106]:
# like dict, series provides mapping from keys to values
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data)
data['b']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


0.5

In [107]:
# we can use dictionary-like expressions
print('a' in data)
print(data.keys())
print(list(data.items()))

True
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


In [108]:
# series can be modified with dict-like syntax
# extending series by assigning to new index value
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [109]:
# Series as a 1-dimensional array

In [110]:
# Series provides array-style item selection

In [111]:
# slicing by explicit index
data['a':'c']

# note that "c" is included when slicing by explicit index

a    0.25
b    0.50
c    0.75
dtype: float64

In [112]:
# slicing by implicit integer index
data[0:2]

# note that "c" is excluded when slicing by implicit index

a    0.25
b    0.50
dtype: float64

In [113]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [114]:
# fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [115]:
# Indexers: loc, iloc, and ix

# these can be confusing
# if Series has explicit integer index, data[1] will use explicit indices
# while a slicing operation like data[1:3] will use implicit Python-style index

# special attributes are provided for consistent indexing

In [116]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [117]:
# explicit index # is used when indexing
data[1]

'a'

In [118]:
# but implicit index (row #) is used when slicing
data[1:3]

3    b
5    c
dtype: object

In [119]:
# loc attribute
# allows indexing and slicing that always references the EXPLICIT index

print( data.loc[1] )
print( data.loc[1:3] )

a
1    a
3    b
dtype: object


In [120]:
# iloc attribute
# allows indexing and slicing that always references the IMPLICIT Python-style index

print( data.iloc[1] )
print( data.iloc[1:3] )

b
3    b
5    c
dtype: object


In [121]:
# ix is a hybrid of the two, will explain below

In [122]:
# in Python, "explicit is better than implicit"
# book recommends loc and iloc for clearer code and to prevent bugs

# Data Selection in DataFrame

In [123]:
# DF acts in ways like a 2d or structured array
# and in other ways like a dictionary of Series structures sharing the same index

In [124]:
# DataFrame as a Dictionary

In [125]:
area = pd.Series({'California': 423967, 'Texas': 695662,'New York': 141297, 'Florida': 170312, 'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [126]:
# access a Series via dictionary-style indexing of column name:
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [127]:
# or use attribute-style access with column names (if the names are strings)
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [128]:
# both methods are the same if column name is a string
print(data.area is data['area'])
# this doesn't always work, for example, "pop" is a method of dataframe
print(data.pop is data['pop'])
# use:
# data['pop'] = z
# not:
# data.pop = z

True
False


In [129]:
# adding column using dictionary-like syntax
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874
