In [1]:
# introducing pandas objects
# pandas objects can be thought of as enhanced versions of numpy structured arrays
# three fundamental data structures: series, dataframe, index
import numpy as np
import pandas as pd

In [12]:
# A pandas Series is a one-dimensional array of indexed data.
# The simplest way to build one is from a list or an array.
data = pd.Series(data=[1, 2, 3])

# A Series pairs a sequence of values with an explicit sequence of indices.
data.values
data.index

# Single items and slices are retrieved with ordinary square-bracket indexing.
data[1]
data[1:3]

# A NumPy array reaches its values through an implicitly defined integer index,
# whereas a Series carries an explicitly defined index alongside its values.
# That explicit index gives the Series more capabilities, such as string labels...
data = pd.Series(data=[1, 2, 3, 4], index=list("abcd"))
data["a"]

# ...or integer labels that are neither contiguous nor starting at zero.
data = pd.Series(data=[1, 2, 3, 4], index=[2, 3, 5, 7])
data[5]

# A Series can also be seen as a specialization of a Python dictionary:
# the dict keys become the index and the dict values become the data.
population_dict = {
    "California": 39538223,
    "Texas": 29145505,
    "Florida": 21538187,
    "New York": 20201249,
    "Pennsylvania": 13002700,
}
population = pd.Series(data=population_dict)
population["California"]

# Constructing Series objects
# Every variant follows the same pattern: pd.Series(data, index=index)
pd.Series(data=[2, 4, 6])  # no index given -> defaults to an integer sequence
pd.Series(data=5, index=[100, 200, 300])  # a scalar is repeated to fill the index
pd.Series(data={2: "a", 1: "b"})  # for a dict, the index defaults to the dict keys



39538223

In [21]:
# A DataFrame is the analog of a two-dimensional array with explicit row and column indices.
# It can be thought of as a sequence of aligned Series objects that share one index.
area_dict = {'California': 423967, 'Texas': 695662, 'Florida': 170312,
             'New York': 141297, 'Pennsylvania': 119280}
area = pd.Series(area_dict)
# Builds a two-column DataFrame from the two Series.
states = pd.DataFrame({"population": population, "area": area})

# The index attribute gives access to the row (index) labels.
states.index

# The columns attribute holds the column labels.
states.columns

# A DataFrame can also be seen as a specialization of a dictionary:
# it maps a column name to a Series of column data.
states["area"]

# A DataFrame can be constructed in several ways.
# From a single Series object:
pd.DataFrame(population, columns=["population"])
# From a list of dicts (one dict per row):
data = [{"a": i, "b": 2 * i} for i in range(3)]
pd.DataFrame(data)
# From a dictionary of Series objects:
pd.DataFrame({"population": population, "area": area})
# From a 2D NumPy array. A seeded generator is used so that re-running the
# cell reproduces the same values instead of drawing fresh random numbers.
pd.DataFrame(
    np.random.default_rng(seed=0).random((3, 2)),
    columns=["foo", "bar"],
    index=["a", "b", "c"],
)
# From a NumPy structured array (field names become the column names):
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [26]:
# The pandas Index object
# It can be viewed either as an immutable array or as an ordered set.
ind = pd.Index(data=[2, 3, 5, 7, 11])

# In many respects it behaves like an array: item access and slicing both work.
ind[1]
ind[::2]

# Unlike an array, an Index is immutable.
# An assignment such as `ind[1] = 0` does not work.

# The Index is designed to facilitate operations across datasets,
# so it supports the usual set operations.
indB = pd.Index(data=[2, 3, 5, 7, 11])
indA = pd.Index(data=[1, 3, 5, 7, 9])
indA.intersection(indB)          # labels present in both
indA.union(indB)                 # labels present in either
indA.symmetric_difference(indB)  # labels present in exactly one

Index([1, 2, 9, 11], dtype='int64')