In [1]:
import pandas as pd
import numpy as np

# Introducing Pandas Objects

Pandas Series Object

In [2]:
# Build a Series from a plain list; pandas attaches a default integer index.
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [3]:
# The `values` attribute exposes the Series data as a NumPy array.
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
# The `index` attribute holds the labels; here it is the default RangeIndex.
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Square-bracket access looks the element up by its index label (2 here).
data[2]

0.75

Series as generalized NumPy array

In [6]:
# The index does not have to be integers: attach explicit string labels.
data = pd.Series(
    [0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)
print(data)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [8]:
# Item access uses the explicit string label rather than a position.
data['c']

0.75

In [10]:
# Index labels may also be non-contiguous, non-sequential integers.
data = pd.Series(
    [0.25, 0.5, 0.75, 1.0],
    index=[2, 5, 3, 7],
)
print(data)

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64


Series as specialized dictionary

In [11]:
# State name -> population; the dict keys become the Series index labels.
pop_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(pop_dict)
print(population)

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64


In [12]:
# Dictionary-style lookup: fetch a single value by its key (index label).
population['California']

38332521

In [13]:
# Unlike a dict, a Series supports slicing by label; the end label
# ('Illinois') is included in the result.
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [14]:
# NOTE(review): this cell repeats the previous cell verbatim -- probably a
# leftover duplicate that can be removed.
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

Constructing Series objects

In [15]:
# Data given as a list: the index defaults to an integer sequence.
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [16]:
# Data given as a scalar: the value is repeated for every index label.
pd.Series(5, index=[120, 400, 650])

120    5
400    5
650    5
dtype: int64

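Data can be a dictionary, in which case the index defaults to the dictionary keys. (Older pandas versions sorted the keys; as the output below shows, current versions keep the dictionary's insertion order.)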

In [17]:
# Data given as a dict: the keys become the index labels.
pd.Series({2: 'a', 1: 'b', 3: 'c'})

2    a
1    b
3    c
dtype: object

In each case, the index can be explicitly set if a different result is preferred. When the data is a dictionary, only the keys listed in the index are kept, in the order given.

In [18]:
# An explicit index selects (and orders) the dictionary keys to keep.
pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])

3    c
2    a
dtype: object

The Pandas DataFrame Object

In [19]:
# State name -> area, keyed by the same state names as the population data.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
print(area)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


In [21]:
# Combine the two Series into one DataFrame: each dict key becomes a column
# and the rows share the state-name index.
states = pd.DataFrame({'population': population, 'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [22]:
# Row labels of the DataFrame (the state names).
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [23]:
# Column labels of the DataFrame, themselves stored as an Index object.
states.columns

Index(['population', 'area'], dtype='object')

DataFrame as specialized dictionary

In [24]:
# Dictionary-style access on a DataFrame returns the named column as a Series.
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

Constructing DataFrame objects

In [25]:
# A DataFrame built from a single Series: one column, labelled 'population'.
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [26]:
# A list of dicts becomes a DataFrame: one row per dict, one column per key.
data = [
    {'a': i, 'b': 2 * i}
    for i in range(3)
]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [27]:
# Keys that are missing from some of the dictionaries are filled in by
# pandas with NaN (i.e., "not a number") values.
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [28]:
# A DataFrame built from a dict of Series: one column per dict key.
pd.DataFrame({'population': population, 'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [32]:
# A DataFrame built from a 2-D NumPy array with explicit column and row labels.
# NOTE(review): no random seed is set, so the values differ on every run.
pd.DataFrame(
    np.random.rand(3, 3),
    columns=['foo', 'bar', 'xyz'],
    index=['a', 'b', 'c'],
)

Unnamed: 0,foo,bar,xyz
a,0.603379,0.749691,0.062754
b,0.08639,0.331091,0.671482
c,0.882103,0.047341,0.173351


In [36]:
# Structured NumPy array: three zeroed records with an 8-byte integer
# field 'A' and an 8-byte float field 'B'.
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [37]:
# A structured array converts directly: each named field becomes a column.
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


The Pandas Index Object

In [38]:
# Build an Index directly from a list of integers.
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

Index as immutable array

In [43]:
# An Index supports array-style positional indexing.
ind[2]

5

In [46]:
# Slicing an Index returns another Index (every second element here).
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [47]:
# An Index also exposes the familiar NumPy-array attributes.
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [49]:
# One difference between Index objects and NumPy arrays is that an Index is
# immutable: item assignment raises TypeError. The error is caught and its
# message printed so that this demonstration does not abort a "Run All".
try:
    ind[1] = 0
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

Index as ordered set

In [50]:
# Two overlapping indices used to demonstrate the set operations below.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 9, 11])

In [51]:
# Intersection: the labels present in both indices.
# Older pandas also spelled this `indA & indB`; newer releases no longer treat
# `&` on an Index as a set operation, so the named method is used instead.
indA.intersection(indB)

Int64Index([3, 5, 7, 9], dtype='int64')

In [52]:
# Union: the labels present in either index.
# Older pandas also spelled this `indA | indB`; newer releases no longer treat
# `|` on an Index as a set operation, so the named method is used instead.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [53]:
# Symmetric difference: the labels present in exactly one of the two indices.
# Older pandas also spelled this `indA ^ indB`; newer releases no longer treat
# `^` on an Index as a set operation, so the named method is used instead.
indA.symmetric_difference(indB)

Int64Index([1, 2, 11], dtype='int64')

In [54]:
# Set operations are also available as methods; this yields the same
# result as the intersection computed above.
indA.intersection(indB)

Int64Index([3, 5, 7, 9], dtype='int64')