# 10 Minutes to pandas

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

<p>Creating a Series by passing a list of values, letting pandas create a default integer index:</p>

In [18]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

<p>Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:</p>

In [19]:
dates = pd.date_range('20161112', periods=6)
dates

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [20]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2016-11-12,-1.685025,-0.671649,-0.806832,-0.619825
2016-11-13,1.004315,-1.022681,1.236683,1.791178
2016-11-14,-0.324606,0.591878,1.152705,-0.595503
2016-11-15,-0.172166,-0.433167,1.922695,0.489763
2016-11-16,0.203322,0.025313,0.77607,0.624141
2016-11-17,0.316484,1.513948,-0.498564,-0.023732


<p>Creating a DataFrame by passing a dict of objects that can be converted to series-like:</p>

In [21]:
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp(20161112),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
1,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo
2,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
3,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo


<p>Having specific dtypes:</p>

In [22]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

<p>See the top and bottom rows of the frame:</p>

In [23]:
df.head()

Unnamed: 0,A,B,C,D
2016-11-12,-1.685025,-0.671649,-0.806832,-0.619825
2016-11-13,1.004315,-1.022681,1.236683,1.791178
2016-11-14,-0.324606,0.591878,1.152705,-0.595503
2016-11-15,-0.172166,-0.433167,1.922695,0.489763
2016-11-16,0.203322,0.025313,0.77607,0.624141


In [24]:
df.tail(3)

Unnamed: 0,A,B,C,D
2016-11-15,-0.172166,-0.433167,1.922695,0.489763
2016-11-16,0.203322,0.025313,0.77607,0.624141
2016-11-17,0.316484,1.513948,-0.498564,-0.023732


<p>Display the index, columns, and the underlying numpy data:</p>

In [25]:
df.index

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [26]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [27]:
df.values

array([[-1.68502533, -0.67164938, -0.80683187, -0.61982533],
       [ 1.0043151 , -1.02268066,  1.23668295,  1.79117751],
       [-0.32460627,  0.59187842,  1.15270517, -0.59550308],
       [-0.17216571, -0.43316653,  1.92269511,  0.48976288],
       [ 0.20332204,  0.02531329,  0.77607017,  0.62414113],
       [ 0.31648411,  1.51394798, -0.49856356, -0.02373181]])

<p>Describe shows a quick statistic summary of your data:</p>

In [28]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.109613,0.000607,0.63046,0.27767
std,0.900265,0.930335,1.065065,0.907139
min,-1.685025,-1.022681,-0.806832,-0.619825
25%,-0.286496,-0.612029,-0.179905,-0.45256
50%,0.015578,-0.203927,0.964388,0.233016
75%,0.288194,0.450237,1.215689,0.590547
max,1.004315,1.513948,1.922695,1.791178


<p>Transposing your data:</p>

In [29]:
df.T

Unnamed: 0,2016-11-12 00:00:00,2016-11-13 00:00:00,2016-11-14 00:00:00,2016-11-15 00:00:00,2016-11-16 00:00:00,2016-11-17 00:00:00
A,-1.685025,1.004315,-0.324606,-0.172166,0.203322,0.316484
B,-0.671649,-1.022681,0.591878,-0.433167,0.025313,1.513948
C,-0.806832,1.236683,1.152705,1.922695,0.77607,-0.498564
D,-0.619825,1.791178,-0.595503,0.489763,0.624141,-0.023732


<p>Sorting by an axis:</p>

In [30]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2016-11-12,-0.619825,-0.806832,-0.671649,-1.685025
2016-11-13,1.791178,1.236683,-1.022681,1.004315
2016-11-14,-0.595503,1.152705,0.591878,-0.324606
2016-11-15,0.489763,1.922695,-0.433167,-0.172166
2016-11-16,0.624141,0.77607,0.025313,0.203322
2016-11-17,-0.023732,-0.498564,1.513948,0.316484


<p>Sorting by values:</p>

In [31]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2016-11-13,1.004315,-1.022681,1.236683,1.791178
2016-11-12,-1.685025,-0.671649,-0.806832,-0.619825
2016-11-15,-0.172166,-0.433167,1.922695,0.489763
2016-11-16,0.203322,0.025313,0.77607,0.624141
2016-11-14,-0.324606,0.591878,1.152705,-0.595503
2016-11-17,0.316484,1.513948,-0.498564,-0.023732


## Selection

<p>See the indexing documentation Indexing and Selecting Data and MultiIndex/Advanced Indexing.</p>

### Getting

Selecting a single column, which yields a Series, equivalent to df.A:

In [32]:
df['A']

2016-11-12   -1.685025
2016-11-13    1.004315
2016-11-14   -0.324606
2016-11-15   -0.172166
2016-11-16    0.203322
2016-11-17    0.316484
Freq: D, Name: A, dtype: float64

<p>Selecting via [ ], which slices the rows.</p>

In [33]:
df[0:3]

Unnamed: 0,A,B,C,D
2016-11-12,-1.685025,-0.671649,-0.806832,-0.619825
2016-11-13,1.004315,-1.022681,1.236683,1.791178
2016-11-14,-0.324606,0.591878,1.152705,-0.595503


In [34]:
df['20161112':'20161114']

Unnamed: 0,A,B,C,D
2016-11-12,-1.685025,-0.671649,-0.806832,-0.619825
2016-11-13,1.004315,-1.022681,1.236683,1.791178
2016-11-14,-0.324606,0.591878,1.152705,-0.595503


### Selection by Label

For getting a cross section using a label:

In [35]:
df.loc[dates[0]]

A   -1.685025
B   -0.671649
C   -0.806832
D   -0.619825
Name: 2016-11-12 00:00:00, dtype: float64

Selecting on a multi-axis by label:

In [36]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2016-11-12,-1.685025,-0.671649
2016-11-13,1.004315,-1.022681
2016-11-14,-0.324606,0.591878
2016-11-15,-0.172166,-0.433167
2016-11-16,0.203322,0.025313
2016-11-17,0.316484,1.513948


Showing label slicing, both endpoints are included:

In [38]:
df.loc['20161112':'20161114',['A','B']]

Unnamed: 0,A,B
2016-11-12,-1.685025,-0.671649
2016-11-13,1.004315,-1.022681
2016-11-14,-0.324606,0.591878


Reduction in the dimensions of the returned object:

In [39]:
df.loc['20161112',['A','B']]

A   -1.685025
B   -0.671649
Name: 2016-11-12 00:00:00, dtype: float64

For getting a scalar value:

In [40]:
df.loc[dates[0],'A']

-1.6850253328968465

For getting fast access to a scalar value (equivalent to the prior method):

In [41]:
df.at[dates[0],'A']

-1.6850253328968465

### Selection by Position

Select via the position of the passed integers:

In [42]:
df.iloc[3]

A   -0.172166
B   -0.433167
C    1.922695
D    0.489763
Name: 2016-11-15 00:00:00, dtype: float64

By integer slices, acting similar to the numpy/python style:

In [43]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2016-11-15,-0.172166,-0.433167
2016-11-16,0.203322,0.025313


By lists of integer position locations, similar to the numpy/python style:

In [44]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2016-11-13,1.004315,1.236683
2016-11-14,-0.324606,1.152705
2016-11-16,0.203322,0.77607


For slicing rows explicitly:

In [45]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2016-11-13,1.004315,-1.022681,1.236683,1.791178
2016-11-14,-0.324606,0.591878,1.152705,-0.595503


For slicing columns explicitly:

In [46]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2016-11-12,-0.671649,-0.806832
2016-11-13,-1.022681,1.236683
2016-11-14,0.591878,1.152705
2016-11-15,-0.433167,1.922695
2016-11-16,0.025313,0.77607
2016-11-17,1.513948,-0.498564


For getting a value explicitly:

In [47]:
df.iloc[1,1]

-1.0226806576159055

For getting fast access to a scalar (equivalent to the prior method):

In [48]:
df.iat[1,1]

-1.0226806576159055

### Boolean Indexing

Using a single column's values to select data:

In [49]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2016-11-13,1.004315,-1.022681,1.236683,1.791178
2016-11-16,0.203322,0.025313,0.77607,0.624141
2016-11-17,0.316484,1.513948,-0.498564,-0.023732


A where operation for getting:

In [50]:
df[df > 0]

Unnamed: 0,A,B,C,D
2016-11-12,,,,
2016-11-13,1.004315,,1.236683,1.791178
2016-11-14,,0.591878,1.152705,
2016-11-15,,,1.922695,0.489763
2016-11-16,0.203322,0.025313,0.77607,0.624141
2016-11-17,0.316484,1.513948,,


Using the isin() method for filtering:

In [52]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2016-11-12,-1.685025,-0.671649,-0.806832,-0.619825,one
2016-11-13,1.004315,-1.022681,1.236683,1.791178,one
2016-11-14,-0.324606,0.591878,1.152705,-0.595503,two
2016-11-15,-0.172166,-0.433167,1.922695,0.489763,three
2016-11-16,0.203322,0.025313,0.77607,0.624141,four
2016-11-17,0.316484,1.513948,-0.498564,-0.023732,three


In [53]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2016-11-14,-0.324606,0.591878,1.152705,-0.595503,two
2016-11-16,0.203322,0.025313,0.77607,0.624141,four


### Setting

Setting a new column automatically aligns the data by the indexes:

In [55]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20161112', periods=6))
s1

2016-11-12    1
2016-11-13    2
2016-11-14    3
2016-11-15    4
2016-11-16    5
2016-11-17    6
Freq: D, dtype: int64

In [56]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2016-11-12,-1.685025,-0.671649,-0.806832,-0.619825,1
2016-11-13,1.004315,-1.022681,1.236683,1.791178,2
2016-11-14,-0.324606,0.591878,1.152705,-0.595503,3
2016-11-15,-0.172166,-0.433167,1.922695,0.489763,4
2016-11-16,0.203322,0.025313,0.77607,0.624141,5
2016-11-17,0.316484,1.513948,-0.498564,-0.023732,6


Setting values by label:

In [58]:
df.at[dates[0],'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,-0.671649,-0.806832,-0.619825,1
2016-11-13,1.004315,-1.022681,1.236683,1.791178,2
2016-11-14,-0.324606,0.591878,1.152705,-0.595503,3
2016-11-15,-0.172166,-0.433167,1.922695,0.489763,4
2016-11-16,0.203322,0.025313,0.77607,0.624141,5
2016-11-17,0.316484,1.513948,-0.498564,-0.023732,6


Setting values by position:

In [59]:
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,-0.806832,-0.619825,1
2016-11-13,1.004315,-1.022681,1.236683,1.791178,2
2016-11-14,-0.324606,0.591878,1.152705,-0.595503,3
2016-11-15,-0.172166,-0.433167,1.922695,0.489763,4
2016-11-16,0.203322,0.025313,0.77607,0.624141,5
2016-11-17,0.316484,1.513948,-0.498564,-0.023732,6


Setting by assigning with a numpy array:

In [60]:
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,-0.806832,5,1
2016-11-13,1.004315,-1.022681,1.236683,5,2
2016-11-14,-0.324606,0.591878,1.152705,5,3
2016-11-15,-0.172166,-0.433167,1.922695,5,4
2016-11-16,0.203322,0.025313,0.77607,5,5
2016-11-17,0.316484,1.513948,-0.498564,5,6


A where operation with setting:

In [61]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,-0.806832,-5,-1
2016-11-13,-1.004315,-1.022681,-1.236683,-5,-2
2016-11-14,-0.324606,-0.591878,-1.152705,-5,-3
2016-11-15,-0.172166,-0.433167,-1.922695,-5,-4
2016-11-16,-0.203322,-0.025313,-0.77607,-5,-5
2016-11-17,-0.316484,-1.513948,-0.498564,-5,-6


## Missing Data