# 10 minutes to pandas

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# object creation

In [4]:
# A Series is a labelled 1-D array; the np.nan entry marks a missing value,
# and the result is stored as float64.
s = pd.Series(data=[1, 2, 4, np.nan, 6, 8])
s

0    1.0
1    2.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Six consecutive calendar days starting 2013-01-01; used as the row index of df.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# 6x4 frame of random normal draws: rows labelled by `dates`, columns 'A'..'D'.
# No seed is set in the cells shown, so the numbers change on every fresh run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.659954,2.476924,-0.269991,1.732534
2013-01-02,0.397296,-0.547678,0.426729,-0.273487
2013-01-03,0.8342,-1.112613,-0.729881,0.703685
2013-01-04,2.079416,-1.080034,-0.046418,0.442252
2013-01-05,-0.931143,0.374811,-0.016005,0.323205
2013-01-06,-1.396549,0.140308,-2.133768,-1.297131


In [10]:
# Build a frame from a dict of columns: scalar entries are repeated on every
# row, and each column keeps its own dtype.
df2 = pd.DataFrame({
    'A': 1.0,                                                  # float scalar
    'B': pd.Timestamp('20130102'),                             # single timestamp
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),  # explicit float32 Series
    'D': np.array([3, 3, 3, 3], dtype='int32'),                # explicit int32 array
    'F': 'foostring',                                          # string scalar
})
df2

Unnamed: 0,A,B,C,D,F
0,1.0,2013-01-02,1.0,3,foostring
1,1.0,2013-01-02,1.0,3,foostring
2,1.0,2013-01-02,1.0,3,foostring
3,1.0,2013-01-02,1.0,3,foostring


In [11]:
# Preview the top of the frame: with no argument, head() shows the first five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.659954,2.476924,-0.269991,1.732534
2013-01-02,0.397296,-0.547678,0.426729,-0.273487
2013-01-03,0.8342,-1.112613,-0.729881,0.703685
2013-01-04,2.079416,-1.080034,-0.046418,0.442252
2013-01-05,-0.931143,0.374811,-0.016005,0.323205


In [12]:
# Preview the bottom of the frame: the last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,2.079416,-1.080034,-0.046418,0.442252
2013-01-05,-0.931143,0.374811,-0.016005,0.323205
2013-01-06,-1.396549,0.140308,-2.133768,-1.297131


In [13]:
# Row labels: the DatetimeIndex that was passed in as `dates`.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
# Underlying data as a plain NumPy array (row and column labels are dropped).
# NOTE(review): recent pandas documentation recommends DataFrame.to_numpy()
# over .values -- confirm the pandas version in use before switching.
df.values

array([[-0.65995362,  2.47692431, -0.26999101,  1.73253438],
       [ 0.39729616, -0.54767821,  0.42672917, -0.27348736],
       [ 0.83419952, -1.11261284, -0.72988073,  0.70368526],
       [ 2.07941642, -1.08003434, -0.04641788,  0.44225188],
       [-0.93114341,  0.3748112 , -0.01600541,  0.32320511],
       [-1.3965494 ,  0.1403083 , -2.13376817, -1.29713107]])

In [18]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.053878,0.041953,-0.461556,0.271843
std,1.297167,1.340828,0.901653,1.011024
min,-1.396549,-1.112613,-2.133768,-1.297131
25%,-0.863346,-0.946945,-0.614908,-0.124314
50%,-0.131329,-0.203685,-0.158204,0.382728
75%,0.724974,0.316185,-0.023609,0.638327
max,2.079416,2.476924,0.426729,1.732534


In [19]:
# Transposing: swap rows and columns, so the dates become the column labels.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.659954,0.397296,0.8342,2.079416,-0.931143,-1.396549
B,2.476924,-0.547678,-1.112613,-1.080034,0.374811,0.140308
C,-0.269991,0.426729,-0.729881,-0.046418,-0.016005,-2.133768
D,1.732534,-0.273487,0.703685,0.442252,0.323205,-1.297131


In [26]:
# Sorting by axis labels -- axis=1 targets the column labels, and
# ascending=False puts them in reverse order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.732534,-0.269991,2.476924,-0.659954
2013-01-02,-0.273487,0.426729,-0.547678,0.397296
2013-01-03,0.703685,-0.729881,-1.112613,0.8342
2013-01-04,0.442252,-0.046418,-1.080034,2.079416
2013-01-05,0.323205,-0.016005,0.374811,-0.931143
2013-01-06,-1.297131,-2.133768,0.140308,-1.396549


In [27]:
# Or sort the rows by the values of a single column (here 'B', ascending):
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,0.8342,-1.112613,-0.729881,0.703685
2013-01-04,2.079416,-1.080034,-0.046418,0.442252
2013-01-02,0.397296,-0.547678,0.426729,-0.273487
2013-01-06,-1.396549,0.140308,-2.133768,-1.297131
2013-01-05,-0.931143,0.374811,-0.016005,0.323205
2013-01-01,-0.659954,2.476924,-0.269991,1.732534


# selection: getting, indexing, etc

In [29]:
# Select a single column by name; the result is a Series named 'A'.
df['A']

2013-01-01   -0.659954
2013-01-02    0.397296
2013-01-03    0.834200
2013-01-04    2.079416
2013-01-05   -0.931143
2013-01-06   -1.396549
Freq: D, Name: A, dtype: float64

In [30]:
# Slicing with [] selects rows by position: 0:3 gives the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.659954,2.476924,-0.269991,1.732534
2013-01-02,0.397296,-0.547678,0.426729,-0.273487
2013-01-03,0.8342,-1.112613,-0.729881,0.703685


In [34]:
# Label-based lookup of one row (here the first date), returned as a Series.
# Handy for grabbing one specific row (e.g. a single subject).
df.loc[dates[0]]

A   -0.659954
B    2.476924
C   -0.269991
D    1.732534
Name: 2013-01-01 00:00:00, dtype: float64

In [48]:
# All rows (:), restricted to columns 'A' and 'B', selected by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.659954,2.476924
2013-01-02,0.397296,-0.547678
2013-01-03,0.8342,-1.112613
2013-01-04,2.079416,-1.080034
2013-01-05,-0.931143,0.374811
2013-01-06,-1.396549,0.140308


In [49]:
# Label slices with .loc include both endpoints: rows 2013-01-02 through 2013-01-04.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.397296,-0.547678
2013-01-03,0.8342,-1.112613
2013-01-04,2.079416,-1.080034


In [50]:
# A single row label plus a list of columns reduces the result to a Series.
df.loc['20130102', ['A', 'B']]

A    0.397296
B   -0.547678
Name: 2013-01-02 00:00:00, dtype: float64

In [59]:
# Getting a scalar value by row label and column label:
df.loc[dates[0], 'A']

-0.65995362020939685

In [58]:
# And this is a fast way of doing so: .at is the scalar-only label accessor
# and returns the same value as the .loc lookup.
df.at[dates[0], 'A']

-0.65995362020939685

# selection by position

In [64]:
# The row at integer position 3, i.e. the fourth row (positions start at 0).
df.iloc[3]

A    2.079416
B   -1.080034
C   -0.046418
D    0.442252
Name: 2013-01-04 00:00:00, dtype: float64

In [66]:
# Integer slices, like numpy: the end is exclusive, so 3:5 selects rows 3 and 4
# and 0:2 selects the first two columns ('A' and 'B').
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,2.079416,-1.080034
2013-01-05,-0.931143,0.374811


In [68]:
# Rows at positions 1, 2 and 4; columns at positions 0 and 2 ('A' and 'C').
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.397296,0.426729
2013-01-03,0.8342,-0.729881
2013-01-05,-0.931143,-0.016005


In [69]:
# Rows at positions 1 and 2, with every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.397296,-0.547678,0.426729,-0.273487
2013-01-03,0.8342,-1.112613,-0.729881,0.703685


In [70]:
# Getting a scalar by position (row 1, column 1 -> 2013-01-02, 'B'):
df.iloc[1,1]

-0.54767821401071115

In [71]:
# Fast access to a scalar by position; returns the same value as df.iloc[1, 1]:
df.iat[1,1]

-0.54767821401071115

Note that `.at` and `.iat` are for accessing a single scalar value quickly. They are less general than `.loc` and `.iloc`, respectively, which can also select whole rows, columns, and slices.

# boolean indexing

In [72]:
# Keep only the rows where column A is positive.
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.397296,-0.547678,0.426729,-0.273487
2013-01-03,0.8342,-1.112613,-0.729881,0.703685
2013-01-04,2.079416,-1.080034,-0.046418,0.442252


In [73]:
# Element-wise mask: the shape is kept, and values that are not > 0 come back
# as missing (blank in the output below).
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,2.476924,,1.732534
2013-01-02,0.397296,,0.426729,
2013-01-03,0.8342,,,0.703685
2013-01-04,2.079416,,,0.442252
2013-01-05,,0.374811,,0.323205
2013-01-06,,0.140308,,


In [78]:
# For filtering, we use the isin() method. First make a copy of df with an
# extra label column 'E' to filter on.
# NOTE: this rebinds df2, which an earlier cell used for a different frame.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.659954,2.476924,-0.269991,1.732534,one
2013-01-02,0.397296,-0.547678,0.426729,-0.273487,one
2013-01-03,0.8342,-1.112613,-0.729881,0.703685,two
2013-01-04,2.079416,-1.080034,-0.046418,0.442252,three
2013-01-05,-0.931143,0.374811,-0.016005,0.323205,four
2013-01-06,-1.396549,0.140308,-2.133768,-1.297131,three


In [79]:
# Keep the rows whose 'E' label is one of the listed values.
df2[df2['E'].isin(['two', 'three'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.8342,-1.112613,-0.729881,0.703685,two
2013-01-04,2.079416,-1.080034,-0.046418,0.442252,three
2013-01-06,-1.396549,0.140308,-2.133768,-1.297131,three
