In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

I'm working through the content at http://pandas.pydata.org/pandas-docs/stable/10min.html, for muscle memory.

# Object creation

In [4]:
# A single np.nan among the integers makes pandas store the Series as float64.
s = pd.Series([1,2,3,np.nan,6,8])
s

0     1
1     2
2     3
3   NaN
4     6
5     8
dtype: float64

In [5]:
type(s)

pandas.core.series.Series

In [6]:
# Six consecutive daily timestamps starting 2013-01-01; reused below as the DataFrame index.
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D', tz=None)

In [7]:
type(dates)

pandas.tseries.index.DatetimeIndex

In [14]:
# Draw the 6x4 standard-normal sample from a privately seeded generator so that
# Restart & Run All reproduces the same frame (the global NumPy RNG is left alone).
# NOTE: outputs saved before the seed was introduced will differ until the notebook is re-run.
df = pd.DataFrame(np.random.RandomState(0).randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.431503,-0.387704,1.207377,-1.24287
2013-01-02,-0.660497,-0.403903,1.484807,0.08657
2013-01-03,-0.227597,0.26351,2.857516,0.210785
2013-01-04,0.122773,1.125534,1.687931,-0.159818
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253
2013-01-06,1.189294,-1.871669,0.393592,1.153088


In [36]:
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [17]:
# Column spec for a mixed-dtype frame: scalars ('A', 'B', 'F') sit next to
# length-4 array-likes ('C', 'D', 'E').
d = dict(
    A=1.0,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.array([3, 3, 3, 3], dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
)

In [20]:
d

{'A': 1.0, 'B': Timestamp('2013-01-02 00:00:00'), 'C': 0    1
 1    1
 2    1
 3    1
 dtype: float32, 'D': array([3, 3, 3, 3], dtype=int32), 'E': [test, train, test, train]
 Categories (2, object): [test, train], 'F': 'foo'}

In [31]:
d['E']

[test, train, test, train]
Categories (2, object): [test, train]

In [33]:
# The scalar entries of d ('A', 'B', 'F') are broadcast to all four rows.
df2 = pd.DataFrame(d)
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1,3,test,foo
1,1,2013-01-02,1,3,train,foo
2,1,2013-01-02,1,3,test,foo
3,1,2013-01-02,1,3,train,foo


In [35]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing data

In [37]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.431503,-0.387704,1.207377,-1.24287
2013-01-02,-0.660497,-0.403903,1.484807,0.08657
2013-01-03,-0.227597,0.26351,2.857516,0.210785
2013-01-04,0.122773,1.125534,1.687931,-0.159818
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253


In [38]:
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,-0.660497,-0.403903,1.484807,0.08657
2013-01-03,-0.227597,0.26351,2.857516,0.210785
2013-01-04,0.122773,1.125534,1.687931,-0.159818
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253
2013-01-06,1.189294,-1.871669,0.393592,1.153088


In [39]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D', tz=None)

In [40]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [41]:
df.values

array([[-0.43150339, -0.387704  ,  1.20737713, -1.24287025],
       [-0.66049673, -0.40390274,  1.48480714,  0.08657012],
       [-0.22759732,  0.26350966,  2.85751615,  0.21078501],
       [ 0.12277265,  1.12553445,  1.68793053, -0.15981842],
       [-1.410673  ,  0.2659451 , -0.05162857, -0.52725272],
       [ 1.18929429, -1.87166889,  0.39359218,  1.1530879 ]])

In [42]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.236367,-0.168048,1.263266,-0.079916
std,0.867009,1.0054,1.025618,0.798934
min,-1.410673,-1.871669,-0.051629,-1.24287
25%,-0.603248,-0.399853,0.597038,-0.435394
50%,-0.32955,-0.062097,1.346092,-0.036624
75%,0.03518,0.265336,1.63715,0.179731
max,1.189294,1.125534,2.857516,1.153088


In [43]:
df['A']

2013-01-01   -0.431503
2013-01-02   -0.660497
2013-01-03   -0.227597
2013-01-04    0.122773
2013-01-05   -1.410673
2013-01-06    1.189294
Freq: D, Name: A, dtype: float64

In [46]:
df.A.median()

-0.32955035443655295

In [49]:
# Mean of the 3rd and 4th values in index (unsorted) order. Use .iloc for the
# positional lookups: plain [] on a Series with a DatetimeIndex is label-based,
# and its integer-position fallback is deprecated in modern pandas.
(df.A.iloc[2] + df.A.iloc[3])/2

-0.052412337808063925

In [50]:
sorted(df.A)

[-1.4106729997610614,
 -0.66049672961771222,
 -0.43150338510815839,
 -0.22759732376494754,
 0.12277264814881969,
 1.1892942947114351]

In [51]:
# Averaging the two middle values of the *sorted* column reproduces df.A.median().
(sorted(df.A)[2] + sorted(df.A)[3])/2

-0.32955035443655295

Transpose

In [53]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.431503,-0.660497,-0.227597,0.122773,-1.410673,1.189294
B,-0.387704,-0.403903,0.26351,1.125534,0.265945,-1.871669
C,1.207377,1.484807,2.857516,1.687931,-0.051629,0.393592
D,-1.24287,0.08657,0.210785,-0.159818,-0.527253,1.153088


Sort by an axis. With `axis=1` the sort is applied to the column labels (the headers), not to the values; `ascending=False` puts them in reverse order (D, C, B, A).

In [56]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.24287,1.207377,-0.387704,-0.431503
2013-01-02,0.08657,1.484807,-0.403903,-0.660497
2013-01-03,0.210785,2.857516,0.26351,-0.227597
2013-01-04,-0.159818,1.687931,1.125534,0.122773
2013-01-05,-0.527253,-0.051629,0.265945,-1.410673
2013-01-06,1.153088,0.393592,-1.871669,1.189294


Sort by the values (in a particular column)

In [59]:
# Sort rows by the values in column 'B' (ascending). DataFrame.sort(columns=...)
# was deprecated and later removed from pandas; sort_values(by=...) is its replacement.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-06,1.189294,-1.871669,0.393592,1.153088
2013-01-02,-0.660497,-0.403903,1.484807,0.08657
2013-01-01,-0.431503,-0.387704,1.207377,-1.24287
2013-01-03,-0.227597,0.26351,2.857516,0.210785
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253
2013-01-04,0.122773,1.125534,1.687931,-0.159818


# Selection

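The doc says that while "standard Python/Numpy select/setting expressions are intuitive and come in handy for interactive work", for production code [they] recommend the optimized pandas data access methods: `.at`, `.iat`, `.loc`, `.iloc`, and `.ix`. (Note: `.ix` has since been deprecated and removed from pandas; use `.loc` / `.iloc` instead.)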

## Getting

In [61]:
df['A']

2013-01-01   -0.431503
2013-01-02   -0.660497
2013-01-03   -0.227597
2013-01-04    0.122773
2013-01-05   -1.410673
2013-01-06    1.189294
Freq: D, Name: A, dtype: float64

In [62]:
df.A


2013-01-01   -0.431503
2013-01-02   -0.660497
2013-01-03   -0.227597
2013-01-04    0.122773
2013-01-05   -1.410673
2013-01-06    1.189294
Freq: D, Name: A, dtype: float64

In [63]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.431503,-0.387704,1.207377,-1.24287
2013-01-02,-0.660497,-0.403903,1.484807,0.08657
2013-01-03,-0.227597,0.26351,2.857516,0.210785


In [65]:
df[1:2]

Unnamed: 0,A,B,C,D
2013-01-02,-0.660497,-0.403903,1.484807,0.08657


In [66]:
# Start equals stop, so the slice selects no rows; only the column labels remain.
df[2:2]

Unnamed: 0,A,B,C,D


In [68]:
df[3:]

Unnamed: 0,A,B,C,D
2013-01-04,0.122773,1.125534,1.687931,-0.159818
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253
2013-01-06,1.189294,-1.871669,0.393592,1.153088


In [72]:
df[:2]

Unnamed: 0,A,B,C,D
2013-01-01,-0.431503,-0.387704,1.207377,-1.24287
2013-01-02,-0.660497,-0.403903,1.484807,0.08657


In [64]:
# Slicing rows with date strings is label-based: both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.660497,-0.403903,1.484807,0.08657
2013-01-03,-0.227597,0.26351,2.857516,0.210785
2013-01-04,0.122773,1.125534,1.687931,-0.159818


## Selection by label

In [74]:
df.loc['20130101']

A   -0.431503
B   -0.387704
C    1.207377
D   -1.242870
Name: 2013-01-01 00:00:00, dtype: float64

In [75]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.431503,-0.387704
2013-01-02,-0.660497,-0.403903
2013-01-03,-0.227597,0.26351
2013-01-04,0.122773,1.125534
2013-01-05,-1.410673,0.265945
2013-01-06,1.189294,-1.871669


With slicing - 'label slicing' - both endpoints are included.

In [82]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.660497,-0.403903
2013-01-03,-0.227597,0.26351
2013-01-04,0.122773,1.125534


In [83]:
df.loc['20130102',['A','B']]

A   -0.660497
B   -0.403903
Name: 2013-01-02 00:00:00, dtype: float64

In [84]:
type(df.loc['20130102',['A','B']])

pandas.core.series.Series

In [85]:
df.loc['20130101','A']

-0.43150338510815839

Same as previous method - get a scalar value.

In [89]:
df.at[dates[0],'A']

-0.43150338510815839

In [90]:
dates[0]

Timestamp('2013-01-01 00:00:00', offset='D')

In [92]:
pd.Timestamp('20130101')

Timestamp('2013-01-01 00:00:00')

In [93]:
df.at[pd.Timestamp('20130101'),'A']

-0.43150338510815839

## Selection by position

In [99]:
type(df.iloc[3])

pandas.core.series.Series

In [94]:
df.iloc[3]

A    0.122773
B    1.125534
C    1.687931
D   -0.159818
Name: 2013-01-04 00:00:00, dtype: float64

The previous cell uses positional indexing, like plain NumPy/Python indexing. Note the difference in return type: `df.iloc[3]` (a single position) returns a Series, while the slice `df[3:4]` below returns a one-row DataFrame.

In [98]:
df[3:4]

Unnamed: 0,A,B,C,D
2013-01-04,0.122773,1.125534,1.687931,-0.159818


In [100]:
type(df[3:4])

pandas.core.frame.DataFrame

In [101]:
df.iloc[3:5]

Unnamed: 0,A,B,C,D
2013-01-04,0.122773,1.125534,1.687931,-0.159818
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253


In [102]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.122773,1.125534
2013-01-05,-1.410673,0.265945


In [103]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.660497,1.484807
2013-01-03,-0.227597,2.857516
2013-01-05,-1.410673,-0.051629


In [104]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.660497,-0.403903,1.484807,0.08657
2013-01-03,-0.227597,0.26351,2.857516,0.210785


In [105]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.387704,1.207377
2013-01-02,-0.403903,1.484807
2013-01-03,0.26351,2.857516
2013-01-04,1.125534,1.687931
2013-01-05,0.265945,-0.051629
2013-01-06,-1.871669,0.393592


In [106]:
df.iloc[1,1]

-0.40390274008578286

Fast access - same result as previous.

In [107]:
df.iat[1,1]

-0.40390274008578286

## Boolean indexing

In [108]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-04,0.122773,1.125534,1.687931,-0.159818
2013-01-06,1.189294,-1.871669,0.393592,1.153088


In [109]:
# Indexing with a boolean frame keeps the entries where the condition holds and shows NaN elsewhere.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,1.207377,
2013-01-02,,,1.484807,0.08657
2013-01-03,,0.26351,2.857516,0.210785
2013-01-04,0.122773,1.125534,1.687931,
2013-01-05,,0.265945,,
2013-01-06,1.189294,,0.393592,1.153088


In [110]:
# Work on a copy so that adding column 'E' leaves df itself unchanged.
dfcopy = df.copy()
dfcopy['E'] = ['one','one','two','three','four','three']
dfcopy

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.431503,-0.387704,1.207377,-1.24287,one
2013-01-02,-0.660497,-0.403903,1.484807,0.08657,one
2013-01-03,-0.227597,0.26351,2.857516,0.210785,two
2013-01-04,0.122773,1.125534,1.687931,-0.159818,three
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253,four
2013-01-06,1.189294,-1.871669,0.393592,1.153088,three


In [111]:
# isin() builds a boolean mask: keep only the rows whose 'E' is 'two' or 'four'.
dfcopy[dfcopy['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.227597,0.26351,2.857516,0.210785,two
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253,four


## Setting

In [113]:
# Index runs 2013-01-02 .. 2013-01-07, i.e. shifted one day relative to df's index.
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [114]:
# Assignment aligns s1 on df's index: 2013-01-01 has no match (NaN) and 2013-01-07 is dropped.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.431503,-0.387704,1.207377,-1.24287,
2013-01-02,-0.660497,-0.403903,1.484807,0.08657,1.0
2013-01-03,-0.227597,0.26351,2.857516,0.210785,2.0
2013-01-04,0.122773,1.125534,1.687931,-0.159818,3.0
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253,4.0
2013-01-06,1.189294,-1.871669,0.393592,1.153088,5.0


In [115]:
# Set a single cell by label (row 2013-01-01, column 'A').
df.at[pd.Timestamp('20130101'),'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.387704,1.207377,-1.24287,
2013-01-02,-0.660497,-0.403903,1.484807,0.08657,1.0
2013-01-03,-0.227597,0.26351,2.857516,0.210785,2.0
2013-01-04,0.122773,1.125534,1.687931,-0.159818,3.0
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253,4.0
2013-01-06,1.189294,-1.871669,0.393592,1.153088,5.0


In [116]:
# Set a single cell by integer position (row 0, column 1 -> 'B').
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.207377,-1.24287,
2013-01-02,-0.660497,-0.403903,1.484807,0.08657,1.0
2013-01-03,-0.227597,0.26351,2.857516,0.210785,2.0
2013-01-04,0.122773,1.125534,1.687931,-0.159818,3.0
2013-01-05,-1.410673,0.265945,-0.051629,-0.527253,4.0
2013-01-06,1.189294,-1.871669,0.393592,1.153088,5.0


In [118]:
# Overwrite the whole 'D' column with a NumPy array of 5s, one per row.
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.207377,5,
2013-01-02,-0.660497,-0.403903,1.484807,5,1.0
2013-01-03,-0.227597,0.26351,2.857516,5,2.0
2013-01-04,0.122773,1.125534,1.687931,5,3.0
2013-01-05,-1.410673,0.265945,-0.051629,5,4.0
2013-01-06,1.189294,-1.871669,0.393592,5,5.0


In [120]:
# Flip the sign of every positive entry so no value in the copy is positive;
# the NaN in 'F' is left as-is because NaN > 0 is False.
dfcopy2 = df.copy()
dfcopy2[dfcopy2 > 0] = -dfcopy2
dfcopy2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.207377,-5,
2013-01-02,-0.660497,-0.403903,-1.484807,-5,-1.0
2013-01-03,-0.227597,-0.26351,-2.857516,-5,-2.0
2013-01-04,-0.122773,-1.125534,-1.687931,-5,-3.0
2013-01-05,-1.410673,-0.265945,-0.051629,-5,-4.0
2013-01-06,-1.189294,-1.871669,-0.393592,-5,-5.0


# Missing data