In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
# np.nan marks a missing entry; the resulting Series is float64.
s = pd.Series(data=[1, 3, 4, np.nan, 6, 9])
s

0    1.0
1    3.0
2    4.0
3    NaN
4    6.0
5    9.0
dtype: float64

Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:



In [3]:
# 31 consecutive daily timestamps starting on 1 January 2017.
dates = pd.date_range(start='2017-01-01', periods=31, freq='D')
dates

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
               '2017-01-13', '2017-01-14', '2017-01-15', '2017-01-16',
               '2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20',
               '2017-01-21', '2017-01-22', '2017-01-23', '2017-01-24',
               '2017-01-25', '2017-01-26', '2017-01-27', '2017-01-28',
               '2017-01-29', '2017-01-30', '2017-01-31'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Seed the global NumPy RNG inside this cell so that re-running it (or doing
# Restart & Run All) always produces the same random values for df.
np.random.seed(0)
# 31 rows (one per entry in dates) x 4 columns of standard-normal samples.
df = pd.DataFrame(np.random.randn(31, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2017-01-01,-0.120146,-1.150047,0.944391,1.925887
2017-01-02,-0.013463,0.854475,-1.330239,1.338543
2017-01-03,2.143334,-1.114504,-1.274347,1.583937
2017-01-04,-0.745106,1.007129,0.016284,1.604562
2017-01-05,0.400902,1.716986,0.662867,0.579826
2017-01-06,1.309559,0.16085,1.000085,1.331631
2017-01-07,0.123256,-0.813503,0.229567,0.426556
2017-01-08,-1.444094,0.595862,0.47069,-1.080901
2017-01-09,0.757915,0.423425,0.33133,-0.148731
2017-01-10,-0.637207,1.14149,0.852758,-0.553196


Creating a DataFrame by passing a dict of objects that can each be converted to a series-like structure:

In [6]:
# Scalar entries ('A', 'B', 'F') are repeated on every row; the array-like
# entries ('C', 'D', 'E') each supply four values, one per row.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20170523'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2017-05-23,1.0,3,test,foo
1,1.0,2017-05-23,1.0,3,train,foo
2,1.0,2017-05-23,1.0,3,test,foo
3,1.0,2017-05-23,1.0,3,train,foo


In [7]:
# The columns of df2 were built from different kinds of objects, so the
# frame holds several different dtypes at once.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [8]:
# With no argument, head() returns the first five rows.
df.head()

Unnamed: 0,A,B,C,D
2017-01-01,-0.120146,-1.150047,0.944391,1.925887
2017-01-02,-0.013463,0.854475,-1.330239,1.338543
2017-01-03,2.143334,-1.114504,-1.274347,1.583937
2017-01-04,-0.745106,1.007129,0.016284,1.604562
2017-01-05,0.400902,1.716986,0.662867,0.579826


In [9]:
# Last five rows of the frame.
df.tail(5)

Unnamed: 0,A,B,C,D
2017-01-27,-0.641886,0.752916,-2.017367,0.177929
2017-01-28,-1.709508,0.359093,-0.649446,0.423997
2017-01-29,0.104982,0.784109,0.072509,0.434293
2017-01-30,0.56211,0.884839,-0.266268,0.43597
2017-01-31,0.289706,-0.661967,1.206517,-0.336566


Display the index, columns, and the underlying numpy data

In [10]:
# Row labels: the DatetimeIndex that was passed as index=dates.
df.index

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
               '2017-01-13', '2017-01-14', '2017-01-15', '2017-01-16',
               '2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20',
               '2017-01-21', '2017-01-22', '2017-01-23', '2017-01-24',
               '2017-01-25', '2017-01-26', '2017-01-27', '2017-01-28',
               '2017-01-29', '2017-01-30', '2017-01-31'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# The frame's data as a plain 2-D numpy array, without the row/column labels.
df.values

array([[-0.12014576, -1.15004743,  0.9443911 ,  1.92588747],
       [-0.01346345,  0.85447542, -1.330239  ,  1.33854251],
       [ 2.14333442, -1.11450382, -1.27434681,  1.58393658],
       [-0.74510551,  1.007129  ,  0.01628352,  1.6045624 ],
       [ 0.40090154,  1.71698551,  0.66286681,  0.57982646],
       [ 1.30955927,  0.16084987,  1.00008463,  1.33163101],
       [ 0.12325577, -0.81350293,  0.22956708,  0.42655593],
       [-1.44409425,  0.59586189,  0.47069047, -1.08090078],
       [ 0.7579147 ,  0.42342461,  0.33132965, -0.14873059],
       [-0.63720679,  1.14149042,  0.85275843, -0.5531962 ],
       [-0.96308687,  1.30788543,  1.70680928, -0.60882984],
       [ 1.07070465,  0.92994773,  1.67420862, -1.07006613],
       [ 0.17109685,  0.02331145,  0.1480088 ,  1.1972427 ],
       [ 0.44721178, -2.09859996, -0.12888246, -0.29019952],
       [-0.06251828,  1.18700416, -0.66754787,  1.18014762],
       [ 1.38145414,  0.5369406 ,  0.30463292,  0.87573524],
       [ 2.257726  , -1.

In [15]:
# len() of the 2-D array counts its rows: one per date in the index.
len(df.values)

31

```describe()``` shows a quick statistical summary of the data

In [17]:
# Per-column count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,31.0,31.0,31.0,31.0
mean,0.199807,0.23535,0.105457,0.283553
std,1.101262,0.994958,0.980875,0.821788
min,-2.698669,-2.0986,-2.017367,-1.080901
25%,-0.160934,-0.187377,-0.658497,-0.313383
50%,0.217551,0.423425,0.148009,0.226685
75%,0.546113,0.968538,0.862777,0.878872
max,2.503535,1.716986,1.706809,1.925887


Transposing data

In [18]:
# Swap rows and columns: the dates become column labels and A-D become the rows.
df.T

Unnamed: 0,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-01-22 00:00:00,2017-01-23 00:00:00,2017-01-24 00:00:00,2017-01-25 00:00:00,2017-01-26 00:00:00,2017-01-27 00:00:00,2017-01-28 00:00:00,2017-01-29 00:00:00,2017-01-30 00:00:00,2017-01-31 00:00:00
A,-0.120146,-0.013463,2.143334,-0.745106,0.400902,1.309559,0.123256,-1.444094,0.757915,-0.637207,...,2.503535,-0.059287,0.217551,0.514586,0.530116,-0.641886,-1.709508,0.104982,0.56211,0.289706
B,-1.150047,0.854475,-1.114504,1.007129,1.716986,0.16085,-0.813503,0.595862,0.423425,1.14149,...,0.051671,-0.279854,-0.094901,0.018371,1.49291,0.752916,0.359093,0.784109,0.884839,-0.661967
C,0.944391,-1.330239,-1.274347,0.016284,0.662867,1.000085,0.229567,0.47069,0.33133,0.852758,...,0.776924,0.135796,0.883935,1.657045,-1.125627,-2.017367,-0.649446,0.072509,-0.266268,1.206517
D,1.925887,1.338543,1.583937,1.604562,0.579826,1.331631,0.426556,-1.080901,-0.148731,-0.553196,...,0.080317,0.226685,-0.290047,-0.804207,0.59991,0.177929,0.423997,0.434293,0.43597,-0.336566


Sorting by an axis

In [19]:
# axis=1 sorts by column label; ascending=False yields the order D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2017-01-01,1.925887,0.944391,-1.150047,-0.120146
2017-01-02,1.338543,-1.330239,0.854475,-0.013463
2017-01-03,1.583937,-1.274347,-1.114504,2.143334
2017-01-04,1.604562,0.016284,1.007129,-0.745106
2017-01-05,0.579826,0.662867,1.716986,0.400902
2017-01-06,1.331631,1.000085,0.16085,1.309559
2017-01-07,0.426556,0.229567,-0.813503,0.123256
2017-01-08,-1.080901,0.47069,0.595862,-1.444094
2017-01-09,-0.148731,0.33133,0.423425,0.757915
2017-01-10,-0.553196,0.852758,1.14149,-0.637207


Sorting by values

In [21]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2017-01-14,0.447212,-2.0986,-0.128882,-0.2902
2017-01-17,2.257726,-1.844896,-0.910866,-0.085011
2017-01-21,0.297908,-1.285342,-0.006704,0.882009
2017-01-01,-0.120146,-1.150047,0.944391,1.925887
2017-01-03,2.143334,-1.114504,-1.274347,1.583937
2017-01-07,0.123256,-0.813503,0.229567,0.426556
2017-01-31,0.289706,-0.661967,1.206517,-0.336566
2017-01-23,-0.059287,-0.279854,0.135796,0.226685
2017-01-24,0.217551,-0.094901,0.883935,-0.290047
2017-01-20,-2.698669,-0.053443,-1.598128,-0.490637


# Selection
- ```.at```
- ```.iat```
- ```.loc```
- ```.iloc```
- ```.ix``` (deprecated since pandas 0.20 and removed in pandas 1.0; use ```.loc``` or ```.iloc``` instead)


## Getting
Selecting a single column, which yields a ```Series```, equivalent to ```df.A```

In [22]:
# Bracket access with a column label returns that column as a Series.
df['A']

2017-01-01   -0.120146
2017-01-02   -0.013463
2017-01-03    2.143334
2017-01-04   -0.745106
2017-01-05    0.400902
2017-01-06    1.309559
2017-01-07    0.123256
2017-01-08   -1.444094
2017-01-09    0.757915
2017-01-10   -0.637207
2017-01-11   -0.963087
2017-01-12    1.070705
2017-01-13    0.171097
2017-01-14    0.447212
2017-01-15   -0.062518
2017-01-16    1.381454
2017-01-17    2.257726
2017-01-18   -0.201722
2017-01-19    0.407049
2017-01-20   -2.698669
2017-01-21    0.297908
2017-01-22    2.503535
2017-01-23   -0.059287
2017-01-24    0.217551
2017-01-25    0.514586
2017-01-26    0.530116
2017-01-27   -0.641886
2017-01-28   -1.709508
2017-01-29    0.104982
2017-01-30    0.562110
2017-01-31    0.289706
Freq: D, Name: A, dtype: float64

In [23]:
# Attribute access: same result as df['A'].
df.A

2017-01-01   -0.120146
2017-01-02   -0.013463
2017-01-03    2.143334
2017-01-04   -0.745106
2017-01-05    0.400902
2017-01-06    1.309559
2017-01-07    0.123256
2017-01-08   -1.444094
2017-01-09    0.757915
2017-01-10   -0.637207
2017-01-11   -0.963087
2017-01-12    1.070705
2017-01-13    0.171097
2017-01-14    0.447212
2017-01-15   -0.062518
2017-01-16    1.381454
2017-01-17    2.257726
2017-01-18   -0.201722
2017-01-19    0.407049
2017-01-20   -2.698669
2017-01-21    0.297908
2017-01-22    2.503535
2017-01-23   -0.059287
2017-01-24    0.217551
2017-01-25    0.514586
2017-01-26    0.530116
2017-01-27   -0.641886
2017-01-28   -1.709508
2017-01-29    0.104982
2017-01-30    0.562110
2017-01-31    0.289706
Freq: D, Name: A, dtype: float64

Selecting via ```[]```, which slices the rows.

In [24]:
# An integer slice inside [] selects rows by position; the stop (3) is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2017-01-01,-0.120146,-1.150047,0.944391,1.925887
2017-01-02,-0.013463,0.854475,-1.330239,1.338543
2017-01-03,2.143334,-1.114504,-1.274347,1.583937


In [25]:
# A slice of date strings inside [] selects rows by label; both endpoints
# (Jan 2 and Jan 11) are part of the result.
df['20170102':'20170111']

Unnamed: 0,A,B,C,D
2017-01-02,-0.013463,0.854475,-1.330239,1.338543
2017-01-03,2.143334,-1.114504,-1.274347,1.583937
2017-01-04,-0.745106,1.007129,0.016284,1.604562
2017-01-05,0.400902,1.716986,0.662867,0.579826
2017-01-06,1.309559,0.16085,1.000085,1.331631
2017-01-07,0.123256,-0.813503,0.229567,0.426556
2017-01-08,-1.444094,0.595862,0.47069,-1.080901
2017-01-09,0.757915,0.423425,0.33133,-0.148731
2017-01-10,-0.637207,1.14149,0.852758,-0.553196
2017-01-11,-0.963087,1.307885,1.706809,-0.60883


## Selection by Label
For getting a cross section using a label

In [27]:
# Show the frame again as a reference for the label-based selections that follow.
df

Unnamed: 0,A,B,C,D
2017-01-01,-0.120146,-1.150047,0.944391,1.925887
2017-01-02,-0.013463,0.854475,-1.330239,1.338543
2017-01-03,2.143334,-1.114504,-1.274347,1.583937
2017-01-04,-0.745106,1.007129,0.016284,1.604562
2017-01-05,0.400902,1.716986,0.662867,0.579826
2017-01-06,1.309559,0.16085,1.000085,1.331631
2017-01-07,0.123256,-0.813503,0.229567,0.426556
2017-01-08,-1.444094,0.595862,0.47069,-1.080901
2017-01-09,0.757915,0.423425,0.33133,-0.148731
2017-01-10,-0.637207,1.14149,0.852758,-0.553196


In [26]:
# A single row label returns that row as a Series indexed by the column names.
df.loc[dates[0]]

A   -0.120146
B   -1.150047
C    0.944391
D    1.925887
Name: 2017-01-01 00:00:00, dtype: float64

Selecting on multiple axes by label

In [28]:
# ':' keeps every row; the list picks columns A and B by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2017-01-01,-0.120146,-1.150047
2017-01-02,-0.013463,0.854475
2017-01-03,2.143334,-1.114504
2017-01-04,-0.745106,1.007129
2017-01-05,0.400902,1.716986
2017-01-06,1.309559,0.16085
2017-01-07,0.123256,-0.813503
2017-01-08,-1.444094,0.595862
2017-01-09,0.757915,0.423425
2017-01-10,-0.637207,1.14149


Showing label slicing; both endpoints are included

In [29]:
# Row slice by date label combined with a column list; with label slicing the
# end label (Jan 24) is included in the result.
df.loc['20170105':'20170124', ['A', 'B']]

Unnamed: 0,A,B
2017-01-05,0.400902,1.716986
2017-01-06,1.309559,0.16085
2017-01-07,0.123256,-0.813503
2017-01-08,-1.444094,0.595862
2017-01-09,0.757915,0.423425
2017-01-10,-0.637207,1.14149
2017-01-11,-0.963087,1.307885
2017-01-12,1.070705,0.929948
2017-01-13,0.171097,0.023311
2017-01-14,0.447212,-2.0986


Reduction in the dimensions of the returned object

In [None]:
# Passing a single row label (rather than a slice) together with a list of
# columns drops the row axis, so the result is a Series instead of a DataFrame.
df.loc['20170102', ['A', 'B']]