# 10 Minutes to pandas

In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

<p>Creating a Series by passing a list of values, letting pandas create a default integer index:</p>

In [140]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

<p>Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:</p>

In [141]:
dates = pd.date_range('20161112', periods=6)
dates

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [142]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2016-11-12,-0.439628,-1.28186,0.213516,-1.712296
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789
2016-11-14,-0.26915,-1.323284,2.059457,0.250285
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572
2016-11-16,-0.36059,-0.714773,1.325548,0.759038
2016-11-17,0.896416,0.565129,0.350664,0.434686


<p>Creating a DataFrame by passing a dict of objects that can be converted to series-like:</p>

In [143]:
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp(20161112),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
1,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo
2,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
3,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo


<p>Having specific dtypes:</p>

In [144]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

<p>See the top and bottom rows of the frame:</p>

In [145]:
df.head()

Unnamed: 0,A,B,C,D
2016-11-12,-0.439628,-1.28186,0.213516,-1.712296
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789
2016-11-14,-0.26915,-1.323284,2.059457,0.250285
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572
2016-11-16,-0.36059,-0.714773,1.325548,0.759038


In [146]:
df.tail(3)

Unnamed: 0,A,B,C,D
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572
2016-11-16,-0.36059,-0.714773,1.325548,0.759038
2016-11-17,0.896416,0.565129,0.350664,0.434686


<p>Display the index, columns, and the underlying numpy data:</p>

In [147]:
df.index

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [148]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [149]:
df.values

array([[-0.43962783, -1.28185959,  0.21351613, -1.71229618],
       [-1.05226741, -0.40967252, -0.60565599,  1.07778863],
       [-0.26914977, -1.32328449,  2.05945664,  0.25028455],
       [-0.89503807, -0.50055661, -0.3723767 ,  0.31557163],
       [-0.36059035, -0.71477266,  1.32554781,  0.75903782],
       [ 0.89641622,  0.56512907,  0.35066389,  0.43468562]])

<p>Describe shows a quick statistic summary of your data:</p>

In [150]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.353376,-0.610836,0.495192,0.187512
std,0.686792,0.693423,1.01992,0.98081
min,-1.052267,-1.323284,-0.605656,-1.712296
25%,-0.781186,-1.140088,-0.225903,0.266606
50%,-0.400109,-0.607665,0.28209,0.375129
75%,-0.29201,-0.432394,1.081827,0.67795
max,0.896416,0.565129,2.059457,1.077789


<p>Transposing your data:</p>

In [151]:
df.T

Unnamed: 0,2016-11-12 00:00:00,2016-11-13 00:00:00,2016-11-14 00:00:00,2016-11-15 00:00:00,2016-11-16 00:00:00,2016-11-17 00:00:00
A,-0.439628,-1.052267,-0.26915,-0.895038,-0.36059,0.896416
B,-1.28186,-0.409673,-1.323284,-0.500557,-0.714773,0.565129
C,0.213516,-0.605656,2.059457,-0.372377,1.325548,0.350664
D,-1.712296,1.077789,0.250285,0.315572,0.759038,0.434686


<p>Sorting by an axis:</p>

In [152]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2016-11-12,-1.712296,0.213516,-1.28186,-0.439628
2016-11-13,1.077789,-0.605656,-0.409673,-1.052267
2016-11-14,0.250285,2.059457,-1.323284,-0.26915
2016-11-15,0.315572,-0.372377,-0.500557,-0.895038
2016-11-16,0.759038,1.325548,-0.714773,-0.36059
2016-11-17,0.434686,0.350664,0.565129,0.896416


<p>Sorting by values:</p>

In [153]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2016-11-14,-0.26915,-1.323284,2.059457,0.250285
2016-11-12,-0.439628,-1.28186,0.213516,-1.712296
2016-11-16,-0.36059,-0.714773,1.325548,0.759038
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789
2016-11-17,0.896416,0.565129,0.350664,0.434686


## Selection

<p>See the indexing documentation Indexing and Selecting Data and MultiIndex/Advanced Indexing.</p>

### Getting

Selecting a single column, which yields a Series, equivalent to df.A:

In [154]:
df['A']

2016-11-12   -0.439628
2016-11-13   -1.052267
2016-11-14   -0.269150
2016-11-15   -0.895038
2016-11-16   -0.360590
2016-11-17    0.896416
Freq: D, Name: A, dtype: float64

<p>Selecting via [ ], which slices the rows.</p>

In [155]:
df[0:3]

Unnamed: 0,A,B,C,D
2016-11-12,-0.439628,-1.28186,0.213516,-1.712296
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789
2016-11-14,-0.26915,-1.323284,2.059457,0.250285


In [156]:
df['20161112':'20161114']

Unnamed: 0,A,B,C,D
2016-11-12,-0.439628,-1.28186,0.213516,-1.712296
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789
2016-11-14,-0.26915,-1.323284,2.059457,0.250285


### Selection by Label

For getting a cross section using a label:

In [157]:
df.loc[dates[0]]

A   -0.439628
B   -1.281860
C    0.213516
D   -1.712296
Name: 2016-11-12 00:00:00, dtype: float64

Selecting on a multi-axis by label:

In [158]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2016-11-12,-0.439628,-1.28186
2016-11-13,-1.052267,-0.409673
2016-11-14,-0.26915,-1.323284
2016-11-15,-0.895038,-0.500557
2016-11-16,-0.36059,-0.714773
2016-11-17,0.896416,0.565129


Showing label slicing, both endpoints are included:

In [159]:
df.loc['20161112':'20161114',['A','B']]

Unnamed: 0,A,B
2016-11-12,-0.439628,-1.28186
2016-11-13,-1.052267,-0.409673
2016-11-14,-0.26915,-1.323284


Reduction in the dimensions of the returned object:

In [160]:
df.loc['20161112',['A','B']]

A   -0.439628
B   -1.281860
Name: 2016-11-12 00:00:00, dtype: float64

For getting a scalar value:

In [161]:
df.loc[dates[0],'A']

-0.43962783231054658

For getting fast access to a scalar value (equivalent to the prior method):

In [162]:
df.at[dates[0],'A']

-0.43962783231054658

### Selection by Position

Select via the position of the passed integers:

In [163]:
df.iloc[3]

A   -0.895038
B   -0.500557
C   -0.372377
D    0.315572
Name: 2016-11-15 00:00:00, dtype: float64

By integer slices, acting similar to the numpy/python style:

In [164]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2016-11-15,-0.895038,-0.500557
2016-11-16,-0.36059,-0.714773


By lists of integer position locations, similar to the numpy/python style:

In [165]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2016-11-13,-1.052267,-0.605656
2016-11-14,-0.26915,2.059457
2016-11-16,-0.36059,1.325548


For slicing rows explicitly:

In [166]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789
2016-11-14,-0.26915,-1.323284,2.059457,0.250285


For slicing columns explicitly:

In [167]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2016-11-12,-1.28186,0.213516
2016-11-13,-0.409673,-0.605656
2016-11-14,-1.323284,2.059457
2016-11-15,-0.500557,-0.372377
2016-11-16,-0.714773,1.325548
2016-11-17,0.565129,0.350664


For getting a value explicitly:

In [168]:
df.iloc[1,1]

-0.40967251646225655

For getting fast access to a scalar (equivalent to the prior method):

In [169]:
df.iat[1,1]

-0.40967251646225655

### Boolean Indexing

Using a single column's values to select data:

In [170]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2016-11-17,0.896416,0.565129,0.350664,0.434686


A where operation for getting:

In [171]:
df[df > 0]

Unnamed: 0,A,B,C,D
2016-11-12,,,0.213516,
2016-11-13,,,,1.077789
2016-11-14,,,2.059457,0.250285
2016-11-15,,,,0.315572
2016-11-16,,,1.325548,0.759038
2016-11-17,0.896416,0.565129,0.350664,0.434686


Using the isin() method for filtering:

In [172]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2016-11-12,-0.439628,-1.28186,0.213516,-1.712296,one
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789,one
2016-11-14,-0.26915,-1.323284,2.059457,0.250285,two
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572,three
2016-11-16,-0.36059,-0.714773,1.325548,0.759038,four
2016-11-17,0.896416,0.565129,0.350664,0.434686,three


In [173]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2016-11-14,-0.26915,-1.323284,2.059457,0.250285,two
2016-11-16,-0.36059,-0.714773,1.325548,0.759038,four


### Setting

Setting a new column automatically aligns the data by the indexes:

In [174]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20161112', periods=6))
s1

2016-11-12    1
2016-11-13    2
2016-11-14    3
2016-11-15    4
2016-11-16    5
2016-11-17    6
Freq: D, dtype: int64

In [175]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2016-11-12,-0.439628,-1.28186,0.213516,-1.712296,1
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789,2
2016-11-14,-0.26915,-1.323284,2.059457,0.250285,3
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572,4
2016-11-16,-0.36059,-0.714773,1.325548,0.759038,5
2016-11-17,0.896416,0.565129,0.350664,0.434686,6


Setting values by label:

In [176]:
df.at[dates[0],'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,-1.28186,0.213516,-1.712296,1
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789,2
2016-11-14,-0.26915,-1.323284,2.059457,0.250285,3
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572,4
2016-11-16,-0.36059,-0.714773,1.325548,0.759038,5
2016-11-17,0.896416,0.565129,0.350664,0.434686,6


Setting values by position:

In [177]:
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.213516,-1.712296,1
2016-11-13,-1.052267,-0.409673,-0.605656,1.077789,2
2016-11-14,-0.26915,-1.323284,2.059457,0.250285,3
2016-11-15,-0.895038,-0.500557,-0.372377,0.315572,4
2016-11-16,-0.36059,-0.714773,1.325548,0.759038,5
2016-11-17,0.896416,0.565129,0.350664,0.434686,6


Setting by assigning with a numpy array:

In [178]:
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.213516,5,1
2016-11-13,-1.052267,-0.409673,-0.605656,5,2
2016-11-14,-0.26915,-1.323284,2.059457,5,3
2016-11-15,-0.895038,-0.500557,-0.372377,5,4
2016-11-16,-0.36059,-0.714773,1.325548,5,5
2016-11-17,0.896416,0.565129,0.350664,5,6


In [179]:
# Make DataFrame df match tutorial df
df.loc[:,'F'] = [np.nan, 1,2,3,4,5]
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.213516,5,
2016-11-13,-1.052267,-0.409673,-0.605656,5,1.0
2016-11-14,-0.26915,-1.323284,2.059457,5,2.0
2016-11-15,-0.895038,-0.500557,-0.372377,5,3.0
2016-11-16,-0.36059,-0.714773,1.325548,5,4.0
2016-11-17,0.896416,0.565129,0.350664,5,5.0


A where operation with setting:

In [180]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,-0.213516,-5,
2016-11-13,-1.052267,-0.409673,-0.605656,-5,-1.0
2016-11-14,-0.26915,-1.323284,-2.059457,-5,-2.0
2016-11-15,-0.895038,-0.500557,-0.372377,-5,-3.0
2016-11-16,-0.36059,-0.714773,-1.325548,-5,-4.0
2016-11-17,-0.896416,-0.565129,-0.350664,-5,-5.0


## Missing Data

pandas primarily (?!) uses the value np.nan to represent missing data.  
It is by default not included in computations.  


Reindexing allows you to change/add/delete the index on a specified axis.  
This returns a copy of the data.  

In [181]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.213516,5,,
2016-11-13,-1.052267,-0.409673,-0.605656,5,1.0,
2016-11-14,-0.26915,-1.323284,2.059457,5,2.0,
2016-11-15,-0.895038,-0.500557,-0.372377,5,3.0,


In [182]:
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.213516,5,,1.0
2016-11-13,-1.052267,-0.409673,-0.605656,5,1.0,1.0
2016-11-14,-0.26915,-1.323284,2.059457,5,2.0,
2016-11-15,-0.895038,-0.500557,-0.372377,5,3.0,


To drop any rows that have missing data:

In [183]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2016-11-13,-1.052267,-0.409673,-0.605656,5,1.0,1.0


In [184]:
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.213516,5,,1.0
2016-11-13,-1.052267,-0.409673,-0.605656,5,1.0,1.0
2016-11-14,-0.26915,-1.323284,2.059457,5,2.0,
2016-11-15,-0.895038,-0.500557,-0.372377,5,3.0,


Filling missing data:

In [185]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.213516,5,5.0,1.0
2016-11-13,-1.052267,-0.409673,-0.605656,5,1.0,1.0
2016-11-14,-0.26915,-1.323284,2.059457,5,2.0,5.0
2016-11-15,-0.895038,-0.500557,-0.372377,5,3.0,5.0


To get the boolean mask where values are nan:

In [186]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2016-11-12,False,False,False,False,True,False
2016-11-13,False,False,False,False,False,False
2016-11-14,False,False,False,False,False,True
2016-11-15,False,False,False,False,False,True


## Operations 

See the Basic section on Binary Ops.

### Stats

Operations in general <em>exclude</em> missing data.

Performing a descriptive statistic:

In [187]:
df.mean()

A   -0.280105
B   -0.397193
C    0.495192
D    5.000000
F    3.000000
dtype: float64

Same operation on the other axis:

In [188]:
df.mean(1)

2016-11-12    1.303379
2016-11-13    0.786481
2016-11-14    1.493404
2016-11-15    1.246406
2016-11-16    1.850037
2016-11-17    2.362442
Freq: D, dtype: float64

Operating with objects that have different dimensionality and need alignment.  
In addition, pandas broadcasts along the specified dimension.

In [200]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

2016-11-12    NaN
2016-11-13    NaN
2016-11-14    1.0
2016-11-15    3.0
2016-11-16    5.0
2016-11-17    NaN
Freq: D, dtype: float64

In [201]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2016-11-12,,,,,
2016-11-13,,,,,
2016-11-14,-1.26915,-2.323284,1.059457,4.0,1.0
2016-11-15,-3.895038,-3.500557,-3.372377,2.0,0.0
2016-11-16,-5.36059,-5.714773,-3.674452,0.0,-1.0
2016-11-17,,,,,


### Apply

Applying functions to the data:

In [203]:
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.213516,5,
2016-11-13,-1.052267,-0.409673,-0.605656,5,1.0
2016-11-14,-0.26915,-1.323284,2.059457,5,2.0
2016-11-15,-0.895038,-0.500557,-0.372377,5,3.0
2016-11-16,-0.36059,-0.714773,1.325548,5,4.0
2016-11-17,0.896416,0.565129,0.350664,5,5.0


In [202]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.213516,5,
2016-11-13,-1.052267,-0.409673,-0.39214,10,1.0
2016-11-14,-1.321417,-1.732957,1.667317,15,3.0
2016-11-15,-2.216455,-2.233514,1.29494,20,6.0
2016-11-16,-2.577046,-2.948286,2.620488,25,10.0
2016-11-17,-1.680629,-2.383157,2.971152,30,15.0


In [204]:
df.apply(lambda x: x.max() - x.min())

A    1.948684
B    1.888414
C    2.665113
D    0.000000
F    4.000000
dtype: float64

### Histogramming

See more at Histogramming and Discretization.

In [205]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    4
1    1
2    5
3    1
4    1
5    3
6    5
7    5
8    6
9    6
dtype: int64

In [206]:
s.value_counts()

5    3
1    3
6    2
4    1
3    1
dtype: int64

### String Methods

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below.  
Note that the pattern-matching in str generally uses regular expressions by default, and in some cases always uses regexes.  
See more at Vectorized String Methods. 

In [207]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [208]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Merge