# 10 Minutes to pandas

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

<p>Creating a Series by passing a list of values, letting pandas create a default integer index:</p>

In [92]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

<p>Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:</p>

In [93]:
dates = pd.date_range('20161112', periods=6)
dates

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [94]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2016-11-12,-0.456684,0.248177,1.466787,0.749017
2016-11-13,0.050997,-0.472189,1.250049,0.429801
2016-11-14,0.675326,0.87946,1.583629,1.450081
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739
2016-11-16,0.399177,1.187455,-2.017127,-1.320002
2016-11-17,1.057221,0.158823,-1.729537,-0.494593


<p>Creating a DataFrame by passing a dict of objects that can be converted to series-like:</p>

In [95]:
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp(20161112),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
1,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo
2,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
3,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo


<p>Having specific dtypes:</p>

In [96]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

<p>See the top and bottom rows of the frame:</p>

In [97]:
df.head()

Unnamed: 0,A,B,C,D
2016-11-12,-0.456684,0.248177,1.466787,0.749017
2016-11-13,0.050997,-0.472189,1.250049,0.429801
2016-11-14,0.675326,0.87946,1.583629,1.450081
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739
2016-11-16,0.399177,1.187455,-2.017127,-1.320002


In [98]:
df.tail(3)

Unnamed: 0,A,B,C,D
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739
2016-11-16,0.399177,1.187455,-2.017127,-1.320002
2016-11-17,1.057221,0.158823,-1.729537,-0.494593


<p>Display the index, columns, and the underlying numpy data:</p>

In [99]:
df.index

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [100]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [101]:
df.values

array([[-0.45668359,  0.24817697,  1.46678695,  0.74901699],
       [ 0.05099696, -0.47218893,  1.25004909,  0.42980105],
       [ 0.67532588,  0.87945995,  1.5836294 ,  1.45008113],
       [-0.76374777, -1.44814588, -1.3050449 ,  0.36073914],
       [ 0.39917656,  1.18745482, -2.01712679, -1.32000234],
       [ 1.05722055,  0.15882335, -1.72953668, -0.49459267]])

<p>Describe shows a quick statistic summary of your data:</p>

In [102]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.160381,0.092263,-0.125207,0.195841
std,0.68895,0.95282,1.725755,0.973194
min,-0.763748,-1.448146,-2.017127,-1.320002
25%,-0.329763,-0.314436,-1.623414,-0.28076
50%,0.225087,0.2035,-0.027498,0.39527
75%,0.606289,0.721639,1.412602,0.669213
max,1.057221,1.187455,1.583629,1.450081


<p>Transposing your data:</p>

In [103]:
df.T

Unnamed: 0,2016-11-12 00:00:00,2016-11-13 00:00:00,2016-11-14 00:00:00,2016-11-15 00:00:00,2016-11-16 00:00:00,2016-11-17 00:00:00
A,-0.456684,0.050997,0.675326,-0.763748,0.399177,1.057221
B,0.248177,-0.472189,0.87946,-1.448146,1.187455,0.158823
C,1.466787,1.250049,1.583629,-1.305045,-2.017127,-1.729537
D,0.749017,0.429801,1.450081,0.360739,-1.320002,-0.494593


<p>Sorting by an axis:</p>

In [104]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2016-11-12,0.749017,1.466787,0.248177,-0.456684
2016-11-13,0.429801,1.250049,-0.472189,0.050997
2016-11-14,1.450081,1.583629,0.87946,0.675326
2016-11-15,0.360739,-1.305045,-1.448146,-0.763748
2016-11-16,-1.320002,-2.017127,1.187455,0.399177
2016-11-17,-0.494593,-1.729537,0.158823,1.057221


<p>Sorting by values:</p>

In [105]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739
2016-11-13,0.050997,-0.472189,1.250049,0.429801
2016-11-17,1.057221,0.158823,-1.729537,-0.494593
2016-11-12,-0.456684,0.248177,1.466787,0.749017
2016-11-14,0.675326,0.87946,1.583629,1.450081
2016-11-16,0.399177,1.187455,-2.017127,-1.320002


## Selection

<p>See the indexing documentation Indexing and Selecting Data and MultiIndex/Advanced Indexing.</p>

### Getting

Selecting a single column, which yields a Series, equivalent to df.A:

In [106]:
df['A']

2016-11-12   -0.456684
2016-11-13    0.050997
2016-11-14    0.675326
2016-11-15   -0.763748
2016-11-16    0.399177
2016-11-17    1.057221
Freq: D, Name: A, dtype: float64

<p>Selecting via [ ], which slices the rows.</p>

In [107]:
df[0:3]

Unnamed: 0,A,B,C,D
2016-11-12,-0.456684,0.248177,1.466787,0.749017
2016-11-13,0.050997,-0.472189,1.250049,0.429801
2016-11-14,0.675326,0.87946,1.583629,1.450081


In [108]:
df['20161112':'20161114']

Unnamed: 0,A,B,C,D
2016-11-12,-0.456684,0.248177,1.466787,0.749017
2016-11-13,0.050997,-0.472189,1.250049,0.429801
2016-11-14,0.675326,0.87946,1.583629,1.450081


### Selection by Label

For getting a cross section using a label:

In [109]:
df.loc[dates[0]]

A   -0.456684
B    0.248177
C    1.466787
D    0.749017
Name: 2016-11-12 00:00:00, dtype: float64

Selecting on a multi-axis by label:

In [110]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2016-11-12,-0.456684,0.248177
2016-11-13,0.050997,-0.472189
2016-11-14,0.675326,0.87946
2016-11-15,-0.763748,-1.448146
2016-11-16,0.399177,1.187455
2016-11-17,1.057221,0.158823


Showing label slicing, both endpoints are included:

In [111]:
df.loc['20161112':'20161114',['A','B']]

Unnamed: 0,A,B
2016-11-12,-0.456684,0.248177
2016-11-13,0.050997,-0.472189
2016-11-14,0.675326,0.87946


Reduction in the dimensions of the returned object:

In [112]:
df.loc['20161112',['A','B']]

A   -0.456684
B    0.248177
Name: 2016-11-12 00:00:00, dtype: float64

For getting a scalar value:

In [113]:
df.loc[dates[0],'A']

-0.45668358837636758

For getting fast access to a scalar value (equivalent to the prior method):

In [114]:
df.at[dates[0],'A']

-0.45668358837636758

### Selection by Position

Select via the position of the passed integers:

In [115]:
df.iloc[3]

A   -0.763748
B   -1.448146
C   -1.305045
D    0.360739
Name: 2016-11-15 00:00:00, dtype: float64

By integer slices, acting similar to the numpy/python style:

In [116]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2016-11-15,-0.763748,-1.448146
2016-11-16,0.399177,1.187455


By lists of integer position locations, similar to the numpy/python style:

In [117]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2016-11-13,0.050997,1.250049
2016-11-14,0.675326,1.583629
2016-11-16,0.399177,-2.017127


For slicing rows explicitly:

In [118]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2016-11-13,0.050997,-0.472189,1.250049,0.429801
2016-11-14,0.675326,0.87946,1.583629,1.450081


For slicing columns explicitly:

In [119]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2016-11-12,0.248177,1.466787
2016-11-13,-0.472189,1.250049
2016-11-14,0.87946,1.583629
2016-11-15,-1.448146,-1.305045
2016-11-16,1.187455,-2.017127
2016-11-17,0.158823,-1.729537


For getting a value explicitly:

In [120]:
df.iloc[1,1]

-0.47218892968576975

For getting fast access to a scalar (equivalent to the prior method):

In [121]:
df.iat[1,1]

-0.47218892968576975

### Boolean Indexing

Using a single column's values to select data:

In [122]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2016-11-13,0.050997,-0.472189,1.250049,0.429801
2016-11-14,0.675326,0.87946,1.583629,1.450081
2016-11-16,0.399177,1.187455,-2.017127,-1.320002
2016-11-17,1.057221,0.158823,-1.729537,-0.494593


A where operation for getting:

In [123]:
df[df > 0]

Unnamed: 0,A,B,C,D
2016-11-12,,0.248177,1.466787,0.749017
2016-11-13,0.050997,,1.250049,0.429801
2016-11-14,0.675326,0.87946,1.583629,1.450081
2016-11-15,,,,0.360739
2016-11-16,0.399177,1.187455,,
2016-11-17,1.057221,0.158823,,


Using the isin() method for filtering:

In [124]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2016-11-12,-0.456684,0.248177,1.466787,0.749017,one
2016-11-13,0.050997,-0.472189,1.250049,0.429801,one
2016-11-14,0.675326,0.87946,1.583629,1.450081,two
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739,three
2016-11-16,0.399177,1.187455,-2.017127,-1.320002,four
2016-11-17,1.057221,0.158823,-1.729537,-0.494593,three


In [125]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2016-11-14,0.675326,0.87946,1.583629,1.450081,two
2016-11-16,0.399177,1.187455,-2.017127,-1.320002,four


### Setting

Setting a new column automatically aligns the data by the indexes:

In [126]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20161112', periods=6))
s1

2016-11-12    1
2016-11-13    2
2016-11-14    3
2016-11-15    4
2016-11-16    5
2016-11-17    6
Freq: D, dtype: int64

In [127]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2016-11-12,-0.456684,0.248177,1.466787,0.749017,1
2016-11-13,0.050997,-0.472189,1.250049,0.429801,2
2016-11-14,0.675326,0.87946,1.583629,1.450081,3
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739,4
2016-11-16,0.399177,1.187455,-2.017127,-1.320002,5
2016-11-17,1.057221,0.158823,-1.729537,-0.494593,6


Setting values by label:

In [128]:
df.at[dates[0],'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.248177,1.466787,0.749017,1
2016-11-13,0.050997,-0.472189,1.250049,0.429801,2
2016-11-14,0.675326,0.87946,1.583629,1.450081,3
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739,4
2016-11-16,0.399177,1.187455,-2.017127,-1.320002,5
2016-11-17,1.057221,0.158823,-1.729537,-0.494593,6


Setting values by position:

In [129]:
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,1.466787,0.749017,1
2016-11-13,0.050997,-0.472189,1.250049,0.429801,2
2016-11-14,0.675326,0.87946,1.583629,1.450081,3
2016-11-15,-0.763748,-1.448146,-1.305045,0.360739,4
2016-11-16,0.399177,1.187455,-2.017127,-1.320002,5
2016-11-17,1.057221,0.158823,-1.729537,-0.494593,6


Setting by assigning with a numpy array:

In [130]:
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,1.466787,5,1
2016-11-13,0.050997,-0.472189,1.250049,5,2
2016-11-14,0.675326,0.87946,1.583629,5,3
2016-11-15,-0.763748,-1.448146,-1.305045,5,4
2016-11-16,0.399177,1.187455,-2.017127,5,5
2016-11-17,1.057221,0.158823,-1.729537,5,6


In [131]:
# Make DataFrame df match tutorial df
df.loc[:,'F'] = [np.nan, 1,2,3,4,5]
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,1.466787,5,
2016-11-13,0.050997,-0.472189,1.250049,5,1.0
2016-11-14,0.675326,0.87946,1.583629,5,2.0
2016-11-15,-0.763748,-1.448146,-1.305045,5,3.0
2016-11-16,0.399177,1.187455,-2.017127,5,4.0
2016-11-17,1.057221,0.158823,-1.729537,5,5.0


A where operation with setting:

In [132]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,-1.466787,-5,
2016-11-13,-0.050997,-0.472189,-1.250049,-5,-1.0
2016-11-14,-0.675326,-0.87946,-1.583629,-5,-2.0
2016-11-15,-0.763748,-1.448146,-1.305045,-5,-3.0
2016-11-16,-0.399177,-1.187455,-2.017127,-5,-4.0
2016-11-17,-1.057221,-0.158823,-1.729537,-5,-5.0


## Missing Data

pandas primarily (?!) uses the value np.nan to represent missing data.  
It is by default not included in computations.  


Reindexing allows you to change/add/delete the index on a specified axis.  
This returns a copy of the data.  

In [133]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,1.466787,5,,
2016-11-13,0.050997,-0.472189,1.250049,5,1.0,
2016-11-14,0.675326,0.87946,1.583629,5,2.0,
2016-11-15,-0.763748,-1.448146,-1.305045,5,3.0,


In [134]:
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,1.466787,5,,1.0
2016-11-13,0.050997,-0.472189,1.250049,5,1.0,1.0
2016-11-14,0.675326,0.87946,1.583629,5,2.0,
2016-11-15,-0.763748,-1.448146,-1.305045,5,3.0,


To drop any rows that have missing data:

In [135]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2016-11-13,0.050997,-0.472189,1.250049,5,1.0,1.0


In [136]:
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,1.466787,5,,1.0
2016-11-13,0.050997,-0.472189,1.250049,5,1.0,1.0
2016-11-14,0.675326,0.87946,1.583629,5,2.0,
2016-11-15,-0.763748,-1.448146,-1.305045,5,3.0,


Filling missing data:

In [137]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,1.466787,5,5.0,1.0
2016-11-13,0.050997,-0.472189,1.250049,5,1.0,1.0
2016-11-14,0.675326,0.87946,1.583629,5,2.0,5.0
2016-11-15,-0.763748,-1.448146,-1.305045,5,3.0,5.0


To get the boolean mask where values are nan:

In [138]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2016-11-12,False,False,False,False,True,False
2016-11-13,False,False,False,False,False,False
2016-11-14,False,False,False,False,False,True
2016-11-15,False,False,False,False,False,True
