In [2]:
import pandas as pd
import numpy as np

# Object Creation

In [3]:
# A Series built from a plain list gets a default integer index; the NaN entry
# forces the whole Series to a float dtype.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Six consecutive daily timestamps starting 2013-01-01; used as the row index
# of the frames built later.
dates = pd.date_range(start='20130101', periods=6)

dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Draw the 6x4 block of standard-normal values from a generator with a fixed
# seed, so the frame -- and every output derived from it -- is identical on
# each Restart & Run All. RandomState keeps the seed local to this call instead
# of touching NumPy's global random state.
df = pd.DataFrame(
    np.random.RandomState(0).randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)

df

Unnamed: 0,A,B,C,D
2013-01-01,-0.165903,-0.610496,0.357876,-0.099435
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577
2013-01-04,0.244243,1.651889,0.91154,1.431017
2013-01-05,-0.497638,0.247795,0.503333,1.613688
2013-01-06,0.811172,-0.404925,-0.616117,-0.739754


In [6]:
# Build a frame from a mapping of column name -> values. The scalar entries
# (A, B, F) are repeated for every row, and each column keeps the dtype implied
# by the object it was built from.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# Each column of df2 carries its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [14]:
# Tab completion is an interactive feature: in a live session, type `df2.` and
# press the TAB key to see the available attributes. That keystroke cannot be
# written as Python source, so this cell shows a runnable equivalent instead:
# the first few public attribute names that completion would offer.
[name for name in dir(df2) if not name.startswith('_')][:10]

# Viewing Data

In [15]:
# First rows of the frame; with no argument, head() shows five.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.165903,-0.610496,0.357876,-0.099435
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577
2013-01-04,0.244243,1.651889,0.91154,1.431017
2013-01-05,-0.497638,0.247795,0.503333,1.613688


In [16]:
# Last three rows of the frame.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.244243,1.651889,0.91154,1.431017
2013-01-05,-0.497638,0.247795,0.503333,1.613688
2013-01-06,0.811172,-0.404925,-0.616117,-0.739754


In [17]:
# Row labels of df: the daily DatetimeIndex it was constructed with.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [19]:
# NumPy representation of the data. Every column of df is float, so the result
# is a plain 2-D float array.
df.to_numpy()

array([[-0.16590266, -0.6104957 ,  0.35787625, -0.09943505],
       [-0.51440667,  0.39319556, -1.4148143 , -0.81775341],
       [-0.59522033, -1.77477512, -0.94977488, -0.78957684],
       [ 0.24424318,  1.65188909,  0.91153974,  1.4310169 ],
       [-0.4976378 ,  0.24779524,  0.50333324,  1.61368835],
       [ 0.81117182, -0.40492511, -0.61611661, -0.73975369]])

In [20]:
# df2 mixes dtypes, so to_numpy() produces a single object-dtype array that can
# hold the values of every column.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

### Note
DataFrame.to_numpy() does not include the index or column labels in the output.

### `describe()` shows a quick statistical summary of the data

In [21]:
# Per-column summary: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.119625,-0.082886,-0.201326,0.099698
std,0.552424,1.148043,0.922226,1.135027
min,-0.59522,-1.774775,-1.414814,-0.817753
25%,-0.510214,-0.559103,-0.86636,-0.777121
50%,-0.33177,-0.078565,-0.12912,-0.419594
75%,0.141707,0.356845,0.466969,1.048404
max,0.811172,1.651889,0.91154,1.613688


### Transposing data

In [22]:
# Swap rows and columns: the dates become column labels and A-D become rows.
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.165903,-0.514407,-0.59522,0.244243,-0.497638,0.811172
B,-0.610496,0.393196,-1.774775,1.651889,0.247795,-0.404925
C,0.357876,-1.414814,-0.949775,0.91154,0.503333,-0.616117
D,-0.099435,-0.817753,-0.789577,1.431017,1.613688,-0.739754


### Sorting by axis

In [24]:
# Order the columns by their labels, descending (D, C, B, A).
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.099435,0.357876,-0.610496,-0.165903
2013-01-02,-0.817753,-1.414814,0.393196,-0.514407
2013-01-03,-0.789577,-0.949775,-1.774775,-0.59522
2013-01-04,1.431017,0.91154,1.651889,0.244243
2013-01-05,1.613688,0.503333,0.247795,-0.497638
2013-01-06,-0.739754,-0.616117,-0.404925,0.811172


### Sorting by values

In [25]:
# Order the rows by the values in column B, smallest first.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577
2013-01-01,-0.165903,-0.610496,0.357876,-0.099435
2013-01-06,0.811172,-0.404925,-0.616117,-0.739754
2013-01-05,-0.497638,0.247795,0.503333,1.613688
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-04,0.244243,1.651889,0.91154,1.431017


# Selection

## Getting

Selecting a single column yields a `Series`; `df['A']` is equivalent to `df.A`.

In [26]:
# A single column label inside [] returns that column as a Series.
df['A']

2013-01-01   -0.165903
2013-01-02   -0.514407
2013-01-03   -0.595220
2013-01-04    0.244243
2013-01-05   -0.497638
2013-01-06    0.811172
Freq: D, Name: A, dtype: float64

Selecting via `[]` with a slice selects rows.

In [27]:
# An integer slice inside [] selects rows by position (here the first three).
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.165903,-0.610496,0.357876,-0.099435
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577


In [28]:
# A slice of date strings inside [] selects rows by label on the DatetimeIndex;
# both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577
2013-01-04,0.244243,1.651889,0.91154,1.431017


## Selection by label

In [29]:
# Row for the first date, returned as a Series indexed by the column labels.
df.loc[dates[0]]

A   -0.165903
B   -0.610496
C    0.357876
D   -0.099435
Name: 2013-01-01 00:00:00, dtype: float64

In [30]:
# All rows, restricted to columns A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.165903,-0.610496
2013-01-02,-0.514407,0.393196
2013-01-03,-0.59522,-1.774775
2013-01-04,0.244243,1.651889
2013-01-05,-0.497638,0.247795
2013-01-06,0.811172,-0.404925


In [31]:
# Label slice on the rows (both endpoints included) combined with a column list.
df.loc["20130102":"20130104", ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.514407,0.393196
2013-01-03,-0.59522,-1.774775
2013-01-04,0.244243,1.651889


In [32]:
# A single row label with a column list reduces the result to a Series.
df.loc["20130102", ["A", "B"]]

A   -0.514407
B    0.393196
Name: 2013-01-02 00:00:00, dtype: float64

In [34]:
# One row label plus one column label returns the scalar stored in that cell.
df.loc[dates[0], "A"]

-0.16590265753799485

In [35]:
# Same scalar lookup through .at, the accessor for single-cell access by label.
df.at[dates[0], "A"]

-0.16590265753799485

## Selection by position

In [41]:
# Show the whole frame for reference when reading the positional selections.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.165903,-0.610496,0.357876,-0.099435
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577
2013-01-04,0.244243,1.651889,0.91154,1.431017
2013-01-05,-0.497638,0.247795,0.503333,1.613688
2013-01-06,0.811172,-0.404925,-0.616117,-0.739754


In [36]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A    0.244243
B    1.651889
C    0.911540
D    1.431017
Name: 2013-01-04 00:00:00, dtype: float64

In [39]:
# Rows at positions 3-4 and columns at positions 0-1; as with Python slices,
# the stop position is excluded.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.244243,1.651889
2013-01-05,-0.497638,0.247795


In [40]:
# Lists of integer positions pick arbitrary rows (1, 2, 4) and columns (0, 2).
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,-0.514407,-1.414814
2013-01-03,-0.59522,-0.949775
2013-01-05,-0.497638,0.503333


In [42]:
# Rows at positions 1-2, every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577


In [43]:
# Every row, columns at positions 1-2.
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.610496,0.357876
2013-01-02,0.393196,-1.414814
2013-01-03,-1.774775,-0.949775
2013-01-04,1.651889,0.91154
2013-01-05,0.247795,0.503333
2013-01-06,-0.404925,-0.616117


In [44]:
# Scalar at row position 1, column position 1.
df.iloc[1, 1]

0.393195564459059

In [45]:
# Same scalar lookup through .iat, the positional counterpart of .at.
df.iat[1, 1]

0.393195564459059

## Boolean indexing

In [47]:
# Show the whole frame for reference when reading the boolean selections.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.165903,-0.610496,0.357876,-0.099435
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577
2013-01-04,0.244243,1.651889,0.91154,1.431017
2013-01-05,-0.497638,0.247795,0.503333,1.613688
2013-01-06,0.811172,-0.404925,-0.616117,-0.739754


In [46]:
# Keep only the rows in which column A is positive.
df[df["A"].gt(0)]

Unnamed: 0,A,B,C,D
2013-01-04,0.244243,1.651889,0.91154,1.431017
2013-01-06,0.811172,-0.404925,-0.616117,-0.739754


In [48]:
# Keep the positive values and blank out everything else with NaN; the shape of
# the frame is unchanged.
df.where(df > 0)

Unnamed: 0,A,B,C,D
2013-01-01,,,0.357876,
2013-01-02,,0.393196,,
2013-01-03,,,,
2013-01-04,0.244243,1.651889,0.91154,1.431017
2013-01-05,,0.247795,0.503333,1.613688
2013-01-06,0.811172,,,


In [49]:
# Work on a copy so df itself is not modified by the next cells.
# NOTE(review): this rebinds df2, which until now held the mixed-dtype frame
# built in the Object Creation section.
df2 = df.copy()

In [50]:
# Add a column E holding one string label per row, then display the result.
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.165903,-0.610496,0.357876,-0.099435,one
2013-01-02,-0.514407,0.393196,-1.414814,-0.817753,one
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577,two
2013-01-04,0.244243,1.651889,0.91154,1.431017,three
2013-01-05,-0.497638,0.247795,0.503333,1.613688,four
2013-01-06,0.811172,-0.404925,-0.616117,-0.739754,three


In [51]:
# Keep the rows whose E value is one of the listed labels.
df2[df2["E"].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.59522,-1.774775,-0.949775,-0.789577,two
2013-01-05,-0.497638,0.247795,0.503333,1.613688,four


## Setting

In [52]:
# Integer Series indexed by six daily dates starting 2013-01-02, i.e. shifted
# one day relative to a range that starts on 2013-01-01.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)

In [53]:
# Display s1; its index runs from 2013-01-02 to 2013-01-07.
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [54]:
# Assigning a Series as a new column aligns it on df's index: 2013-01-01 has no
# entry in s1 and becomes NaN, and s1's 2013-01-07 entry has no row in df and
# is not used.
df['F'] = s1

In [55]:
# Set a single cell by row label and column label.
df.at[dates[0], 'A'] = 0

In [56]:
# Set a single cell by integer position: row 0, column 1 (column B).
df.iat[0, 1] = 0

In [57]:
# Replace column D with an integer array of fives. Plain column assignment
# swaps the column out, so D takes the integer dtype of the new array on every
# pandas version. `df.loc[:, 'D'] = ...` instead writes into the existing
# float64 column on pandas >= 2.0, which would leave D displayed as 5.0.
df['D'] = np.array([5] * len(df))

In [58]:
# Show df after the preceding assignments (new column F, zeroed cells, D = 5).
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.357876,5,
2013-01-02,-0.514407,0.393196,-1.414814,5,1.0
2013-01-03,-0.59522,-1.774775,-0.949775,5,2.0
2013-01-04,0.244243,1.651889,0.91154,5,3.0
2013-01-05,-0.497638,0.247795,0.503333,5,4.0
2013-01-06,0.811172,-0.404925,-0.616117,5,5.0


In [60]:
# Take a fresh copy of the updated df; this rebinds df2 once more.
df2 = df.copy()

In [61]:
# Display the copy before it is modified.
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.357876,5,
2013-01-02,-0.514407,0.393196,-1.414814,5,1.0
2013-01-03,-0.59522,-1.774775,-0.949775,5,2.0
2013-01-04,0.244243,1.651889,0.91154,5,3.0
2013-01-05,-0.497638,0.247795,0.503333,5,4.0
2013-01-06,0.811172,-0.404925,-0.616117,5,5.0


In [62]:
# Boolean-mask assignment: wherever a value is positive, overwrite it with its
# negation. Zero, negative and NaN cells do not match the mask and are left
# as they are.
df2[df2 > 0] = -df2

In [63]:
# Display the result: no positive values remain.
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.357876,-5,
2013-01-02,-0.514407,-0.393196,-1.414814,-5,-1.0
2013-01-03,-0.59522,-1.774775,-0.949775,-5,-2.0
2013-01-04,-0.244243,-1.651889,-0.91154,-5,-3.0
2013-01-05,-0.497638,-0.247795,-0.503333,-5,-4.0
2013-01-06,-0.811172,-0.404925,-0.616117,-5,-5.0


## Missing data

In [67]:
# Reindex to the first four dates and to df's columns plus a new label E.
# E does not exist in df, so it is filled with NaN. reindex returns a new
# frame and leaves df untouched.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])

In [68]:
# Display the reindexed frame; column E is entirely missing at this point.
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.357876,5,,
2013-01-02,-0.514407,0.393196,-1.414814,5,1.0,
2013-01-03,-0.59522,-1.774775,-0.949775,5,2.0,
2013-01-04,0.244243,1.651889,0.91154,5,3.0,


In [69]:
# Fill E for the first two dates. A label slice in .loc includes both
# endpoints, so the rows for dates[0] and dates[1] are both set.
df1.loc[dates[0] : dates[1], "E"] = 1

In [70]:
# Display df1 with E partially filled in.
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.357876,5,,1.0
2013-01-02,-0.514407,0.393196,-1.414814,5,1.0,1.0
2013-01-03,-0.59522,-1.774775,-0.949775,5,2.0,
2013-01-04,0.244243,1.651889,0.91154,5,3.0,


In [71]:
# Drop every row that contains at least one missing value. The result is a new
# frame; df1 itself keeps all four rows.
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-0.514407,0.393196,-1.414814,5,1.0,1.0


In [72]:
# Replace every missing value with 5; the result is a new frame.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.357876,5,5.0,1.0
2013-01-02,-0.514407,0.393196,-1.414814,5,1.0,1.0
2013-01-03,-0.59522,-1.774775,-0.949775,5,2.0,5.0
2013-01-04,0.244243,1.651889,0.91154,5,3.0,5.0


In [73]:
# Boolean mask of the same shape as df1: True where a value is missing.
df1.isna()

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


## Operations

### Stats

Operations in general exclude missing data.
Performing a descriptive statistic:

In [74]:
# Mean of each column. Missing values are skipped, so F averages only its five
# present values.
df.mean()

A   -0.091975
B    0.018863
C   -0.201326
D    5.000000
F    3.000000
dtype: float64

Same operation on the other axis:

In [75]:
# Mean of each row, i.e. averaging across the columns.
df.mean(axis=1)

2013-01-01    1.339469
2013-01-02    0.892795
2013-01-03    0.736046
2013-01-04    2.161534
2013-01-05    1.850698
2013-01-06    1.958026
Freq: D, dtype: float64

In [77]:
# Series on the same dates as df, shifted down by two positions: the first two
# entries become NaN and the last two input values (6 and 8) fall off the end.
# NOTE(review): this rebinds s, which held the un-indexed Series created at the
# top of the notebook.
s = pd.Series([1,3,5,np.nan,6,8], index = dates).shift(2)

In [78]:
# Display the shifted Series.
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [79]:
# Show df again for reference when reading the subtraction that follows.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.357876,5,
2013-01-02,-0.514407,0.393196,-1.414814,5,1.0
2013-01-03,-0.59522,-1.774775,-0.949775,5,2.0
2013-01-04,0.244243,1.651889,0.91154,5,3.0
2013-01-05,-0.497638,0.247795,0.503333,5,4.0
2013-01-06,0.811172,-0.404925,-0.616117,5,5.0


In [80]:
# Subtract s from every column, matching values on the row index. Rows where s
# is NaN come out as NaN in every column.
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-1.59522,-2.774775,-1.949775,4.0,1.0
2013-01-04,-2.755757,-1.348111,-2.08846,2.0,0.0
2013-01-05,-5.497638,-4.752205,-4.496667,0.0,-1.0
2013-01-06,,,,,


### Apply

Applying functions to the data: