##### Most used commands

In [2]:
import pandas as pd

## display all columns and rows, no trimming
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## display all data inside the cells
pd.set_option('display.max_colwidth', -1)

In [3]:
df = pd.DataFrame({'A': 1., 'B': pd.Timestamp('20130102'),\
                   'C': pd.Series(1, index=list(range(4)), dtype='float32'),\
                   'D': [3] * 4,\
                   'E': pd.Categorical(["test", "train", "test", "train"]),\
                   'F': 'foo'})
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [4]:
## top 5 rows
df.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [5]:
## last 5 rows
df.tail()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
## random sample
df.sample(2)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2013-01-02,1.0,3,train,foo
1,1.0,2013-01-02,1.0,3,train,foo


In [7]:
## display the index
df.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [8]:
## display the columns
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [9]:
## display the data type for each column
df.dtypes

A    float64       
B    datetime64[ns]
C    float32       
D    int64         
E    category      
F    object        
dtype: object

In [10]:
## describe the dataframe
df.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [11]:
## transpose dataframe
df.T

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [14]:
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [15]:
## sort dataframe by certain columns

df.sort_values(by=['A'], axis=0, ascending=False)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [18]:
df.sort_values(by='C')

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [16]:
# sort by A and B, in descending and ascending order
df.sort_values(by=['A', 'B'], axis=0, ascending=[False, True])

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [17]:
## sort by an axis; to be used when index holds something like date
df.sort_index(axis=1, ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2013-01-02,1.0
1,foo,train,3,1.0,2013-01-02,1.0
2,foo,test,3,1.0,2013-01-02,1.0
3,foo,train,3,1.0,2013-01-02,1.0


In [19]:
## display column A
df.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [20]:
# same as above
df['A']

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [21]:
## slice dataframe by row index number; not the actual index
df[1:3]

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo


In [22]:
## select a row
df.loc[0]

A    1                  
B    2013-01-02 00:00:00
C    1                  
D    3                  
E    test               
F    foo                
Name: 0, dtype: object

In [23]:
## select a row and some columns
df.loc[0, ['A', 'B']]

A    1                  
B    2013-01-02 00:00:00
Name: 0, dtype: object

In [24]:
## select all rows and some columns
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
0,1.0,2013-01-02
1,1.0,2013-01-02
2,1.0,2013-01-02
3,1.0,2013-01-02


In [25]:
## get a value/ Scalar
df.loc[0, 'A']

1.0

In [None]:
## ILOC vs LOC
"""loc is label-based, which means that you have to specify rows and columns based
on their row and column labels. iloc is integer index based, so you have to specify 
rows and columns by their integer index like you did in the previous exercise."""


In [26]:
## using ILOC
df.iloc[3]

A    1                  
B    2013-01-02 00:00:00
C    1                  
D    3                  
E    train              
F    foo                
Name: 3, dtype: object

In [27]:
## specific rows/ columns using iloc
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
3,1.0,2013-01-02


In [28]:
## specific cell
df.iloc[1, 1]

Timestamp('2013-01-02 00:00:00')

In [29]:
## all rows, specific columns
df.iloc[:, 1:3]

Unnamed: 0,B,C
0,2013-01-02,1.0
1,2013-01-02,1.0
2,2013-01-02,1.0
3,2013-01-02,1.0


In [30]:
## all columns, specific rows
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo


In [31]:
## faster access using iat
## Access a single value for a row/column pair by integer position.
## Similar to iloc, in that both provide integer-based lookups. Use iat if you 
## only need to get or set a single value in a DataFrame or Series.

df.iat[1, 1]

Timestamp('2013-01-02 00:00:00')

In [32]:
## label-location based indexer, with integer position fallback
## .ix supports mixed integer and label based access. It is primarily label based, 
## but will fall back to integer positional access unless the corresponding axis is 
## of integer type. .ix is the most general and will support any of the inputs in .loc 
## and .iloc. .ix also supports floating point label schemes. .ix is exceptionally
## useful when dealing with mixed positional and label based hierachical indexes.
df.ix[1,1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


Timestamp('2013-01-02 00:00:00')

In [33]:
## boolean indexing on one column value
df[df['A'] > 0]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [34]:
## boolean indexing on multiple column values
df[(df['A'] > 0) & (df['C']==1.0)]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [37]:
## Selecting values from a DataFrame where a boolean condition is met
df[df[['A','C']] > 0]

Unnamed: 0,A,B,C,D,E,F
0,1.0,NaT,1.0,,,
1,1.0,NaT,1.0,,,
2,1.0,NaT,1.0,,,
3,1.0,NaT,1.0,,,


In [None]:
## can also use the following if all columns are dtype int/float
df[df > 0]

In [38]:
## isin method to filter
df[df['E'].isin(['test'])]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
2,1.0,2013-01-02,1.0,3,test,foo


In [39]:
## adding a new column
s1 = pd.Series([1, 2, 3, 4])
df['G'] = s1
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,1
1,1.0,2013-01-02,1.0,3,train,foo,2
2,1.0,2013-01-02,1.0,3,test,foo,3
3,1.0,2013-01-02,1.0,3,train,foo,4


In [40]:
## setting value of a column to a new value
## must be at least same dimension
df['F'] = ['foo1', 'foo2', 'foo3', 'foo4']
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo1,1
1,1.0,2013-01-02,1.0,3,train,foo2,2
2,1.0,2013-01-02,1.0,3,test,foo3,3
3,1.0,2013-01-02,1.0,3,train,foo4,4


In [45]:
## what happens when passed more values
try:
    df['F'] = ['foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6']
except Exception as e:
    print('Error')
df

Error


Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo1,1
1,1.0,2013-01-02,1.0,3,train,foo2,2
2,1.0,2013-01-02,1.0,3,test,foo3,3
3,1.0,2013-01-02,1.0,3,train,foo4,4


In [46]:
## what happens when passed less values
try:
    df['F'] = ['foo1', 'foo2']
except Exception as e:
    print('Error')
df

Error


Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo1,1
1,1.0,2013-01-02,1.0,3,train,foo2,2
2,1.0,2013-01-02,1.0,3,test,foo3,3
3,1.0,2013-01-02,1.0,3,train,foo4,4


In [47]:
## setting a value of a cell at a certain row/ column
df.at[0, 'A'] = 10.0
df

Unnamed: 0,A,B,C,D,E,F,G
0,10.0,2013-01-02,1.0,3,test,foo1,1
1,1.0,2013-01-02,1.0,3,train,foo2,2
2,1.0,2013-01-02,1.0,3,test,foo3,3
3,1.0,2013-01-02,1.0,3,train,foo4,4


In [48]:
## setting a value of a cell at a certain row/ column labels
df.at[0, 'C'] = 100.0
df

Unnamed: 0,A,B,C,D,E,F,G
0,10.0,2013-01-02,100.0,3,test,foo1,1
1,1.0,2013-01-02,1.0,3,train,foo2,2
2,1.0,2013-01-02,1.0,3,test,foo3,3
3,1.0,2013-01-02,1.0,3,train,foo4,4


In [51]:
## setting value by position
df.iat[2, 2] = 5.0
df

Unnamed: 0,A,B,C,D,E,F,G
0,10.0,2013-01-02,100.0,3,test,foo1,1
1,1.0,2013-01-02,1.0,3,train,foo2,2
2,1.0,2013-01-02,5.0,3,test,foo3,3
3,1.0,2013-01-02,1.0,3,train,foo4,4


In [52]:
## assigning values via loc
df.loc[:, 'D'] = [5] * len(df)
df

Unnamed: 0,A,B,C,D,E,F,G
0,10.0,2013-01-02,100.0,5,test,foo1,1
1,1.0,2013-01-02,1.0,5,train,foo2,2
2,1.0,2013-01-02,5.0,5,test,foo3,3
3,1.0,2013-01-02,1.0,5,train,foo4,4


In [56]:
## assigning values via iloc
df.iloc[:, 3] = range(len(df))
df

Unnamed: 0,A,B,C,D,E,F,G
0,10.0,2013-01-02,100.0,0,test,foo1,1
1,1.0,2013-01-02,1.0,1,train,foo2,2
2,1.0,2013-01-02,5.0,2,test,foo3,3
3,1.0,2013-01-02,1.0,3,train,foo4,4


In [57]:
## delete a column
df.drop('B', axis=1, inplace=True)

In [58]:
df

Unnamed: 0,A,C,D,E,F,G
0,10.0,100.0,0,test,foo1,1
1,1.0,1.0,1,train,foo2,2
2,1.0,5.0,2,test,foo3,3
3,1.0,1.0,3,train,foo4,4
