In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo frame: a label column ('index') next to numeric and string columns.
df = pd.DataFrame({
    'index': list('ABCD'),
    'col1': list(range(10, 50, 10)),
    'col2': [1.0, 0.9, 0.8, 0.7],
    'col3': list('abcd'),
    'col4': [12.3, 45.6, 78.9, 123.4],
})
df

Unnamed: 0,col1,col2,col3,col4,index
0,10,1.0,a,12.3,A
1,20,0.9,b,45.6,B
2,30,0.8,c,78.9,C
3,40,0.7,d,123.4,D


### create index
#### Method 1

In [80]:
# Work on a copy so the original frame keeps its 'index' column.
df1 = df.copy()
# pop() removes the column and hands it back, so it can become the row index
# in a single step.
df1.index = df1.pop('index')
df1

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,10,1.0,a,12.3
B,20,0.9,b,45.6
C,30,0.8,c,78.9
D,40,0.7,d,123.4


#### Method 2

In [81]:
# set_index returns a new frame with the 'index' column promoted to the row
# index; df itself is left unchanged.
df2 = df.set_index('index')
df2

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,10,1.0,a,12.3
B,20,0.9,b,45.6
C,30,0.8,c,78.9
D,40,0.7,d,123.4


In [82]:
# Promote the 'index' column to the row index for the rest of the notebook.
# The guard keeps this cell re-runnable: once 'index' has been moved out of
# the columns, calling set_index('index') again would raise KeyError.
if 'index' in df.columns:
    df = df.set_index('index')

### slice rows in reverse order

In [83]:
# Label-based slice: both end labels are included, and the step of -1 walks
# the index from 'D' back to 'A'.
df.loc['D':'A':-1]

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
D,40,0.7,d,123.4
C,30,0.8,c,78.9
B,20,0.9,b,45.6
A,10,1.0,a,12.3


#### sort index

In [84]:
# Reverse the rows with a label slice, then let sort_index() restore the
# ascending A..D order.
df.loc['D':'A':-1].sort_index()

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,10,1.0,a,12.3
B,20,0.9,b,45.6
C,30,0.8,c,78.9
D,40,0.7,d,123.4


### vectorized methods

When performance is paramount, you should avoid using .apply() and .map() because those constructs perform Python for-loops over the data stored in a pandas Series or DataFrame. By using vectorized functions instead, you can loop over the data at the same speed as compiled code (C, Fortran, etc.)! NumPy, SciPy and pandas come with a variety of vectorized functions (called Universal Functions or UFuncs in NumPy).

In [85]:
# Summarise the frame: index range, per-column non-null count and dtype,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 4 columns):
col1    4 non-null int64
col2    4 non-null float64
col3    4 non-null object
col4    4 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 320.0+ bytes


In [86]:
# Select every numeric column, then convert to dozen units with a vectorized
# floor division. include='number' matches all numeric dtypes (any int/float
# width), not only int64 and float64.
df.select_dtypes(include='number').floordiv(12)

Unnamed: 0_level_0,col1,col2,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,0.0,1.0
B,1,0.0,3.0
C,2,0.0,6.0
D,3,0.0,10.0


In [87]:
# The same result via .apply(): the lambda is called once per selected column.
# include='number' selects every numeric dtype rather than only
# int64/float64.
df.select_dtypes(include='number').apply(lambda col: col // 12)

Unnamed: 0_level_0,col1,col2,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,0.0,1.0
B,1,0.0,3.0
C,2,0.0,6.0
D,3,0.0,10.0


### map()

#### apply functions on index

In [88]:
# Index.map applies the callable to each label and returns a new Index;
# df.index itself is not modified.
df.index.map(str.lower)

Index(['a', 'b', 'c', 'd'], dtype='object', name='index')

#### apply functions on columns

In [89]:
# Lookup table from each col3 code to its label: 'a' -> 'item a', ...,
# 'd' -> 'item d'.
mapping = {letter: 'item ' + letter for letter in 'abcd'}
# Series.map translates each col3 value through the dict into a new column.
df['item'] = df['col3'].map(mapping)
df

Unnamed: 0_level_0,col1,col2,col3,col4,item
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,10,1.0,a,12.3,item a
B,20,0.9,b,45.6,item b
C,30,0.8,c,78.9,item c
D,40,0.7,d,123.4,item d


### categorical data

Advantages of using the `category` dtype: lower memory usage and faster operations such as groupby().

In [90]:
# Distinct labels in the 'item' column, in order of first appearance.
pd.unique(df['item'])

array(['item a', 'item b', 'item c', 'item d'], dtype=object)

In [93]:
# Store the same labels as a pandas categorical (dtype 'category') in a new
# column, leaving the original object-dtype 'item' column in place.
df['item_category'] = df['item'].astype('category')

In [94]:
# Re-check the summary: 'item_category' is reported with dtype category,
# while 'item' stays object.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 6 columns):
col1             4 non-null int64
col2             4 non-null float64
col3             4 non-null object
col4             4 non-null float64
item             4 non-null object
item_category    4 non-null category
dtypes: category(1), float64(2), int64(1), object(2)
memory usage: 548.0+ bytes


In [96]:
# Ordered category: the categories are listed explicitly from lowest to
# highest, and ordered=True makes comparisons between them meaningful.
df['item_ordered'] = pd.Categorical(
    df['item'],
    categories=['item a', 'item b', 'item c', 'item d'],
    ordered=True,
)

In [97]:
# Re-check the summary: 'item_ordered' shows up as a second category column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 7 columns):
col1             4 non-null int64
col2             4 non-null float64
col3             4 non-null object
col4             4 non-null float64
item             4 non-null object
item_category    4 non-null category
item_ordered     4 non-null category
dtypes: category(2), float64(2), int64(1), object(2)
memory usage: 744.0+ bytes


In [98]:
# Display the ordered column; its repr lists the categories joined by '<'.
df['item_ordered']

index
A    item a
B    item b
C    item c
D    item d
Name: item_ordered, dtype: category
Categories (4, object): [item a < item b < item c < item d]