Pandas Indexing and Selecting Data

In [None]:
import pandas as pd
import numpy as np

# Demo frame: standard-normal samples, rows labelled 'a'..'h' and columns
# labelled 'A'..'D'. Values vary from run to run unless a seed is set beforehand.
row_labels = list('abcdefgh')
column_labels = list('ABCD')
df = pd.DataFrame(
    np.random.randn(len(row_labels), len(column_labels)),
    index=row_labels,
    columns=column_labels,
)

# .loc is label-based. A bare ':' in the row position keeps every row, and a
# single column label returns that column as a Series.
print(df.loc[:, 'A'], '\n')

# A list of column labels returns a DataFrame holding just those columns.
print(df.loc[:, ['A', 'C']], '\n')

# Lists in both positions: only the listed rows, only the listed columns.
print(df.loc[['a', 'b', 'f', 'h'], ['A', 'C']])

a   -0.660480
b    1.495407
c    0.224211
d    2.712920
e   -0.037679
f   -0.604411
g   -1.570894
h    0.073878
Name: A, dtype: float64 

          A         C
a -0.660480 -0.272436
b  1.495407  1.480672
c  0.224211 -0.572076
d  2.712920 -0.610116
e -0.037679 -0.302579
f -0.604411  1.842179
g -1.570894  0.234154
h  0.073878  0.506091 

          A         C
a -0.660480 -0.272436
b  1.495407  1.480672
f -0.604411  1.842179
h  0.073878  0.506091


In [None]:
# Label-based row slice over all columns. Unlike ordinary Python slices,
# a .loc slice includes BOTH endpoints, so 'a':'d' yields rows a, b, c and d.
print(df.loc['a':'d'],'\n')

# Build a boolean mask: compare every value in row 'a' against zero.
# This prints the mask itself (a bool Series indexed by column label);
# it does not filter df.
print(df.loc['a']>0,'\n')

# Position-based row slice: .iloc excludes the stop position, so [:4]
# returns the first four rows with all columns.
print(df.iloc[:4])

          A         B         C         D
a -0.660480 -0.223393 -0.272436  0.738605
b  1.495407  0.007590  1.480672  0.313712
c  0.224211  0.524557 -0.572076 -0.827286
d  2.712920 -0.513437 -0.610116  0.915804 

A    False
B    False
C    False
D     True
Name: a, dtype: bool 

          A         B         C         D
a -0.660480 -0.223393 -0.272436  0.738605
b  1.495407  0.007590  1.480672  0.313712
c  0.224211  0.524557 -0.572076 -0.827286
d  2.712920 -0.513437 -0.610116  0.915804


Integer and Index Slicing

In [None]:
# .iloc selects by integer position; slice stops are exclusive.
print("Integer Slicing")

# First four rows, every column.
first_four_rows = df.iloc[:4]
print(first_four_rows, '\n')

# Rows at positions 1-4 crossed with columns at positions 2-3.
row_and_column_window = df.iloc[1:5, 2:4]
print(row_and_column_window, '\n')

# Explicit position lists pick non-contiguous rows/columns; the two plain
# slices that follow are printed under the same banner.
print("Slicing through a list of values")

# Rows at positions 1, 3, 5 and columns at positions 1, 3.
picked_cells = df.iloc[[1, 3, 5], [1, 3]]
print(picked_cells, '\n')

# Rows at positions 1-2, every column.
two_rows = df.iloc[1:3, :]
print(two_rows, '\n')

# Every row, columns at positions 1-2.
two_columns = df.iloc[:, 1:3]
print(two_columns)

Integer Slicing
          A         B         C         D
a -0.660480 -0.223393 -0.272436  0.738605
b  1.495407  0.007590  1.480672  0.313712
c  0.224211  0.524557 -0.572076 -0.827286
d  2.712920 -0.513437 -0.610116  0.915804 

          C         D
b  1.480672  0.313712
c -0.572076 -0.827286
d -0.610116  0.915804
e -0.302579 -0.424715 

Slicing through a list of values
          B         D
b  0.007590  0.313712
d -0.513437  0.915804
f  1.343966 -1.077907 

          A         B         C         D
b  1.495407  0.007590  1.480672  0.313712
c  0.224211  0.524557 -0.572076 -0.827286 

          B         C
a -0.223393 -0.272436
b  0.007590  1.480672
c  0.524557 -0.572076
d -0.513437 -0.610116
e  3.633740 -0.302579
f  1.343966  1.842179
g -0.456820  0.234154
h -1.638495  0.506091


Pandas DataFrame Sorting

In [None]:
# Row labels are given out of order (and the columns as col2, col1) so that
# the effect of sorting the index is visible in the printed frames.
shuffled_labels = [1, 4, 6, 2, 3, 5, 0, 7]
unsorted_df = pd.DataFrame(
    np.random.randn(len(shuffled_labels), 2),
    index=shuffled_labels,
    columns=['col2', 'col1'],
)
print("Unsorted DataFrame\n", unsorted_df)

# sort_index() returns a new frame ordered by row label (ascending by
# default); unsorted_df itself is left as it was.
sorted_df = unsorted_df.sort_index(ascending=True)
print('\n', sorted_df, '\n')

# Same sort with the row labels in descending order.
sorted_df = unsorted_df.sort_index(ascending=False)
print(sorted_df)

Unsorted DataFrame
        col2      col1
1 -0.722302 -1.439385
4 -1.428118 -0.196661
6  2.044815 -0.305100
2  0.014601 -0.667543
3 -0.881335 -0.073252
5 -0.327825 -0.451010
0 -0.234624 -0.605275
7  1.034371 -0.588796

        col2      col1
0 -0.234624 -0.605275
1 -0.722302 -1.439385
2  0.014601 -0.667543
3 -0.881335 -0.073252
4 -1.428118 -0.196661
5 -0.327825 -0.451010
6  2.044815 -0.305100
7  1.034371 -0.588796 

       col2      col1
7  1.034371 -0.588796
6  2.044815 -0.305100
5 -0.327825 -0.451010
4 -1.428118 -0.196661
3 -0.881335 -0.073252
2  0.014601 -0.667543
1 -0.722302 -1.439385
0 -0.234624 -0.605275


In [None]:
# Sort along the column axis: the column labels are put in order
# (col1 before col2) while the rows keep their existing order.
sorted_df = unsorted_df.sort_index(axis='columns')
print(sorted_df)

# Small integer frame with repeated col1 values, so tie handling is visible.
unsorted_df2 = pd.DataFrame({
    'col1': [2, 1, 1, 1],
    'col2': [1, 3, 2, 4],
})

# Order the rows by the values of a single column.
sorted_df = unsorted_df2.sort_values('col1')
print('\n', sorted_df, '\n')

# Order by col1 first; rows that tie on col1 are then ordered by col2.
sort_keys = ['col1', 'col2']
sorted_df = unsorted_df2.sort_values(by=sort_keys)
print(sorted_df, '\n')

# Choose the sorting algorithm explicitly. mergesort is stable, so rows
# with equal col1 keep their original relative order.
sorted_df = unsorted_df2.sort_values(by='col1', kind='mergesort')
print(sorted_df)

       col1      col2
1 -1.439385 -0.722302
4 -0.196661 -1.428118
6 -0.305100  2.044815
2 -0.667543  0.014601
3 -0.073252 -0.881335
5 -0.451010 -0.327825
0 -0.605275 -0.234624
7 -0.588796  1.034371

    col1  col2
1     1     3
2     1     2
3     1     4
0     2     1 

   col1  col2
2     1     2
1     1     3
3     1     4
0     2     1 

   col1  col2
1     1     3
2     1     2
3     1     4
0     2     1
