In [2]:
# 5.2 Essential Functionality

#1 Reindexing

import numpy as np
import pandas as pd
from pandas import Series, DataFrame


In [3]:
# Reindexing, step 1: a float Series whose labels are not in alphabetical
# order, so the reordering done by reindex is visible in the next cell.
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=list('dbac'),
)
print(obj)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64


In [4]:
# reindex returns a new Series laid out in the requested label order.
# 'e' has no entry in obj, so it shows up as NaN in the result.
obj2 = obj.reindex(index=list('abcde'))
print(obj2)


a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [5]:
# String-valued Series on the sparse integer labels 0, 2, 4.
# NOTE(review): 'bule' looks like a typo for 'blue'; left as-is so the
# recorded outputs below still match the code.
obj3 = pd.Series({0: 'bule', 2: 'purple', 4: 'yellow'})
print(obj3)


0      bule
2    purple
4    yellow
dtype: object


In [7]:
# reindex can fill the gaps it creates: method='ffill' copies the last
# preceding value forward into every newly introduced label.
print('---')
obj4 = obj3.reindex(index=range(6), method='ffill')
# obj3 is printed again to show it is unchanged; reindex returned a new Series.
print(obj3, '---', sep='\n')
obj4

---
0      bule
2    purple
4    yellow
dtype: object
---


0      bule
1      bule
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [9]:
# 3x3 frame holding 0..8; the row labels skip 'b', which the next cell
# uses to show how reindex introduces a missing row.
frame = pd.DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)

frame


Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [10]:
# Reindex the rows: label 'b' does not exist in frame, so that row comes
# back as all-NaN.
frame2 = frame.reindex(index=list('abcd'))
print(frame2)

   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


In [11]:
# Reindex along the columns instead of the rows.
# 'Utah' is not a column of frame, so it appears as a NaN column;
# 'Ohio' is not requested and is therefore left out.
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(states, axis='columns')
print(frame3)



   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


In [16]:
# 5.2.2 Dropping Entries from an Axis
#
# Five floats 0.0..4.0 labelled 'a'..'e'; used by the drop examples below.
obj = pd.Series(data=np.arange(5.0), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [21]:
# Removing entries with drop.
# drop hands back a new Series without the given label; obj is shown
# before and after the call to make clear that it is not modified.
print(obj, '--', sep='\n')

new_obj = obj.drop(labels='c')
print(new_obj, '--', sep='\n')

obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
--
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
--


a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [22]:
# drop also accepts a list of labels. The result is a new Series;
# obj is printed afterwards to show it still holds every entry.
obj1 = obj.drop(labels=['d', 'c'])
print('--', obj1, '--', obj, sep='\n')

--
a    0.0
b    1.0
e    4.0
dtype: float64
--
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64


In [23]:
# With inplace=True, drop modifies obj itself instead of returning a new
# object: after this call the 'c' entry is gone from obj.
obj.drop(labels='c', inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [None]:
# Be careful with the inplace, as it destroys any data that is dropped.

In [25]:
# 4x4 frame of 0..15, rows labelled by state and columns by number words.
data = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [26]:
# Drop rows by label (the result is displayed, not stored).
data.drop(index=['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [27]:
# Display data again: the drop call in the previous cell was not assigned
# back, so all four rows are still present.
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [29]:
# Drop columns

In [30]:
# Drop a single column by name (result displayed, data itself is not changed).
data.drop(columns='two')

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [31]:
# Drop several columns at once (result displayed, data itself is not changed).
data.drop(columns=['two', 'four'])

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [33]:
# 5.2.3 Indexing, Selection, and Filtering
#
# obj has string labels, so every positional lookup below goes through .iloc.
# Plain obj[1] / obj[[1, 3]] relied on Series.__getitem__ treating integer keys
# as positions, which pandas has deprecated (newer versions warn or raise).
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

print(obj)
print('------')

print(obj.iloc[1])        # single element by position -> scalar
print(obj.iloc[2:4])      # positions 2 and 3 (end-exclusive, like a list slice)

print(obj.iloc[[1, 3]])   # list of positions -> Series
print(obj[obj < 2])       # boolean mask: keep the entries whose value is < 2


a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
---
1.0
c    2.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64


In [35]:
# Label-based indexing on obj.
print(obj, end='\n\n')

# One label, a list of labels, and a label slice. Unlike ordinary Python
# slicing, a label slice includes its end point: 'b':'c' covers both entries.
for selection in (obj['b'], obj[['b', 'a', 'd']], obj['b':'c']):
    print(selection)
    print('------')

# Assigning through a label slice updates both 'b' and 'c' in place.
obj['b':'c'] = 5
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

1.0
------
b    1.0
a    0.0
d    3.0
dtype: float64
------
b    1.0
c    2.0
dtype: float64
------


a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [43]:

# Rebuild the 4x4 demo frame, then walk through plain [] selection on it.
data = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

# The whole frame, one column (a Series), then a list of columns (a frame).
for view in (data, data['two'], data[['three', 'one']]):
    print(view)
    print('------')

# A bare slice inside [] selects rows by position.
print(data[:2], data[1:3], sep='\n')

data

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
------
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
------
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
------
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [44]:
# Filtering with boolean conditions.
print(data, '----------', sep='\n')

# Comparing the frame with a scalar gives a same-shaped boolean frame.
below_five = data < 5
print(below_five, '------', sep='\n')

# A boolean Series selects the rows where it is True.
d1 = data.loc[data['three'] > 5]
print(d1, '------', sep='\n')

# A boolean frame used as an assignment target overwrites the matching
# cells; note that this modifies data in place.
data[below_five] = 0
data

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
------
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
------
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
------


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
#5.2.4 Selection with loc and iloc

# Select a subset of the rows and columns from a DataFrame with NumPy-like notation 
# using either axis labels (loc) or integers (iloc).

In [45]:
# Fresh copy of the 4x4 demo frame (the filter cell above overwrote values).
data = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [58]:
# Selecting columns: one name gives a Series, a list of names gives a frame.
res = data['two']
print(res, '----------', sep='\n')

res = data[['two', 'three']]
print(res, '----------', sep='\n')

# Same column lookup as data.three, written with brackets.
print(data['three'])

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
----------
          two  three
Ohio        1      2
Colorado    5      6
Utah        9     10
New York   13     14
Ohio         2
Colorado     6
Utah        10
New York    14
Name: three, dtype: int32


In [59]:
# .loc selects by label: data.loc[row_labels, column_labels].
# A single label collapses that axis; a list of labels keeps it.
res = data.loc['Colorado']                       # one row -> Series
print(res, '----------', sep='\n')

res = data.loc['Colorado', 'two']                # one cell -> scalar
print(res, '----------', sep='\n')

res = data.loc[['Ohio', 'Colorado'], 'two']      # two rows, one column -> Series
print(res, '----------', sep='\n')

res = data.loc['Colorado', ['two', 'three']]     # one row, two columns -> Series
print(res, '----------', sep='\n')

res = data.loc[['Ohio', 'Colorado'], ['two', 'three']]   # 2x2 sub-frame
print(res)

data

one      4
two      5
three    6
four     7
Name: Colorado, dtype: int32
----------
5
----------
Ohio        1
Colorado    5
Name: two, dtype: int32
----------
two      5
three    6
Name: Colorado, dtype: int32
----------
          two  three
Ohio        1      2
Colorado    5      6


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [67]:
# .iloc selects by integer position (0-based) rather than by label.

# Position 2 is the third row; the result is a Series.
res = data.iloc[2]
print(res, '----------', sep='\n')

# Positions 1 and 2 (the slice end is exclusive): second and third rows.
res = data.iloc[1:3]
print(res, '----------', sep='\n')


one       8
two       9
three    10
four     11
Name: Utah, dtype: int32
----------
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
----------


In [64]:
# loc vs. iloc: the same selection written once with labels and once with
# integer positions (rows 0 and 1 are Ohio and Colorado, column 1 is 'two').
for res in (data.loc[['Ohio', 'Colorado'], 'two'], data.iloc[[0, 1], 1]):
    print(res)
    print('----------')

data

Ohio        1
Colorado    5
Name: two, dtype: int32
----------
Ohio        1
Colorado    5
Name: two, dtype: int32
----------


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [66]:
# loc vs. iloc, continued: each label-based selection is followed by the
# equivalent position-based one.

# One row, two columns.
res = data.loc['Colorado', ['two', 'three']]
print(res, '----------', sep='\n')

res = data.iloc[1, [1, 2]]
print(res, '----------', sep='\n')

# Two rows, two columns.
res = data.loc[['Ohio', 'Colorado'], ['two', 'three']]
print(res, '----------', sep='\n')

res = data.iloc[[0, 1], [1, 2]]
print(res)


two      5
three    6
Name: Colorado, dtype: int32
----------
two      5
three    6
Name: Colorado, dtype: int32
----------
          two  three
Ohio        1      2
Colorado    5      6
----------
          two  three
Ohio        1      2
Colorado    5      6


In [68]:
# What is the output?
# Exercise: the row at position 2, with the columns at positions 3, 0 and 1
# taken in that order.
data.iloc[2, [3, 0, 1]]

In [71]:
# What is the output?
#data.iloc[1]

In [73]:
# What is the output?
#data.iloc[[1, 2], [3, 0, 1]]

In [75]:
# What is the output?
#data.loc[:'Utah', 'two']

In [51]:
# Rows where 'three' exceeds 5, restricted to the first three columns,
# written as one .loc call instead of two chained indexing steps.
data.loc[data['three'] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [None]:
# 5.2.5 Arithmetic and Data Alignment 


In [77]:
# Two Series whose label sets only partly overlap.
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series(data=[2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))
# Addition aligns on labels: the result is indexed by the union of both
# indexes, with NaN wherever a label is missing from either operand.
s1.add(s2)

a    9.4
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [78]:
# Two float frames that share only some row labels and some column labels.
df1 = pd.DataFrame(
    data=np.arange(9.).reshape((3, 3)),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=list('bcd'),
)

df2 = pd.DataFrame(
    data=np.arange(12.).reshape((4, 3)),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)

print(df1, '---', df2, '---', sep='\n')

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
---
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
---


In [79]:
# For DataFrames, alignment happens on both the rows and the columns;
# any cell that is not present in both operands becomes NaN.
print(df1.add(df2))

            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


In [80]:
# Two frames with the same row labels but no column in common:
# after alignment nothing overlaps, so both + and - give all-NaN results.
df1 = pd.DataFrame(dict(A=[1, 2]))
df2 = pd.DataFrame(dict(B=[3, 4]))

print(df1, '---', df2, '---', df1.add(df2), '---', df1.sub(df2), sep='\n')

   A
0  1
1  2
---
   B
0  3
1  4
---
    A   B
0 NaN NaN
1 NaN NaN
---
    A   B
0 NaN NaN
1 NaN NaN


In [92]:
# Page 150: arithmetic methods with fill values -- set-up.
# df1 is 3x4 and df2 is 4x5, so df2 has one extra row and one extra column.
df1 = pd.DataFrame(data=np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(data=np.arange(20.).reshape((4, 5)), columns=list('abcde'))

# Put one explicit missing value into df2 as well.
df2.at[1, 'b'] = np.nan

print(df1, '---', df2, '---', sep='\n')

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
---
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
---


In [88]:
# Neither df1 nor df2 is modified below; every operation returns a new frame.

# Plain addition: cells missing from either operand come out as NaN.
res = df1.add(df2)
print(res, '---', sep='\n')

# With fill_value, a cell missing from one operand is treated as that value
# before adding (here 0, so the other operand's value passes through).
res = df1.add(df2, fill_value=0)
print(res, '------', sep='\n')

# Same idea with 100 as the stand-in value.
res = df1.add(df2, fill_value=100)
print(res)

      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN
---
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
------
       a      b      c      d      e
0    0.0    2.0    4.0    6.0  104.0
1    9.0  105.0   13.0   15.0  109.0
2   18.0   20.0   22.0   24.0  114.0
3  115.0  116.0  117.0  118.0  119.0


In [90]:

# Reversed-operand arithmetic: 1 / df1 and df1.rdiv(1) compute the same thing.
# The first result gets its own name instead of being stored in df2, because
# a later cell still reads df2.columns and expects the 4x5 frame built above.
df1_recip = 1 / df1
# same computation, spelled with the reversed-division method
df3 = df1.rdiv(1)

print(df1)
print('---')
print(df1_recip)
print('---')
print(df3)   # same values as df1_recip
print('---')


     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
---
       a         b         c         d
0    inf  1.000000  0.500000  0.333333
1  0.250  0.200000  0.166667  0.142857
2  0.125  0.111111  0.100000  0.090909
---
       a         b         c         d
0    inf  1.000000  0.500000  0.333333
1  0.250  0.200000  0.166667  0.142857
2  0.125  0.111111  0.100000  0.090909
---


Table 5-5. Flexible arithmetic methods (each has a reversed-operand "r" variant)
----------------
add, radd            : +
sub, rsub            : -
div, rdiv            : /
floordiv, rfloordiv  : //
mul, rmul            : *
pow, rpow            : **

In [94]:
# fill_value also works with reindex: give df1 the column layout of df2,
# filling any column that df1 lacks with 0 instead of NaN.
print(df1, df2, sep='\n')
df1.reindex(df2.columns, axis='columns', fill_value=0)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [95]:
# Operations between DataFrame and Series -- first the NumPy analogue.
# Subtracting a 1-D row from a 2-D array applies it to every row (broadcasting).
arr = np.arange(12.).reshape((3, 4))
first_row = arr[0]

print(arr, '---', first_row, '---', arr - first_row, sep='\n')

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
---
[0. 1. 2. 3.]
---
[[0. 0. 0. 0.]
 [4. 4. 4. 4.]
 [8. 8. 8. 8.]]


In [96]:
# The DataFrame version of the broadcasting example above.
frame = pd.DataFrame(
    data=np.arange(12.).reshape((4, 3)),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
print(frame, '---', sep='\n')

# First row as a Series; its index is the frame's column labels.
series = frame.iloc[0]
print(series, '---', sep='\n')

# The Series index is matched against the columns and the subtraction is
# repeated for every row.
print(frame.sub(series, axis='columns'))

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
---
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
---
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0


In [98]:
# Take column 'd' as a Series; its index is the frame's row labels
# (Utah, Ohio, Texas, Oregon), not the column labels.
series3 = frame['d']
print(series3)

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64


# Subtract a column-shaped Series from every column of the frame.
print(frame)
print(series3)
print('---')

# sub is the method form of the - operator; axis chooses which axis of the
# frame the Series index is matched against.
print(frame.sub(series3, axis='index'))

print(frame.sub(series3, axis = 0))  # same as axis='index'

# For comparison: axis=1 matches series3's index against the column labels.
# NOTE(review): series3 is indexed by row labels, so presumably nothing lines
# up here and the result is all NaN -- confirm by running.
print(frame.sub(series3, axis = 1)) # vs

In [None]:
# 5.2.6 Function Application and Mapping 

In [99]:
# Random 4x3 frame drawn from a standard normal distribution.
# A seeded generator is used so that the cell prints the same numbers on
# every run (Restart & Run All reproduces the output).
rng = np.random.default_rng(12345)
frame = pd.DataFrame(rng.standard_normal((4, 3)), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print('------')

# NumPy ufuncs such as np.abs apply element-wise and keep the frame's labels.
print(np.abs(frame))
print('-------')


               b         d         e
Utah    1.462837 -0.433308 -1.115920
Ohio   -1.155191  0.150412  2.010908
Texas   0.018757  0.670879 -2.461886
Oregon  0.628439 -0.399947  1.605309
------
               b         d         e
Utah    1.462837  0.433308  1.115920
Ohio    1.155191  0.150412  2.010908
Texas   0.018757  0.670879  2.461886
Oregon  0.628439  0.399947  1.605309
-------


In [101]:
# skip
def f(x):
    """Return the spread (max minus min) of the values in x."""
    return x.max() - x.min()


# apply calls f once per column by default ...
fd1 = frame.apply(f)
# ... and once per row with axis='columns'.
fd2 = frame.apply(f, axis='columns')

print(fd1, '-------', fd2, sep='\n')


b    2.618028
d    1.104187
e    4.472794
dtype: float64
-------
Utah      2.578757
Ohio      3.166099
Texas     3.132765
Oregon    2.005256
dtype: float64


In [91]:
#skip
def f(x):
    """Return the min and max of x as a two-entry Series labelled 'min'/'max'."""
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


def fmt_2dp(x):
    """Format a single number as a string with two decimal places."""
    return '%.2f' % x


print(frame)
print('-------')

# When the applied function returns a Series, apply assembles the per-column
# results into a DataFrame (one row per entry of the returned Series).
print(frame.apply(f))

print('-------')
# Element-wise formatting. DataFrame.applymap is deprecated in recent pandas,
# so each column is mapped with Series.map instead, which gives the same
# element-wise result. The helper is no longer called `format`, a name that
# shadowed the builtin for the rest of the notebook.
print(frame.apply(lambda col: col.map(fmt_2dp)))

               b         d         e
Utah    0.610896  0.652326 -0.782994
Ohio   -0.609226  0.518462  0.303001
Texas   0.103609 -0.200211 -0.794206
Oregon -0.282736 -0.100765  0.021823
-------
            b         d         e
min -0.609226 -0.200211 -0.794206
max  0.610896  0.652326  0.303001
-------
            b      d      e
Utah     0.61   0.65  -0.78
Ohio    -0.61   0.52   0.30
Texas    0.10  -0.20  -0.79
Oregon  -0.28  -0.10   0.02


In [None]:
# 5.2.7 Sorting and Ranking 

# sort_index  : sort by the axis labels (row or column index)
#
# sort_values : sort by the data values

In [102]:
# Sorting a Series by its index labels.
s1 = pd.Series(data=range(4), index=list('dabc'))

# sort_index returns a new, label-sorted Series ...
s2 = s1.sort_index()

# ... while s1 keeps its original order.
print(s1, '-------', s2, sep='\n')


d    0
a    1
b    2
c    3
dtype: int64
-------
a    1
b    2
c    3
d    0
dtype: int64


In [112]:
# Sorting a DataFrame by its row labels.
fd1 = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=['BB', 'AA', 'CC'],
    columns=list('dabc'),
)
print(fd1, '--', sep='\n')

# Default: rows in ascending label order.
fd2 = fd1.sort_index()
print(fd2, '--', sep='\n')

# Same axis (rows), descending label order.
fd2 = fd1.sort_index(axis='index', ascending=False)
print(fd2, '--', sep='\n')


    d  a   b   c
BB  0  1   2   3
AA  4  5   6   7
CC  8  9  10  11
--
    d  a   b   c
AA  4  5   6   7
BB  0  1   2   3
CC  8  9  10  11
--
    d  a   b   c
CC  8  9  10  11
BB  0  1   2   3
AA  4  5   6   7
--


In [111]:
# Sorting along the other axis: order the columns by their labels,
# ascending first and then descending. The row order is left alone.
fd3 = fd1.sort_index(axis='columns')
fd4 = fd1.sort_index(axis='columns', ascending=False)

print(fd3, '--', fd4, sep='\n')

    a   b   c  d
BB  1   2   3  0
AA  5   6   7  4
CC  9  10  11  8
--
    d   c   b  a
BB  0   3   2  1
AA  4   7   6  5
CC  8  11  10  9


In [116]:
# sort_values: order a Series by its values rather than its labels.
obj = pd.Series(data=[4, 7, -3, 2])

# Ascending (the default) first, then descending; the original integer
# labels travel with their values.
for ascending in (True, False):
    print(obj.sort_values(ascending=ascending))


2   -3
3    2
0    4
1    7
dtype: int64
1    7
0    4
3    2
2   -3
dtype: int64


In [117]:
# Same sort with missing values in the data: as the output shows, the NaN
# entries are placed at the end for both sort directions.
obj = pd.Series(data=[4, np.nan, 7, np.nan, -3, 2])
for ascending in (True, False):
    print(obj.sort_values(ascending=ascending))

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
2    7.0
0    4.0
5    2.0
4   -3.0
1    NaN
3    NaN
dtype: float64


In [124]:
# Sorting a DataFrame by the values of one column.
frame = pd.DataFrame({'a': [0, 1, 0, 1], 'b': [4, 7, -3, 2]})
print(frame, '------', sep='\n')

# Rows ordered by column 'b', ascending.
fd1 = frame.sort_values('b')
print(fd1, '------', sep='\n')

# Rows ordered by column 'b', descending.
fd1 = frame.sort_values('b', ascending=False)
print(fd1)

   a  b
0  0  4
1  1  7
2  0 -3
3  1  2
------
   a  b
2  0 -3
3  1  2
0  0  4
1  1  7
------
   a  b
1  1  7
0  0  4
3  1  2
2  0 -3


In [126]:
# Sorting by several columns: 'a' is the primary key, 'b' breaks ties.
print(frame, '------', sep='\n')

sort_keys = ['a', 'b']
fd2 = frame.sort_values(by=sort_keys)
print(fd2, '------', sep='\n')

# ascending=False reverses the order for both keys.
fd2 = frame.sort_values(by=sort_keys, ascending=False)
print(fd2)

   a  b
0  0  4
1  1  7
2  0 -3
3  1  2
------
   a  b
2  0 -3
0  0  4
3  1  2
1  1  7
------
   a  b
1  1  7
3  1  2
0  0  4
2  0 -3


Ranking assigns ranks from one through the number of valid data points in an array. 
The rank methods for Series and DataFrame are the place to look; 
by default rank breaks ties by assigning each group the mean rank:

In [128]:
# Ranking: the data, the data in sorted order, and the rank of every entry.
# Tied values (the two 7s and the two 4s) share the mean of the ranks they span.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])

print(obj, '------', obj.sort_values(), '------', obj.rank(), sep='\n')


0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
------
1   -5
5    0
4    2
3    4
6    4
0    7
2    7
dtype: int64
------
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


In [131]:
# Rank in descending order (the largest value gets rank 1) and give every
# member of a tied group the maximum rank in that group, e.g. both 7s get 2.0.
obj.rank(ascending=False, method='max')


0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [135]:
# Ranking inside a DataFrame.
frame = pd.DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
print(frame, '---', sep='\n')

# axis='columns' ranks across the columns, i.e. the values of each row are
# ranked against one another (every output row contains the ranks 1..3).
fd = frame.rank(axis='columns')
print(fd, '---', sep='\n')

# axis=1 is the numeric spelling of the same axis and gives the same result.
fd = frame.rank(axis=1)
print(fd)

     b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
---
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0
---
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0


In [None]:
# Axis Indexes with Duplicate Labels 

In [136]:
# A Series whose index repeats the labels 'a' and 'b';
# is_unique reports whether every index label occurs only once.
obj = pd.Series(data=range(5), index=list('aabbc'))
obj.index.is_unique

False

In [137]:
# Random 4x3 frame whose row index repeats the labels 'a' and 'b'.
# A seeded generator keeps the printed numbers identical from run to run.
rng = np.random.default_rng(12345)
df = pd.DataFrame(rng.standard_normal((4, 3)), index=['a', 'a', 'b', 'b'])

print(df)
print('---')
# Selecting a duplicated label returns every matching row (a DataFrame),
# not a single row.
print(df.loc['b'])

          0         1         2
a  0.891271  1.119770  0.258931
a -0.067179 -2.500997 -0.197645
b  0.560410  0.761174  0.949222
b  0.741220  1.108114  1.249898
---
         0         1         2
b  0.56041  0.761174  0.949222
b  0.74122  1.108114  1.249898
