In [1]:
import numpy as np
import pandas as pd

# Consequences of Duplicate Labels 

In [2]:
# Build a Series whose index repeats the label 'a'
s1 = pd.Series(
    data=[0, 1, 2],
    index=['a', 'a', 'b'],
)
s1

a    0
a    1
b    2
dtype: int64

In [3]:
# Reindexing needs unique labels, so this call on s1 (which repeats 'a')
# is expected to raise; the error is caught and shown instead of a traceback
try:
    s1.reindex(index=['a', 'b', 'c'])
except ValueError as err:
    print("ValueError:", err)

ValueError: cannot reindex on an axis with duplicate labels


In [4]:
# Build a 2x3 DataFrame in which two columns share the label 'A'
df1 = pd.DataFrame(
    data=[[0, 1, 2], [3, 4, 5]],
    columns=['A', 'A', 'B'],
)
df1

Unnamed: 0,A,A,B
0,0,1,2
1,3,4,5


In [5]:
# The label 'B' occurs once among the columns,
# so selecting it yields a Series
df1.loc[:, 'B']

0    2
1    5
Name: B, dtype: int64

In [7]:
# The label 'A' occurs twice among the columns,
# so selecting it yields a DataFrame holding both columns
df1.loc[:, 'A']

Unnamed: 0,A,A
0,0,1
1,3,4


In [8]:
# Build a one-column DataFrame whose row index repeats the label 'a'
df2 = pd.DataFrame(
    data={'A': [0, 1, 2]},
    index=['a', 'a', 'b'],
)
df2

Unnamed: 0,A
a,0
a,1
b,2


In [None]:
# The row label 'b' occurs once in the index,
# so selecting it yields a Series
df2.loc['b', :]

A    2
Name: b, dtype: int64

In [None]:
# The row label 'a' occurs twice in the index,
# so selecting it yields a DataFrame holding both rows
df2.loc['a', :]

Unnamed: 0,A
a,0
a,1


# Duplicate Label Detection

In [11]:
# Check whether DataFrame index labels are unique
# (evaluates to False here: the label 'a' appears twice in the index)
df2.index.is_unique

False

In [13]:
# Check whether DataFrame column labels are unique
# (evaluates to True here: the frame has the single column 'A')
df2.columns.is_unique

True

In [14]:
# Flag every index label that repeats an earlier one;
# the first occurrence of each label is reported as False
df2.index.duplicated(keep='first')

array([False,  True, False])

In [15]:
# Keep only the first row for each index label;
# later rows that repeat a label are dropped
df2.loc[~df2.index.duplicated(keep='first')]

Unnamed: 0,A
a,0
b,2


In [16]:
# Collapse rows that share an index label into one row per label
# by taking the mean of each group
(
    df2
    .groupby(level=0)
    .mean()
)

Unnamed: 0,A
a,0.5
b,2.0


# Disallowing Duplicate Labels

In [17]:
# Build a Series whose index repeats 'a', then ask pandas to disallow
# duplicate labels on it; the flag change is expected to raise, and the
# error is caught and shown instead of a traceback
try:
    (
        pd.Series(data=[0, 1, 2], index=['a', 'a', 'b'])
        .set_flags(allows_duplicate_labels=False)
    )
except ValueError as err:
    print("ValueError:", err)

ValueError: Index has duplicates.
      positions
label          
a        [0, 1]


In [18]:
# Build a DataFrame whose row and column labels are all distinct,
# then flag it so that duplicate labels are disallowed from here on
df = (
    pd.DataFrame(data={'A': [0, 1, 2, 3]}, index=['x', 'y', 'X', 'Y'])
    .set_flags(allows_duplicate_labels=False)
)
df

Unnamed: 0,A
x,0
y,1
X,2
Y,3


In [19]:
# Check whether DataFrame allows duplicate labels
# (False here, because df was created with set_flags(allows_duplicate_labels=False))
df.flags.allows_duplicate_labels

False

In [21]:
# Create a copy of DataFrame where duplicate labels are allowed.
# The copy gets its own name instead of rebinding `df2`: earlier cells use
# `df2` for the frame with duplicate row labels, and overwriting it here
# would make those cells behave differently when re-run out of order.
df_dups_allowed = df.set_flags(allows_duplicate_labels=True)
df_dups_allowed.flags.allows_duplicate_labels

True

In [22]:
# Upper-casing the row labels would make 'x' collide with 'X' and 'y' with 'Y'.
# Because df disallows duplicate labels, the rename is expected to raise;
# the error is caught and shown instead of a traceback
try:
    df.rename(index=str.upper)
except ValueError as err:
    print("ValueError:", err)

ValueError: Index has duplicates.
      positions
label          
X        [0, 2]
Y        [1, 3]
