# What do I need to know about the pandas index?

In [18]:
import pandas as pd

In [19]:
# Read the drinks-by-country CSV from the shortened URL into a DataFrame.
url = 'http://bit.ly/drinksbycountry'
drinks = pd.read_csv(url)

In [20]:
# Preview the first rows; the leftmost, unlabeled column in the output is the index.
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [21]:
# Every DataFrame and Series has an index. When the user does not provide one explicitly, a default integer range is used.
drinks.index # the index is sometimes called the row labels

RangeIndex(start=0, stop=193, step=1)

In [22]:
# Neither the index nor the column headers are part of the data, so they do not count toward the DataFrame's dimensions.
drinks.shape # (n_rows, n_columns): the header row and the index column are not counted

(193, 6)

### Index is useful for identification

In [23]:
# Filtering on the continent column keeps each row's original index label,
# so we can still identify where every matching row sat in the full table.
is_european = drinks['continent'] == 'Europe'
drinks.loc[is_european].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
1,Albania,89,132,54,4.9,Europe
3,Andorra,245,138,312,12.4,Europe
7,Armenia,21,179,11,3.8,Europe
9,Austria,279,75,191,9.7,Europe
10,Azerbaijan,21,46,5,1.3,Europe


### Index is useful for selection

In [24]:
# We can access a single value by its row label (index) and column name
drinks.loc[3, 'wine_servings']

312

### We can change the index

In [25]:
# set_index builds the DataFrame index (row labels) from one or more existing columns.
# It returns a new object by default, so we rebind `drinks` to the result rather than
# passing inplace=True; plain assignment keeps the call chainable.
drinks = drinks.set_index('country')
drinks.head()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [28]:
# Now we can access the same value as before by country name, without having to memorize its integer index
drinks.loc['Andorra', 'wine_servings']

312

In [27]:
# Drop the index name: the row labels stay the same, only the 'country' label shown above the index disappears
drinks.index.name = None
drinks.head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [30]:
# UNDOING THE INDEX CHANGE
# rename_axis gives the index its original name back, so reset_index turns it into a
# column called 'country' (an unnamed index would become a column called 'index').
# Both methods return new objects, so we rebind `drinks` instead of using inplace=True.
drinks = drinks.rename_axis('country').reset_index()
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [31]:
# Any DataFrame or Series returned by a method has its own index (and, for a DataFrame, column names); here the statistic names form the index
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [32]:
# The summary is itself a DataFrame, so one of its rows can be selected by index label
summary = drinks.describe()
summary.loc['25%']

beer_servings                   20.0
spirit_servings                  4.0
wine_servings                    1.0
total_litres_of_pure_alcohol     1.3
Name: 25%, dtype: float64