# Chapter 5 - Getting Started with `pandas`

## 5.2 Essential Functionality

In [1]:
import pandas as pd
import numpy as np

### Re-Indexing 
Given a `Series`, reindexing is used to create a new object with the data re-arranged using the new index.

In [2]:
# Load the B2 membership table, using the first CSV column as the row index.
df_b2 = pd.read_csv('dataset-B2-membership.csv', index_col=0)
m_b2 = df_b2.loc[:, 'membership']
print(m_b2, end='\n\n')
# Reindex against the existing labels arranged in ascending order; reindex
# returns a new Series whose rows follow the order of the labels it is given.
labels_ascending = sorted(m_b2.index)
m_b2_reindexed = m_b2.reindex(index=labels_ascending)
print(m_b2_reindexed)

year
2014    686676
2018    762807
2016    740750
2015    718723
2017    755217
Name: membership, dtype: int64

year
2014    686676
2015    718723
2016    740750
2017    755217
2018    762807
Name: membership, dtype: int64


In [3]:
# Load the B1 membership table, using the first CSV column as the row index.
df_b1 = pd.read_csv('dataset-B1-membership.csv', index_col=0)
m_b1 = df_b1.loc[:, 'membership']
print(m_b1, end='\n\n')
# Reindex onto every label from 2010 to 2018 inclusive. With method='ffill',
# a label that is absent from the data takes the value of the closest
# preceding label (forward fill).
full_year_range = range(2010, 2019)
m_b1_reindexed = m_b1.reindex(index=full_year_range, method='ffill')
print(m_b1_reindexed)

year
2010    549878
2012    613418
2013    655126
2014    686676
2015    718723
2018    762807
Name: membership, dtype: int64

year
2010    549878
2011    549878
2012    613418
2013    655126
2014    686676
2015    718723
2016    718723
2017    718723
2018    762807
Name: membership, dtype: int64


Note that `reindex` accepts several key arguments, including `index`, `method`, `fill_value`, `limit`, `tolerance`, `level`, and `copy`.

<hr>
### Dropping Entries from an Axis
`drop` will return a new object with the selected values deleted from an axis.

In [4]:
# Load the full membership table, using the first CSV column as the row index.
df = pd.read_csv('dataset-B-membership.csv', index_col=0)
memberships = df['membership']
print(memberships, end='\n\n')
# Series.drop() returns a new Series without the given index labels;
# `memberships` itself is left untouched. range(2009, 2014) covers 2009-2013.
labels_to_drop = range(2009, 2014)
memberships_d = memberships.drop(labels=labels_to_drop)
memberships_d

year
2009    526089
2010    549878
2011    588014
2012    613418
2013    655126
2014    686676
2015    718723
2016    740750
2017    755217
2018    762807
Name: membership, dtype: int64



year
2014    686676
2015    718723
2016    740750
2017    755217
2018    762807
Name: membership, dtype: int64

In [5]:
# Work on a copy so that `df` is not affected.
df2 = df.copy()
display(df2)
# DataFrame.drop(index=...) removes rows by index label and returns a new
# frame; range(2009, 2016) covers the labels 2009-2015.
df2_rows_dropped = df2.drop(index=range(2009, 2016))
display(df2_rows_dropped)

Unnamed: 0_level_0,membership
year,Unnamed: 1_level_1
2009,526089
2010,549878
2011,588014
2012,613418
2013,655126
2014,686676
2015,718723
2016,740750
2017,755217
2018,762807


Unnamed: 0_level_0,membership
year,Unnamed: 1_level_1
2016,740750
2017,755217
2018,762807


In [6]:
# Load the loans table, using the first CSV column as the row index.
loans_df = pd.read_csv('dataset-A-loans.csv', index_col=0)
display(loans_df)
# Columns are dropped with columns=[...], which is the keyword equivalent of
# passing the labels together with axis=1. A new frame is returned.
loans_without_rate_and_term = loans_df.drop(columns=['int_rate', 'term'])
display(loans_without_rate_and_term)

Unnamed: 0,loan_amnt,int_rate,term,grade
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
32038416,12000.0,20.2,60 months,E
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
55949701,15000.0,16.99,60 months,D


Unnamed: 0,loan_amnt,grade
48304290,30000.0,B
49904421,14225.0,C
32038416,12000.0,E
11456303,18000.0,A
23613274,4000.0,B
55949701,15000.0,D


In [7]:
display(loans_df)
# With inplace=True the frame itself is modified and nothing is returned.
# Because `loans_df` loses its 'grade' column here, running this cell a second
# time raises a KeyError (the label no longer exists).
loans_df.drop(columns=['grade'], inplace=True)
display(loans_df)

Unnamed: 0,loan_amnt,int_rate,term,grade
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
32038416,12000.0,20.2,60 months,E
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
55949701,15000.0,16.99,60 months,D


Unnamed: 0,loan_amnt,int_rate,term
48304290,30000.0,8.18,36 months
49904421,14225.0,13.33,60 months
32038416,12000.0,20.2,60 months
11456303,18000.0,8.39,36 months
23613274,4000.0,12.49,36 months
55949701,15000.0,16.99,60 months


### Indexing, Selection and Filtering

The index of the `Series` or `DataFrame` can be used for slicing.

In [8]:
df_b2 = pd.read_csv('dataset-B2-membership.csv')
# Convert the 'year' column to strings and use it as the index as well
# (drop=False keeps 'year' as a regular column too).
df_b2 = (
    df_b2
    .assign(year=df_b2['year'].astype(str))
    .set_index('year', drop=False)
)
membership_b2 = df_b2['membership']
display(membership_b2)

# Pull values by index label: a single label gives a scalar,
# a list of labels gives a Series in the order requested.
print(membership_b2.loc['2014'])
print(membership_b2.loc[['2014', '2015', '2016']])
# Careful: a label slice follows the stored row order. It returns every row
# from the one labelled '2014' through the one labelled '2015' (inclusive),
# not the labels sorted between them. Reindex first if sorted order is needed.
print(membership_b2.loc['2014':'2015'])

year
2014    686676
2018    762807
2016    740750
2015    718723
2017    755217
Name: membership, dtype: int64

686676
year
2014    686676
2015    718723
2016    740750
Name: membership, dtype: int64
year
2014    686676
2018    762807
2016    740750
2015    718723
Name: membership, dtype: int64


In [9]:
# Using positions to pull values
print(membership_b2[0])
print()
print(membership_b2[[0, 2, 3]])
print()
print(membership_b2[1:2])
print()

686676

year
2014    686676
2016    740750
2015    718723
Name: membership, dtype: int64

year
2018    762807
Name: membership, dtype: int64



For a `DataFrame`, indexing with `[]` is used to retrieve columns.

In [10]:
# Load the loans table with the default integer index and give the five
# columns explicit names (the first column becomes 'id').
loan_column_names = ['id', 'loan_amnt', 'int_rate', 'term', 'grade']
loans_df = pd.read_csv('dataset-A-loans.csv').set_axis(loan_column_names, axis='columns')
display(loans_df)

Unnamed: 0,id,loan_amnt,int_rate,term,grade
0,48304290,30000.0,8.18,36 months,B
1,49904421,14225.0,13.33,60 months,C
2,32038416,12000.0,20.2,60 months,E
3,11456303,18000.0,8.39,36 months,A
4,23613274,4000.0,12.49,36 months,B
5,55949701,15000.0,16.99,60 months,D


In [11]:
# A single column label inside [] returns that column as a Series.
id_column = loans_df['id']
display(id_column)
# Note the double brackets: passing a *list* of labels returns a DataFrame
# containing just those columns.
id_and_grade = loans_df[['id', 'grade']]
display(id_and_grade)

0    48304290
1    49904421
2    32038416
3    11456303
4    23613274
5    55949701
Name: id, dtype: int64

Unnamed: 0,id,grade
0,48304290,B
1,49904421,C
2,32038416,E
3,11456303,A
4,23613274,B
5,55949701,D


By combining the row index with the column index, `iloc` and `loc` can be used to obtain slices of a `DataFrame`.

In [12]:
# Using loc to pull a slice of a df. Note that the order matters: row then column
# .loc takes the row selector first, then the column selector. Both are
# label-based, and a label slice includes its end label, so `:2` returns the
# rows labelled 0, 1 and 2.
rows_up_to_label_2 = loans_df.loc[:2, ['id', 'grade']]
display(rows_up_to_label_2)

Unnamed: 0,id,grade
0,48304290,B
1,49904421,C
2,32038416,E


In [13]:
# Build a new frame indexed by loan id: set_index returns a new object, moves
# 'id' out of the columns and names the index 'id'. `loans_df` is unchanged.
loans_df2 = loans_df.set_index('id')
display(loans_df2)
# .iloc also takes rows first, then columns, but works purely by integer
# position: row position 2 (the slice end is excluded) and column positions 1 and 3.
loans_df2.iloc[2:3, [1, 3]]

Unnamed: 0_level_0,loan_amnt,int_rate,term,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
32038416,12000.0,20.2,60 months,E
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
55949701,15000.0,16.99,60 months,D


Unnamed: 0_level_0,int_rate,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1
32038416,20.2,E


### Arithmetic and Data Alignment

The behaviour of arithmetic between objects with different indexes is important to note. When adding objects together, the index of the result will be **the union of the two indexes**. This is similar to an "outer join" on 2 tables in databases.

In [14]:
# Load the enrolment table, index it by year and keep only the two columns
# needed for this example.
uni_df2 = (
    pd.read_csv('dataset-C1-enrolment.csv')
    .set_index('year')
    .loc[:, ['sex', 'enrolment']]
)

# Enrolment for both male & female
uni_df2_mf = uni_df2.loc[uni_df2['sex'] == 'MF', 'enrolment']
display(uni_df2_mf)
# Enrolment for only female
uni_df2_f = uni_df2.loc[uni_df2['sex'] == 'F', 'enrolment']
display(uni_df2_f)

year
2012    1474
2013    1491
2016    1593
2017    1588
Name: enrolment, dtype: int64

year
2012    717
2013    736
2014    715
2015    706
2017    741
Name: enrolment, dtype: int64

When performing arithmetic operations on two `Series` objects, a value is only computed for index labels that **appear in both objects**; the operation is applied to the pair of elements sharing that label. Labels present in only one of the objects yield `NaN` in the result. This is called **alignment**.

In [15]:
# To get the male enrolment, subtract female (F) from the combined total (MF).
# The two Series are aligned on their index labels before subtracting:
# - 2014 and 2015 do not appear in uni_df2_mf, so the result is NaN
# - 2016 does not appear in uni_df2_f, so the result is NaN
uni_df2_mf.sub(uni_df2_f)

year
2012    757.0
2013    755.0
2014      NaN
2015      NaN
2016      NaN
2017    847.0
Name: enrolment, dtype: float64

In the case of `DataFrame` objects, alignment is performed on both rows and columns.

In [16]:
uni_df = pd.read_csv('dataset-C-enrolment.csv')

# Q: Find the total intake, enrolment & graduates of all students in the year 2015 - 2017.
# Take one independent frame per year; each keeps the row labels it had in `uni_df`.
uni_df_2015 = uni_df.loc[uni_df['year'] == 2015].copy()
display(uni_df_2015)

uni_df_2016 = uni_df.loc[uni_df['year'] == 2016].copy()
display(uni_df_2016)

uni_df_2017 = uni_df.loc[uni_df['year'] == 2017].copy()
display(uni_df_2017)

Unnamed: 0,year,sex,course,intake,enrolment,graduates
20,2015,MF,Law,405,1550,355
21,2015,F,Law,171,706,168


Unnamed: 0,year,sex,course,intake,enrolment,graduates
22,2016,MF,Law,399,1593,351
23,2016,F,Law,210,740,173


Unnamed: 0,year,sex,course,intake,enrolment,graduates
24,2017,MF,Law,391,1588,375
25,2017,F,Law,201,741,188


Notice that the indexes of the 3 `df`s above are <u>different</u> (they share no labels). Hence, adding them will result in all `NaN`, and the result contains all 6 index values.

In [17]:
# Add the three yearly frames as they are. Addition aligns on row labels
# before computing, so frames that share no labels give a result full of NaN.
misaligned_total = uni_df_2015.add(uni_df_2016).add(uni_df_2017)
display(misaligned_total)

Unnamed: 0,year,sex,course,intake,enrolment,graduates
20,,,,,,
21,,,,,,
22,,,,,,
23,,,,,,
24,,,,,,
25,,,,,,


To amend this, change the index for all the `df`s before performing the addition.

In [18]:
# Give every yearly frame the same row labels (0 and 1) and the same columns,
# so that rows line up when the frames are added together.
uni_df_columns = ['sex', 'intake', 'enrolment', 'graduates']

uni_df_2015 = uni_df_2015.set_axis(range(0, 2), axis='index')[uni_df_columns]
display(uni_df_2015)

uni_df_2016 = uni_df_2016.set_axis(range(0, 2), axis='index')[uni_df_columns]
display(uni_df_2016)

uni_df_2017 = uni_df_2017.set_axis(range(0, 2), axis='index')[uni_df_columns]
display(uni_df_2017)

Unnamed: 0,sex,intake,enrolment,graduates
0,MF,405,1550,355
1,F,171,706,168


Unnamed: 0,sex,intake,enrolment,graduates
0,MF,399,1593,351
1,F,210,740,173


Unnamed: 0,sex,intake,enrolment,graduates
0,MF,391,1588,375
1,F,201,741,188


In [19]:
# With matching row labels the frames add element by element, giving the total
# intake, enrolment & graduates for the period 2015 - 2017.
# Note that "adding" strings concatenates them, which is why the 'sex' column
# comes out as repeated text.
uni_df_2015.add(uni_df_2016).add(uni_df_2017)

Unnamed: 0,sex,intake,enrolment,graduates
0,MFMFMF,1195,4731,1081
1,FFF,582,2187,529


In [20]:
# If a cell is NaN in one of the operands, the matching cell in the result is
# NaN as well.
print(uni_df_2015)
# Column position 2 is 'enrolment', which holds integers. Cast it to float
# explicitly before storing a missing value: relying on pandas to upcast the
# column silently on assignment is deprecated since pandas 2.1. astype() also
# returns a fresh frame, so the assignment below does not write into a frame
# derived by selection from another one.
uni_df_2016 = uni_df_2016.astype({'enrolment': 'float64'})
uni_df_2016.iloc[0, 2] = np.nan
print(uni_df_2016)
print()
print(uni_df_2015 + uni_df_2016)

  sex  intake  enrolment  graduates
0  MF     405       1550        355
1   F     171        706        168
  sex  intake  enrolment  graduates
0  MF     399        NaN        351
1   F     210      740.0        173

    sex  intake  enrolment  graduates
0  MFMF     804        NaN        706
1    FF     381     1446.0        341


In [21]:
# To treat a missing value as zero, use the DataFrame.add() method with
# fill_value=0 instead of the + operator.
# A cell that is NaN in one of the two frames is replaced by fill_value
# BEFORE the addition, so the other frame's value is kept in the result.
uni_df_2015.add(uni_df_2016, fill_value=0)

Unnamed: 0,sex,intake,enrolment,graduates
0,MFMF,804,1550.0,706
1,FF,381,1446.0,341


When performing arithmetic between a `DataFrame` and a `Series`, the index of the `Series` is matched with the column names of the `DataFrame`, and the operation is broadcast down the rows.

In [22]:
# Keep the combined (MF) rows from 2014 onwards, indexed by year, with only the
# two numeric columns of interest. Both conditions are built from the same
# frame so the boolean mask always lines up with the rows being filtered.
is_2014_onwards = uni_df['year'] >= 2014
is_combined_sex = uni_df['sex'] == 'MF'
uni_costs_df = (
    uni_df.loc[is_2014_onwards & is_combined_sex]
    .set_index('year')
    .loc[:, ['enrolment', 'graduates']]
)
display(uni_costs_df)

# The Series index labels are matched with the DataFrame column names, and the
# multiplication is then broadcast over every row.
school_fees_series = pd.Series([100, 5500], index=['enrolment', 'graduates'])
print(school_fees_series)
uni_costs_df * school_fees_series

Unnamed: 0_level_0,enrolment,graduates
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,1514,356
2015,1550,355
2016,1593,351
2017,1588,375


enrolment     100
graduates    5500
dtype: int64


Unnamed: 0_level_0,enrolment,graduates
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,151400,1958000
2015,155000,1952500
2016,159300,1930500
2017,158800,2062500


In [23]:
# The DataFrame has no 'starting_salary' column, so the result gains an extra
# 'starting_salary' column filled with NaN.
employment_series = pd.Series([100, 5500, 4000], index=['enrolment', 'graduates', 'starting_salary'])
print(employment_series)
uni_costs_df * employment_series

enrolment           100
graduates          5500
starting_salary    4000
dtype: int64


Unnamed: 0_level_0,enrolment,graduates,starting_salary
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,151400,1958000,
2015,155000,1952500,
2016,159300,1930500,
2017,158800,2062500,


### Function Application and Mapping

`numpy` ufuncs can be used on `Series` and `DataFrame` too.

In [24]:
# Keep the intake and graduate counts of the combined (MF) rows only.
enrolment_df = uni_df.loc[uni_df['sex'] == 'MF', ['intake', 'graduates']].copy()
# A NumPy ufunc such as np.log() is applied to every element of the Series
# and returns a Series with the same index.
np.log(enrolment_df['intake'])

0     5.393628
2     5.472271
4     5.805135
6     5.874931
8     5.948035
10    5.857933
12    5.897154
14    5.924256
16    5.998937
18    5.983936
20    6.003887
22    5.988961
24    5.968708
Name: intake, dtype: float64

In [25]:
display(enrolment_df)


def fees(x):
    """Return ``x`` multiplied by 5000."""
    return x * 5000


# DataFrame.apply() calls the function on each column in turn.
display(enrolment_df.apply(fees))

Unnamed: 0,intake,graduates
0,220,187
2,238,204
4,332,207
6,356,209
8,383,207
10,350,227
12,364,329
14,374,347
16,403,368
18,397,356


Unnamed: 0,intake,graduates
0,1100000,935000
2,1190000,1020000
4,1660000,1035000
6,1780000,1045000
8,1915000,1035000
10,1750000,1135000
12,1820000,1645000
14,1870000,1735000
16,2015000,1840000
18,1985000,1780000


### Sorting and Ranking

In [26]:
display(loans_df2)

# sort_index() returns a new frame ordered by index label, lowest to highest.
loans_by_index_asc = loans_df2.sort_index(ascending=True)
display(loans_by_index_asc)

# ascending=False reverses the order: highest label first.
loans_by_index_desc = loans_df2.sort_index(ascending=False)
display(loans_by_index_desc)

Unnamed: 0_level_0,loan_amnt,int_rate,term,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
32038416,12000.0,20.2,60 months,E
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
55949701,15000.0,16.99,60 months,D


Unnamed: 0_level_0,loan_amnt,int_rate,term,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
32038416,12000.0,20.2,60 months,E
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
55949701,15000.0,16.99,60 months,D


Unnamed: 0_level_0,loan_amnt,int_rate,term,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
55949701,15000.0,16.99,60 months,D
49904421,14225.0,13.33,60 months,C
48304290,30000.0,8.18,36 months,B
32038416,12000.0,20.2,60 months,E
23613274,4000.0,12.49,36 months,B
11456303,18000.0,8.39,36 months,A


In [27]:
# sort_values() orders the rows by the values of a column (ascending by default).
# Note: missing values like NaN will fall to the end.
loans_by_rate_asc = loans_df2.sort_values(by='int_rate')
display(loans_by_rate_asc)

# ascending=False sorts in the opposite order.
loans_by_rate_desc = loans_df2.sort_values(by='int_rate', ascending=False)
display(loans_by_rate_desc)

# Pass a list to `by` to sort on several columns: rows are ordered by 'grade'
# first, and ties are broken by 'int_rate'.
loans_by_grade_then_rate = loans_df2.sort_values(by=['grade', 'int_rate'])
display(loans_by_grade_then_rate)

# Clear the *name* of the index (the 'id' header shown above the row labels).
# The index values themselves are kept.
loans_df2.index.name = None
display(loans_df2)

Unnamed: 0_level_0,loan_amnt,int_rate,term,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48304290,30000.0,8.18,36 months,B
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
49904421,14225.0,13.33,60 months,C
55949701,15000.0,16.99,60 months,D
32038416,12000.0,20.2,60 months,E


Unnamed: 0_level_0,loan_amnt,int_rate,term,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
32038416,12000.0,20.2,60 months,E
55949701,15000.0,16.99,60 months,D
49904421,14225.0,13.33,60 months,C
23613274,4000.0,12.49,36 months,B
11456303,18000.0,8.39,36 months,A
48304290,30000.0,8.18,36 months,B


Unnamed: 0_level_0,loan_amnt,int_rate,term,grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11456303,18000.0,8.39,36 months,A
48304290,30000.0,8.18,36 months,B
23613274,4000.0,12.49,36 months,B
49904421,14225.0,13.33,60 months,C
55949701,15000.0,16.99,60 months,D
32038416,12000.0,20.2,60 months,E


Unnamed: 0,loan_amnt,int_rate,term,grade
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
32038416,12000.0,20.2,60 months,E
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
55949701,15000.0,16.99,60 months,D


In [28]:
# Convert the 'loan_amnt' column into a plain Python list of its values.
l = loans_df2.loc[:, 'loan_amnt'].to_list()
print(l, end='\n\n')

[30000.0, 14225.0, 12000.0, 18000.0, 4000.0, 15000.0]



In [29]:
loanamt_2 = loans_df2.loc[:, 'loan_amnt']
print(loanamt_2)
# Series.rank() numbers the values from smallest (rank 1) to largest.
# The defaults are spelled out here: ascending order, and tied values share the
# average of their ranks. Other tie-breaking methods can be chosen via `method`.
loan_amount_ranks = loanamt_2.rank(method='average', ascending=True)
print(loan_amount_ranks)

48304290    30000.0
49904421    14225.0
32038416    12000.0
11456303    18000.0
23613274     4000.0
55949701    15000.0
Name: loan_amnt, dtype: float64
48304290    6.0
49904421    3.0
32038416    2.0
11456303    5.0
23613274    1.0
55949701    4.0
Name: loan_amnt, dtype: float64


<hr>

**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)

**Additional References**

- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.reindex.html