# Chapter 5 - Getting Started with `pandas`

## 5.1 Introduction to `pandas` Data Structures

In [1]:
# Third-party libraries, imported under their conventional aliases
import numpy as np
import pandas as pd

# The two core pandas data structures, importable by their bare names as well
from pandas import Series, DataFrame

# Display configuration: do not limit the number of columns shown
pd.set_option("display.max_columns", None)

### `Series`
A `Series` is a <u>one-dimensional</u> object containing a sequence of values and an <u>associated array of data labels</u>, called its **index**. When a `Series` is displayed, the index appears on the left and the values on the right.

In [2]:
# NOTE: from this cell onward the notebook uses the directly imported
# name `Series` instead of spelling out `pd.Series`.

# Build a Series from a plain list of values
o = Series(data=[1, 3, 4, 7, 11])
display(o)

# The labels and the stored values are exposed as Series.index and
# Series.values respectively
print(o.index, o.values, sep="\n")

0     1
1     3
2     4
3     7
4    11
dtype: int64

RangeIndex(start=0, stop=5, step=1)
[ 1  3  4  7 11]


In [3]:
# Explicit labels are supplied through the `index` argument, one per value
labels = ['a', 'b', 'c', 'd', 'e']
p = Series(data=[1, 3, 4, 7, 11], index=labels)

display(p)
print(p.index, p.values, sep="\n")

a     1
b     3
c     4
d     7
e    11
dtype: int64

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
[ 1  3  4  7 11]


In [4]:
q = Series(data=[2, 4, 12], index=['c', 'b', 'a'])
display(q)

# Read a single value by its index label
print(q['a'])

# Overwrite a single value by its index label
q['a'] = 1
display(q)

# Read several values at once by passing a list of labels
# (hence the double square brackets)
q[['c', 'a']]

c     2
b     4
a    12
dtype: int64

12


c    2
b    4
a    1
dtype: int64

c    2
a    1
dtype: int64

In [5]:

p = Series(data=[1, 3, 4, 7, 11], index=['a', 'b', 'c', 'd', 'e'])
# Arithmetic on a Series keeps its index: each cube stays under its label
display(p, p.pow(3))

q = Series(data=[4000, 12000, 1600, 21000, 44000], index=['f', 'g', 'h', 'i', 'j'])
# NumPy functions applied to a Series keep its index as well
display(q, np.log(q))

a     1
b     3
c     4
d     7
e    11
dtype: int64

a       1
b      27
c      64
d     343
e    1331
dtype: int64

f     4000
g    12000
h     1600
i    21000
j    44000
dtype: int64

f     8.294050
g     9.392662
h     7.377759
i     9.952278
j    10.691945
dtype: float64

In [6]:
g = Series(data=[8.2, 9.3, 7.3, 9.9, 10.6], index=['ak', 'bl', 'cm', 'dn', 'eo'])
display(g)

# Filtering with a boolean mask keeps the labels of the surviving rows
above_nine = g > 9
display(g[above_nine])

ak     8.2
bl     9.3
cm     7.3
dn     9.9
eo    10.6
dtype: float64

bl     9.3
dn     9.9
eo    10.6
dtype: float64

In [7]:
# Passing a dictionary builds a Series whose index is the dictionary's keys
yearly_values = {2000: 100, 2001: 200, 2002: 400}
g = Series(yearly_values)
display(g)

# Like a dictionary, a Series maps labels to values, so `in` tests
# whether a label is present in the index
print(2000 in g, 2003 in g, sep="\n")

2000    100
2001    200
2002    400
dtype: int64

True
False


In [8]:
g = Series({2010: 175, 2011: 275, 2012: 400, 2013: None})
display(g)

# Missing entries are flagged True by pd.isnull(s) and by Series.isnull()
display(pd.isnull(g), g.isnull())

# Filled entries are flagged True by pd.notnull(s) and by Series.notnull()
display(pd.notnull(g), g.notnull())

2010    175.0
2011    275.0
2012    400.0
2013      NaN
dtype: float64

2010    False
2011    False
2012    False
2013     True
dtype: bool

2010    False
2011    False
2012    False
2013     True
dtype: bool

2010     True
2011     True
2012     True
2013    False
dtype: bool

2010     True
2011     True
2012     True
2013    False
dtype: bool

<hr>
### `DataFrame`

A `DataFrame` is a rectangular table of data with an ordered collection of columns, each of which can hold a different data type. It has <u>both</u> a row index and a column index.

In [9]:
# Read the loans file into a DataFrame; index_col=0 makes the first CSV
# column the row index instead of a regular data column
loans_path = 'dataset-A-loans.csv'
df = pd.read_csv(loans_path, index_col=0)
display(df)

Unnamed: 0,loan_amnt,int_rate,term,grade
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
32038416,12000.0,20.2,60 months,E
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
55949701,15000.0,16.99,60 months,D


In [10]:
# Retrieve a column from a df as a Series using df['column']. Note that the row index is preserved.
display(df['int_rate'])
# A column can also be retrieved as an attribute of the DataFrame (df.column).
# This is only valid when the column name is a valid Python identifier.
display(df.grade)

48304290     8.18
49904421    13.33
32038416    20.20
11456303     8.39
23613274    12.49
55949701    16.99
Name: int_rate, dtype: float64

48304290    B
49904421    C
32038416    E
11456303    A
23613274    B
55949701    D
Name: grade, dtype: object

In [11]:
# Positional lookup: iloc[1] returns the second row (positions start at 0)
second_row = df.iloc[1]
display(second_row)

# Label lookup: loc[label] returns the row whose index label matches
row_by_label = df.loc[32038416]
display(row_by_label)

loan_amnt         14225
int_rate          13.33
term          60 months
grade                 C
Name: 49904421, dtype: object

loan_amnt         12000
int_rate           20.2
term          60 months
grade                 E
Name: 32038416, dtype: object

In [12]:
# Assigning a sequence of values to a new column of a df (one value per row,
# with the length computed dynamically from the number of rows)
df['id'] = range(1, df.shape[0]+1)
# Assigning a scalar value to a new column of a df (the output shows it repeated on every row)
df['disburse_date'] = '1-Jan-2019'
display(df)

Unnamed: 0,loan_amnt,int_rate,term,grade,id,disburse_date
48304290,30000.0,8.18,36 months,B,1,1-Jan-2019
49904421,14225.0,13.33,60 months,C,2,1-Jan-2019
32038416,12000.0,20.2,60 months,E,3,1-Jan-2019
11456303,18000.0,8.39,36 months,A,4,1-Jan-2019
23613274,4000.0,12.49,36 months,B,5,1-Jan-2019
55949701,15000.0,16.99,60 months,D,6,1-Jan-2019


In [13]:
# The .T attribute returns the transpose: rows become columns and vice versa
transposed = df.T
display(transposed)

Unnamed: 0,48304290,49904421,32038416,11456303,23613274,55949701
loan_amnt,30000,14225,12000,18000,4000,15000
int_rate,8.18,13.33,20.2,8.39,12.49,16.99
term,36 months,60 months,60 months,36 months,36 months,60 months
grade,B,C,E,A,B,D
id,1,2,3,4,5,6
disburse_date,1-Jan-2019,1-Jan-2019,1-Jan-2019,1-Jan-2019,1-Jan-2019,1-Jan-2019


In [14]:
# The transpose() method also swaps the rows and columns of a df
flipped = df.transpose()
display(flipped)

Unnamed: 0,48304290,49904421,32038416,11456303,23613274,55949701
loan_amnt,30000,14225,12000,18000,4000,15000
int_rate,8.18,13.33,20.2,8.39,12.49,16.99
term,36 months,60 months,60 months,36 months,36 months,60 months
grade,B,C,E,A,B,D
id,1,2,3,4,5,6
disburse_date,1-Jan-2019,1-Jan-2019,1-Jan-2019,1-Jan-2019,1-Jan-2019,1-Jan-2019


<hr>
### Index Objects

`pandas` uses Index objects to hold the axis labels and other metadata (like the axis name or names). Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index.

In [15]:
# Load a second loans file; index_col=0 uses the first CSV column as the row labels
df1 = pd.read_csv('dataset-A1-loans.csv', index_col=0)
print(df1)
# The row labels of a DataFrame are held in an Index object, available as .index.
# In the output below, the label 49904421 appears twice.
loan_ids = df1.index
display(loan_ids)

          loan_amnt  int_rate        term grade
48304290    30000.0      8.18   36 months     B
49904421    14225.0     13.33   60 months     C
49904421    12000.0     20.20   60 months     E
11456303    18000.0      8.39   36 months     A
55949701    15000.0     16.99   60 months     D


Int64Index([48304290, 49904421, 49904421, 11456303, 55949701], dtype='int64')

In [16]:
# An Index supports positional access: a single position returns one label,
# while a slice returns another Index
first_id = loan_ids[0]
remaining_ids = loan_ids[1:]
print(first_id)
print(remaining_ids)

48304290
Int64Index([49904421, 49904421, 11456303, 55949701], dtype='int64')


In [17]:
# An Index cannot be modified in place; uncommenting the next line
# raises a TypeError
# loan_ids[0] = 1

# Membership tests work as they do on a set, although an Index, unlike a
# set, may hold the same label more than once
print(49904421 in loan_ids, '4' in loan_ids, sep="\n")

True
False


<hr>

**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)