In [2]:
import warnings
warnings.simplefilter('ignore')

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Table of Contents
>## 1. `Series` class
* 1.1. Generate `Series`
* 1.2. `Series` - Update
* 1.3. `Series` - Indexing
* 1.4. `Series` - Slicing
* 1.5. `Series` - Treat as `Dictionary`
* 1.6. `Series` - Operations

>## 2. `DataFrame` class
* 2.1. Generate `DataFrame`
* 2.2. `DataFrame` - Indexing
* 2.3. `DataFrame` - Update
* 2.4. `DataFrame` - Indexer

>## 3. `Series` and `DataFrame` - Operations
* 3.1. Count
* 3.2. Count Categorical Values
* 3.3. Sort
* 3.4. Sum
* 3.5. `apply`
* 3.6. Real values $\rightarrow$ Categorical values

>## 4. `index` - Operations
* 4.1. `set_index` and `reset_index`
* 4.2. Multi-index
* 4.3. Column index $\leftrightarrow$ Row index
* 4.4. Indexing Multi-index
* 4.5. swaplevel
* 4.6. sort_index

>## 5. Combine DataFrames
* 5.1. `merge`
* 5.2. `join`
* 5.3. `concat`

>## 6. Pivot Table & Group Analysis
* 6.1. `pivot`
* 6.2. `groupby`
* 6.3. `pivot_table` 

>## 7. Time Series
* 7.1. `DatetimeIndex`
* 7.2. `shift`
* 7.3. `resample`

>## 8. Read & Write Files
* 8.1. Supported File Formats
* 8.2. Read CSV
* 8.3. Read Text
* 8.4. Write CSV or Text

## 1. `Series` class
* Series = value + index

### 1.1. Generate `Series`
* `pd.Series()` - generates series
* `s.index` & `s.values` - return values
* `s.index.name` & `s.name` - set names
* `index`
  * default = 0, 1, ...
  * it can be string, datetime, integer, ...
  * its length should match the length of data

In [32]:
s1 = pd.Series([9904312, 3448737, 2890451, 2466052], index=["Seoul", "Busan", "Incheon", "Daegu"])
print(s1, s1.index, s1.values, sep='\n\n')

Seoul      9904312
Busan      3448737
Incheon    2890451
Daegu      2466052
dtype: int64

Index(['Seoul', 'Busan', 'Incheon', 'Daegu'], dtype='object')

[9904312 3448737 2890451 2466052]


In [33]:
s1.name = "Population"
s1.index.name = "City"
s1

City
Seoul      9904312
Busan      3448737
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.2. `Series` - Update

In [52]:
s1["Busan"] = 1.63
s1

City
Seoul      9904312
Busan            1
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

In [54]:
del s1['Busan']
s1

City
Seoul      9904312
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.3. `Series` - Indexing

In [35]:
s1[1]
s1["Busan"]
s1.Busan #only when the index label is English

3448737

In [36]:
s1[[0, 3, 1]]
s1[['Seoul', 'Daegu', 'Busan']]

City
Seoul    9904312
Daegu    2466052
Busan    3448737
Name: Population, dtype: int64

In [37]:
s1[(250e4 < s1) & (s1 < 500e4)]

City
Busan      3448737
Incheon    2890451
Name: Population, dtype: int64

### 1.4. `Series` - Slicing
* Slicing with **integer** label $\rightarrow$ the latter is **excluded**
* Slicing with **string** label $\rightarrow$ the latter is **included**

In [38]:
s1[1:3]

City
Busan      3448737
Incheon    2890451
Name: Population, dtype: int64

In [39]:
s1["Busan":"Daegu"]

City
Busan      3448737
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.5. `Series` - Treat as `Dictionary`

시리즈 객체는 라벨 값에 의해 인덱싱이 가능하므로 실질적으로 라벨 값을 키(key)로 가지는 딕셔너리 자료형과 같다고 볼 수 있다. 따라서 딕셔너리 자료형에서 제공하는 ``in`` 연산도 가능하고 ``items`` 메서드를 사용하면 ``for`` 루프를 통해 각 원소의 키(key)와 값(value)을 접근할 수도 있다.

In [40]:
"Seoul" in s1

True

In [41]:
for k, v in s1.items():
    print("%s = %d" % (k, v))

Seoul = 9904312
Busan = 3448737
Incheon = 2890451
Daegu = 2466052


#### Generate `Series` from `Dictionary`

In [42]:
s2 = pd.Series({"Seoul": 9631482, "Busan": 3393191, "Incheon": 2632035, "Daejeon": 1490158})
s2 #dictionary object does not have order - to specify order, specify index

Busan      3393191
Daejeon    1490158
Incheon    2632035
Seoul      9631482
dtype: int64

In [43]:
s2 = pd.Series({"Seoul": 9631482, "Busan": 3393191, "Incheon": 2632035, "Daejeon": 1490158}, 
               index=["Busan", "Seoul", "Incheon", "Daejeon"])
s2

Busan      3393191
Seoul      9631482
Incheon    2632035
Daejeon    1490158
dtype: int64

### 1.6. `Series` - Operations
* Operations are applied only to the **values** (**index** is invariant)

In [34]:
s1 / 1000000

City
Seoul      9.904312
Busan      3.448737
Incheon    2.890451
Daegu      2.466052
Name: Population, dtype: float64

#### Operation between multiple `Series` $\rightarrow$ Operation applied only to the data with the same index

In [48]:
s1.values - s2.values #not something we need

array([ 6511121, -6182745,   258416,   975894])

In [47]:
ds = s1 - s2
ds

Busan       55546.0
Daegu           NaN
Daejeon         NaN
Incheon    258416.0
Seoul      272830.0
dtype: float64

In [49]:
ds.notnull()

Busan       True
Daegu      False
Daejeon    False
Incheon     True
Seoul       True
dtype: bool

In [50]:
ds[ds.notnull()]

Busan       55546.0
Incheon    258416.0
Seoul      272830.0
dtype: float64

## 2. `DataFrame` class

### 2.1. Generate  `DataFrame`

In [4]:
data = {
    "2015": [9904312, 3448737, 2890451, 2466052],
    "2010": [9631482, 3393191, 2632035, 2431774],
    "2005": [9762546, 3512547, 2517680, 2456016],
    "2000": [9853972, 3655437, 2466338, 2473990],
    "지역": ["수도권", "경상권", "수도권", "경상권"],
    "2010-2015 증가율": [0.0283, 0.0163, 0.0982, 0.0141]
}

columns = ["지역", "2015", "2010", "2005", "2000", "2010-2015 증가율"]

index = ["서울", "부산", "인천", "대구"]

df = pd.DataFrame(data, index=index, columns=columns)

df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


#### Access data

In [5]:
df.values

array([['수도권', 9904312, 9631482, 9762546, 9853972, 0.0283],
       ['경상권', 3448737, 3393191, 3512547, 3655437, 0.0163],
       ['수도권', 2890451, 2632035, 2517680, 2466338, 0.0982],
       ['경상권', 2466052, 2431774, 2456016, 2473990, 0.0141]], dtype=object)

In [6]:
df.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object')

In [7]:
df.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

#### Set names

In [8]:
df.index.name = "도시"
df.columns.name = "특성"
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


#### Transpose

In [9]:
df.T

도시,서울,부산,인천,대구
특성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2431774
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


### 2.2. `DataFrame` - Indexing

In [10]:
df["지역"] #Series

도시
서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

#### To preserve `DataFrame` datatype, use list

In [12]:
df[["지역"]] #DataFrame

특성,지역
도시,Unnamed: 1_level_1
서울,수도권
부산,경상권
인천,수도권
대구,경상권


In [11]:
df[["2010", "2015"]]

특성,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,9631482,9904312
부산,3393191,3448737
인천,2632035,2890451
대구,2431774,2466052


#### Integer Indexing - cannot be used when column index has string label

In [13]:
df[0]

KeyError: 0

#### Integer Indexing - can be used when column index has integer label

In [16]:
df2 = pd.DataFrame(np.arange(12).reshape(3, 4))
df2

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [17]:
df2[2]

0     2
1     6
2    10
Name: 2, dtype: int64

#### Single Data Indexing - indexing with column label & indexing with row label

In [21]:
df["2015"]["서울"]

9904312

#### Row Indexing - use slicing

In [22]:
df[:1]

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,-1.34


In [51]:
df[1:3]

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54


In [52]:
df["서울":"부산"]

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4


### 2.3. `DataFrame` - Update

In [18]:
df["2010-2015 증가율"] = df["2010-2015 증가율"] * 100
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83
부산,경상권,3448737,3393191,3512547,3655437,1.63
인천,수도권,2890451,2632035,2517680,2466338,9.82
대구,경상권,2466052,2431774,2456016,2473990,1.41


In [19]:
df["2005-2010 증가율"] = ((df["2010"] - df["2005"]) / df["2005"] * 100).round(2)
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83,-1.34
부산,경상권,3448737,3393191,3512547,3655437,1.63,-3.4
인천,수도권,2890451,2632035,2517680,2466338,9.82,4.54
대구,경상권,2466052,2431774,2456016,2473990,1.41,-0.99


In [20]:
del df["2010-2015 증가율"]
df

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54
대구,경상권,2466052,2431774,2456016,2473990,-0.99


### 2.4. `DataFrame` - Indexer
* Enables 2D indexing (row index, column index)
* Indexers:
  * `loc` : Label-based 2D indexing
  * `iloc` : Order-2D integer indexing
  * `at`: Label-based 2D indexing (single scalar value)
  * `iat` : Order-based 2D integer indexing (single scalar value)

#### `loc` indexer

In [55]:
df = pd.DataFrame(np.arange(10, 22).reshape(3, 4),
                  index=["a", "b", "c"],
                  columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [58]:
df.loc["a", "A"]

10

In [7]:
df.loc["a", :] #extract one row as Series
#df[:1] returns DataFrame

A    10
B    11
C    12
D    13
Name: a, dtype: int64

In [3]:
df.loc["b":, "A"] #slicing

b    14
c    18
Name: A, dtype: int64

In [5]:
df.loc[["a", "b"], ["B", "D"]]

Unnamed: 0,B,D
a,11,13
b,15,17


In [6]:
df.loc[df.A > 10, :]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [60]:
def select_rows(df): #use function
    return df.A > 12

select_rows(df)

a    False
b     True
c     True
Name: A, dtype: bool

In [61]:
df.loc[select_rows(df), ["B"]]

Unnamed: 0,B
b,15
c,19


In [62]:
# df.loc[:, df[:1] <= 11]  # Wrong
df.loc[:, df.loc["a", :] <= 11] # Right

Unnamed: 0,A,B
a,10,11
b,14,15
c,18,19


In [14]:
df.loc["e"] = [90, 91, 92, 93]
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21
e,90,91,92,93


#### `loc` indexer - when indices are integers
* In such case, the indices are treated as order-less

In [63]:
df2 = pd.DataFrame(np.arange(10, 26).reshape(4, 4), columns=np.arange(1, 8, 2))
df2

Unnamed: 0,1,3,5,7
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [64]:
df2.loc[1, 1]

14

In [65]:
df2.loc[1:2, :]

Unnamed: 0,1,3,5,7
1,14,15,16,17
2,18,19,20,21


#### `iloc` indexer

In [18]:
df.iloc[0, 1]

11

In [19]:
df.iloc[:2, 2]

a    12
b    16
Name: C, dtype: int64

In [20]:
df.iloc[0, -2:]

C    12
D    13
Name: a, dtype: int64

In [21]:
df.iloc[2:3, 1:3]

Unnamed: 0,B,C
c,19,20


In [22]:
df.iloc[-1]

A    90
B    91
C    92
D    93
Name: e, dtype: int64

In [23]:
df.iloc[-1] = df.iloc[-1] * 2
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21
e,180,182,184,186


#### `at` & `iat` indexer
* Used to extract single scalar value
* Faster than `loc` & `iloc`

In [66]:
%timeit df.loc["a", "A"]

5.78 µs ± 329 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [67]:
%timeit df.at["a", "A"]

4.36 µs ± 435 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [68]:
%timeit df.iloc[0, 0]

7.52 µs ± 336 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [69]:
%timeit df.iat[0, 0]

4.76 µs ± 314 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## 3. `Series` and `DataFrame` - Operations

### 3.1. Count
* Disregard NaN

#### Series

In [1]:
s = pd.Series(range(10))
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [2]:
s.count()

9

#### DataFrame

In [3]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(5, size=(4, 4)), dtype=float)
df.iloc[2,3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [4]:
df.count()

0    4
1    4
2    4
3    3
dtype: int64

### 3.2. Count Categorical Values
* Use for integer, string, categorical values

#### Series - use `value_counts`

In [5]:
np.random.seed(1)
s2 = pd.Series(np.random.randint(6, size=100))
s2.tail()

95    4
96    5
97    2
98    4
99    3
dtype: int64

In [6]:
s2.value_counts()

1    22
0    18
4    17
5    16
3    14
2    13
dtype: int64

#### DataFrame - use `value_count` for each column

In [7]:
df[0].value_counts()

3.0    2
4.0    1
0.0    1
Name: 0, dtype: int64

### 3.3. Sort
* `sort_index` - sort by index
* `sort_values` - sort by values
* In both cases `NaN` values goes to the end

In [8]:
s2.value_counts().sort_index()

0    18
1    22
2    13
3    14
4    17
5    16
dtype: int64

In [9]:
s.sort_values()

0    0.0
1    1.0
2    2.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
3    NaN
dtype: float64

#### `ascending=False` to sort in opposite direction

In [10]:
s.sort_values(ascending=False)

9    9.0
8    8.0
7    7.0
6    6.0
5    5.0
4    4.0
2    2.0
1    1.0
0    0.0
3    NaN
dtype: float64

#### `sort_values` - specify the column by which the data should be sorted

In [11]:
df.sort_values(by=1)

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


#### `by` parameter can be list - priority of sort

In [12]:
df.sort_values(by=[1, 2])

Unnamed: 0,0,1,2,3
1,3.0,0.0,2.0,1.0
0,0.0,0.0,3.0,2.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


### 3.4. Sum
* Use `sum(axis)`
  * `axis=1` for row sum
  * `axis=0` for column sum (default)

In [13]:
np.random.seed(1)
df2 = pd.DataFrame(np.random.randint(10, size=(4, 8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,5,8,9,5,0,0,1,7
1,6,9,2,4,5,2,4,2
2,4,7,7,9,1,7,0,6
3,9,9,7,6,9,1,0,1


In [14]:
df2.sum(axis=1)

0    35
1    34
2    41
3    42
dtype: int64

In [15]:
df2["RowSum"] = df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,5,8,9,5,0,0,1,7,35
1,6,9,2,4,5,2,4,2,34
2,4,7,7,9,1,7,0,6,41
3,9,9,7,6,9,1,0,1,42


In [16]:
df2.sum()

0          24
1          33
2          25
3          24
4          15
5          10
6           5
7          16
RowSum    152
dtype: int64

In [17]:
df2.loc["ColTotal", :] = df2.sum()
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,5.0,8.0,9.0,5.0,0.0,0.0,1.0,7.0,35.0
1,6.0,9.0,2.0,4.0,5.0,2.0,4.0,2.0,34.0
2,4.0,7.0,7.0,9.0,1.0,7.0,0.0,6.0,41.0
3,9.0,9.0,7.0,6.0,9.0,1.0,0.0,1.0,42.0
ColTotal,24.0,33.0,25.0,24.0,15.0,10.0,5.0,16.0,152.0


### 3.5. `apply`

In [3]:
df3 = pd.DataFrame({
        'A': [1, 3, 4, 3, 4],
        'B': [2, 3, 1, 2, 3],
        'C': [1, 5, 2, 4, 4]
    })
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


#### use `lambda` function

In [19]:
df3.apply(lambda x: x.max() - x.min())

A    3
B    2
C    4
dtype: int64

#### set `axis=1` to apply to rows

In [20]:
df3.apply(lambda x: x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

#### use `value_counts`

In [21]:
df3.apply(pd.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


#### use `fillna` to fill NaN values & use `astype` to change data type

In [22]:
df3.apply(pd.value_counts).fillna(0).astype(int)

Unnamed: 0,A,B,C
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


### 3.6. Real values $\rightarrow$ Categorical values
#### `cut`: set threshold
  * `bins` parameter can be:
    * **int** : Defines the number of equal-width bins in the range of x 
      * The range of x is extended by .1% on each side to include the minimum and maximum values of x
    * **sequence of scalars** : Defines the bin edges allowing for non-uniform width
      * No extension of the range of x is done.
    * **IntervalIndex** : Defines the exact bins to be used.

In [5]:
ages = [0, 2, 10, 21, 23, 37, 31, 61, 20, 41, 32, 100]

In [7]:
bins = [1, 30, 60, 99]
labels = ["Young", "Middle-Age", "Old"]
cats = pd.cut(ages, bins, labels=labels)
cats

[NaN, Young, Young, Young, Young, ..., Old, Young, Middle-Age, Middle-Age, NaN]
Length: 12
Categories (3, object): [Young < Middle-Age < Old]

#### `cut` method returns `Cetagorical` class object

In [12]:
type(cats)

pandas.core.categorical.Categorical

In [13]:
cats.categories

Index(['Young', 'Middle-Age', 'Old'], dtype='object')

In [14]:
cats.codes

array([-1,  0,  0,  0,  0,  1,  1,  2,  0,  1,  1, -1], dtype=int8)

In [15]:
df4 = pd.DataFrame(ages, columns=["ages"])
df4["age_cat"] = pd.cut(df4.ages, bins, labels=labels)
df4

Unnamed: 0,ages,age_cat
0,0,
1,2,Young
2,10,Young
3,21,Young
4,23,Young
5,37,Middle-Age
6,31,Middle-Age
7,61,Old
8,20,Young
9,41,Middle-Age


#### `qcut`: discretize variable into equal-sized buckets

In [16]:
data = np.random.randn(1000)
cats = pd.qcut(data, 4, labels=["Q1", "Q2", "Q3", "Q4"])
cats

[Q3, Q2, Q1, Q1, Q3, ..., Q4, Q2, Q2, Q2, Q2]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [17]:
pd.value_counts(cats)

Q4    250
Q3    250
Q2    250
Q1    250
dtype: int64

## 4. `index` - Operations

### 4.1. `set_index` and `reset_index`

#### `set_index`
* delete original index
* one of columns $\rightarrow$ index

In [19]:
np.random.seed(0)
df1 = pd.DataFrame(np.vstack([list('ABCDE'), 
                              np.round(np.random.rand(3, 5), 2)]).T,
                  columns=["C1", "C2", "C3", "C4"])
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [20]:
df2 = df1.set_index("C1")
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


#### `reset_index`
* original index $\rightarrow$ first column
* `drop=True` to drop the original index

In [21]:
df2.reset_index()

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [5]:
df2.reset_index(drop=True)

Unnamed: 0,C2,C3,C4
0,0.55,0.65,0.79
1,0.72,0.44,0.53
2,0.6,0.89,0.57
3,0.54,0.96,0.93
4,0.42,0.38,0.07


### 4.2. Multi-index
#### Column index: assign "list of lists" to `columns`

In [22]:
np.random.seed(0)
df3 = pd.DataFrame(np.round(np.random.randn(5, 4), 2),
                   columns=[["A", "A", "B", "B"],
                            ["C1", "C2", "C1", "C2"]])
df3

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


#### use `names` attribute of `columns` object to set names

In [23]:
df3.columns.names = ["Cidx1", "Cidx2"]
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


#### Row index: assign "list of lists" to `index`
* use `names` attribute of `index` object to set names

In [24]:
np.random.seed(0)
df4 = pd.DataFrame(np.round(np.random.randn(6, 4), 2),
                   columns=[["A", "A", "B", "B"],
                            ["C", "D", "C", "D"]],
                   index=[["M", "M", "M", "F", "F", "F"],
                          ["id_" + str(i + 1) for i in range(3)] * 2])
df4.columns.names = ["Cidx1", "Cidx2"]
df4.index.names = ["Ridx1", "Ridx2"]
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


### 4.3. Column index $\leftrightarrow$ Row index
* `stack()`: Column index $\rightarrow$ Row index
* `unstack()`: Row index $\rightarrow$ COlumn index

In [25]:
df4.stack("Cidx1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx2,C,D
Ridx1,Ridx2,Cidx1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,A,1.76,0.4
M,id_1,B,0.98,2.24
M,id_2,A,1.87,-0.98
M,id_2,B,0.95,-0.15
M,id_3,A,-0.1,0.41
M,id_3,B,0.14,1.45
F,id_1,A,0.76,0.12
F,id_1,B,0.44,0.33
F,id_2,A,1.49,-0.21
F,id_2,B,0.31,-0.85


In [27]:
df4.unstack("Ridx2")

Cidx1,A,A,A,A,A,A,B,B,B,B,B,B
Cidx2,C,C,C,D,D,D,C,C,C,D,D,D
Ridx2,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3
Ridx1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
F,0.76,1.49,-2.55,0.12,-0.21,0.65,0.44,0.31,0.86,0.33,-0.85,-0.74
M,1.76,1.87,-0.1,0.4,-0.98,0.41,0.98,0.95,0.14,2.24,-0.15,1.45


### 4.4. Indexing Multi-index
* Use `tuple`

In [29]:
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


In [30]:
df3[("B", "C1")]

0    0.98
1    0.95
2    0.14
3    0.44
4    0.31
Name: (B, C1), dtype: float64

In [31]:
df3.loc[0, ("B", "C1")]

0.98

In [32]:
df3.loc[0, ("B", "C1")] = 100
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,100.0,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


#### `iloc` indexer $\rightarrow$ tuple multi-indexing is unavailable

In [17]:
df3.iloc[0, 2]

100.0

In [19]:
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


In [20]:
df4.loc[("M", "id_1"), ("A", "C")]

1.76

In [21]:
df4.loc[:, ("A", "C")]

Ridx1  Ridx2
M      id_1     1.76
       id_2     1.87
       id_3    -0.10
F      id_1     0.76
       id_2     1.49
       id_3    -2.55
Name: (A, C), dtype: float64

In [22]:
df4.loc[("M", "id_1"), :]

Cidx1  Cidx2
A      C        1.76
       D        0.40
B      C        0.98
       D        2.24
Name: (M, id_1), dtype: float64

In [23]:
df4.loc[("All", "All"), :] = df4.sum()
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74
All,All,3.23,0.39,3.68,2.28


### 4.5. `swaplevel`

* `swaplevel(i, j, axis)`
  * swap `i`th & `j`th index
  * `axis=0` for row index (default)
  * `axis=1` for column index

In [33]:
df5 = df4.swaplevel("Ridx1", "Ridx2")
df5

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,M,1.76,0.4,0.98,2.24
id_2,M,1.87,-0.98,0.95,-0.15
id_3,M,-0.1,0.41,0.14,1.45
id_1,F,0.76,0.12,0.44,0.33
id_2,F,1.49,-0.21,0.31,-0.85
id_3,F,-2.55,0.65,0.86,-0.74


In [34]:
df6 = df4.swaplevel("Cidx1", "Cidx2", 1)
df6

Unnamed: 0_level_0,Cidx2,C,D,C,D
Unnamed: 0_level_1,Cidx1,A,A,B,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


### 4.6. `sort_index`
* set `axis` & `level`

In [35]:
df5.sort_index(level=0)

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,F,0.76,0.12,0.44,0.33
id_1,M,1.76,0.4,0.98,2.24
id_2,F,1.49,-0.21,0.31,-0.85
id_2,M,1.87,-0.98,0.95,-0.15
id_3,F,-2.55,0.65,0.86,-0.74
id_3,M,-0.1,0.41,0.14,1.45


In [36]:
df6.sort_index(axis=1, level=0)

Unnamed: 0_level_0,Cidx2,C,C,D,D
Unnamed: 0_level_1,Cidx1,A,B,A,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.98,0.4,2.24
M,id_2,1.87,0.95,-0.98,-0.15
M,id_3,-0.1,0.14,0.41,1.45
F,id_1,0.76,0.44,0.12,0.33
F,id_2,1.49,0.31,-0.21,-0.85
F,id_3,-2.55,0.86,0.65,-0.74


## 5. Combine DataFrames

### 5.1. `merge`
* Combine based on shared column(or index) $\rightarrow$ key
* `how =`: inner(default) / left / right / outer
* `on`: the column to be merged on (if not specified, the dataframes will be merged on every shared column)
* `_x` or `_y`: suffixes applied to overlapping columns
* `left_on` & `right_on`: when the columns to be merged on have different names
  * `pd.merge(df1, df2, left_on='A', right_on="B")`
* `left_index` or `right_index`: set `True` to merge on index

In [8]:
df1 = pd.DataFrame({
    'ID': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
    'Name': ['James', 'Jason', 'John', 'Juliet', 'Jean', 'June', 'Jake']
}, columns=['ID', 'Name'])

df2 = pd.DataFrame({
    'ID': [1001, 1002, 1003, 1004, 1005, 1008, 1009],
    'Sex': ['Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male']
}, columns=['ID', 'Sex'])

pd.merge(df1, df2)

Unnamed: 0,ID,Name,Sex
0,1001,James,Male
1,1002,Jason,Male
2,1003,John,Male
3,1004,Juliet,Female
4,1005,Jean,Female


#### `how='left'`

In [10]:
pd.merge(df1, df2, how='left')

Unnamed: 0,ID,Name,Sex
0,1001,James,Male
1,1002,Jason,Male
2,1003,John,Male
3,1004,Juliet,Female
4,1005,Jean,Female
5,1006,June,
6,1007,Jake,


#### `how='right'`

In [11]:
pd.merge(df1, df2, how='right')

Unnamed: 0,ID,Name,Sex
0,1001,James,Male
1,1002,Jason,Male
2,1003,John,Male
3,1004,Juliet,Female
4,1005,Jean,Female
5,1008,,Female
6,1009,,Male


#### `how='outer'`

In [9]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,ID,Name,Sex
0,1001,James,Male
1,1002,Jason,Male
2,1003,John,Male
3,1004,Juliet,Female
4,1005,Jean,Female
5,1006,June,
6,1007,Jake,
7,1008,,Female
8,1009,,Male


#### When there are 1+ data for same key value $\rightarrow$ create every possible combination

In [13]:
df1 = pd.DataFrame({
    'Type': ['setosa', 'setosa', 'virginica', 'virginica'],
    'Petal Length': [1.4, 1.3, 1.5, 1.3]},
    columns=['Type', 'Petal Length'])
df1

Unnamed: 0,Type,Petal Length
0,setosa,1.4
1,setosa,1.3
2,virginica,1.5
3,virginica,1.3


In [12]:
df2 = pd.DataFrame({
    'Type': ['setosa', 'virginica', 'virginica', 'versicolor'],
    'Petal Width': [0.4, 0.3, 0.5, 0.3]},
    columns=['Type', 'Petal Width'])
df2

Unnamed: 0,Type,Petal Width
0,setosa,0.4
1,virginica,0.3
2,virginica,0.5
3,versicolor,0.3


In [14]:
pd.merge(df1, df2)

Unnamed: 0,Type,Petal Length,Petal Width
0,setosa,1.4,0.4
1,setosa,1.3,0.4
2,virginica,1.5,0.3
3,virginica,1.5,0.5
4,virginica,1.3,0.3
5,virginica,1.3,0.5


### 5.2. `join`
* An alternative for `merge`

In [21]:
df1.join(df2, on=['Type'], lsuffix='_x', rsuffix='_y')

Unnamed: 0,Type_x,Petal Length,Type_y,Petal Width
0,setosa,1.4,,
1,setosa,1.3,,
2,virginica,1.5,,
3,virginica,1.3,,


### 5.3. `concat`
* Simply concatenates DataFrames

In [22]:
s1 = pd.Series([0, 1], index=['A', 'B'])
s2 = pd.Series([2, 3, 4], index=['A', 'B', 'C'])

In [23]:
s1

A    0
B    1
dtype: int64

In [24]:
s2

A    2
B    3
C    4
dtype: int64

In [25]:
pd.concat([s1, s2])

A    0
B    1
A    2
B    3
C    4
dtype: int64

#### set `axis=1` to stack horizontally

In [28]:
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2), 
    index=['a', 'b', 'c'], 
    columns=['Data1', 'Data2'])
df1

Unnamed: 0,Data1,Data2
a,0,1
b,2,3
c,4,5


In [29]:
df2 = pd.DataFrame(
    5 + np.arange(4).reshape(2, 2),
    index=['a', 'c'],
    columns=['Data3', 'Data4'])
df2

Unnamed: 0,Data3,Data4
a,5,6
c,7,8


In [30]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,Data1,Data2,Data3,Data4
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


## 6. Pivot Table & Group Analysis

### 6.1. `pivot`
* Selecting data based on **two columns**
* 3 Parameters
  * 1st Column $\rightarrow$ Row index
  * 2nd Column $\rightarrow$ Column index
  * 3rd Column $\rightarrow$ Data
* Fill `NaN` for non-existing data points

In [31]:
data = {
    "도시": [
        "서울", "서울", "서울", "부산", "부산", "부산", "인천", "인천"
    ],
    "연도": [
        "2015", "2010", "2005", "2015", "2010", "2005", "2015", "2010"
    ],
    "인구": [
        9904312, 9631482, 9762546, 3448737, 3393191, 3512547, 2890451, 2632035
    ],
    "지역": [
        "수도권", "수도권", "수도권", "경상권", "경상권", "경상권", "수도권", "수도권"
    ]
}
columns = ["도시", "연도", "인구", "지역"]
df1 = pd.DataFrame(data, columns=columns)
df1

Unnamed: 0,도시,연도,인구,지역
0,서울,2015,9904312,수도권
1,서울,2010,9631482,수도권
2,서울,2005,9762546,수도권
3,부산,2015,3448737,경상권
4,부산,2010,3393191,경상권
5,부산,2005,3512547,경상권
6,인천,2015,2890451,수도권
7,인천,2010,2632035,수도권


In [32]:
df1.pivot("도시", "연도", "인구")

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,2632035.0,2890451.0


#### Equivalent way of making pivot table - using `set_index` and `unstack`

In [33]:
df1.set_index(["도시", "연도"])[["인구"]].unstack()

Unnamed: 0_level_0,인구,인구,인구
연도,2005,2010,2015
도시,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,2632035.0,2890451.0


#### Each data point should be uniquely defined based on row & column index
* If not, error occurs

In [34]:
df1.pivot("지역", "연도", "인구") #Error

ValueError: Index contains duplicate entries, cannot reshape

### 6.2. `groupby`
* **Group Analysis**
  * If data points are not uniquely defined based on row & column index,
    * $\rightarrow$ **group analysis** should be done for each group
  * How To
    * 1. Use `groupby` method on `Series` or `DataFrame`
    * 2. Apply **group operation** on `group` object
* **`groupby` method**
  * Parameters
    * List of Rows (or Columns)
    * Row index
  * Returns
    * `GroupBy` class object $\rightarrow$ group operation methods available 
    * [available methods](https://pandas.pydata.org/pandas-docs/stable/api.html#groupby)
  * Frequently-used group operation methods
    * `size()`, `count()`
    * `mean()`, `median()`, `min()`, `max()`
    * `sum()`, `prod()`, `std()`, `var()`, `quantile()`
    * `first()`, `last()`
    * `agg()`, `aggregate()` #customized function OR list of customized functions (in `str`)
    * `describe()` #returns DataFrame
    * `apply()` #returns DataFrame with customized functions
    * `transform()` #transforme data using group operations

#### Example 1 - Average of `data1` vs `key1`

In [35]:
np.random.seed(0)
df2 = pd.DataFrame({
    'key1': ['A', 'A', 'B', 'B', 'A'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': [1, 2, 3, 4, 5],
    'data2': [10, 20, 30, 40, 50]
})
df2

Unnamed: 0,data1,data2,key1,key2
0,1,10,A,one
1,2,20,A,two
2,3,30,B,one
3,4,40,B,two
4,5,50,A,one


#### `groupby` & `mean`

In [40]:
df2.groupby(df2.key1).mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.666667,26.666667
B,3.5,35.0


#### If `data2` is unnecessary,

In [43]:
df2.data1.groupby(df2.key1).mean()

#or equivalently
df2.groupby(df2.key1)["data1"].mean()

#or equivalently
df2.groupby(df2.key1).mean()["data1"]

key1
A    2.666667
B    3.500000
Name: data1, dtype: float64

#### Example 2 - Average of `data1` vs `(key1, key2)`
* Use list of keys
* The result is similar to the pivot table

In [44]:
df2.data1.groupby([df2.key1, df2.key2]).sum()

key1  key2
A     one     6
      two     2
B     one     3
      two     4
Name: data1, dtype: int64

#### Use `unstack` to better visualize the result

In [45]:
df2.data1.groupby([df2["key1"], df2["key2"]]).sum().unstack("key2")

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,6,2
B,3,4


#### Example 3 - Customized function
* Iris data $\rightarrow$ calculate max/min

In [48]:
#import seaborn as sns
iris = sns.load_dataset("iris")

#Customized function
def peak_to_peak_ratio(x):
    return x.max() / x.min()

#Use agg() method
iris.groupby(iris.species).agg(peak_to_peak_ratio)

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,1.348837,1.913043,1.9,6.0
versicolor,1.428571,1.7,1.7,1.8
virginica,1.612245,1.727273,1.533333,1.785714


#### Example 4 - Use `describe` to obtain descriptive statstics

In [49]:
iris.groupby(iris.species).describe().T

Unnamed: 0,species,setosa,versicolor,virginica
petal_length,count,50.0,50.0,50.0
petal_length,mean,1.462,4.26,5.552
petal_length,std,0.173664,0.469911,0.551895
petal_length,min,1.0,3.0,4.5
petal_length,25%,1.4,4.0,5.1
petal_length,50%,1.5,4.35,5.55
petal_length,75%,1.575,4.6,5.875
petal_length,max,1.9,5.1,6.9
petal_width,count,50.0,50.0,50.0
petal_width,mean,0.246,1.326,2.026


#### Example 5 - Use `apply` to create `DataFrame` for each group - Similar to `describe)

In [51]:
#Returns 3 biggest data points
def max3(df):
    return df.sort_values(by="petal_length")[:3]

#apply
iris.groupby(iris.species).apply(max3)

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal_length,sepal_width,petal_length,petal_width,species
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
setosa,22,4.6,3.6,1.0,0.2,setosa
setosa,13,4.3,3.0,1.1,0.1,setosa
setosa,14,5.8,4.0,1.2,0.2,setosa
versicolor,98,5.1,2.5,3.0,1.1,versicolor
versicolor,93,5.0,2.3,3.3,1.0,versicolor
versicolor,57,4.9,2.4,3.3,1.0,versicolor
virginica,106,4.9,2.5,4.5,1.7,virginica
virginica,126,6.2,2.8,4.8,1.8,virginica
virginica,138,6.0,3.0,4.8,1.8,virginica


#### Example 6 - Use `transform` to directly manipulate data

In [52]:
#Returns size
def q3cut(s):
    return pd.qcut(s, 3, labels=["S", "M", "L"])
    
iris["petal_length_class"] = iris.groupby(iris.species)["petal_length"].transform(q3cut)
iris[["petal_length", "petal_length_class"]].tail(10)

Unnamed: 0,petal_length,petal_length_class
140,5.6,M
141,5.1,S
142,5.1,S
143,5.9,L
144,5.7,M
145,5.2,S
146,5.0,S
147,5.2,S
148,5.4,M
149,5.1,S


## 6.3. `pivot_table` (`groupby` + `pivot`)
* `pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, margins_name='All')`
* **Parameters:**
  * `data`: DataFrame to analyze / not needed when used as method
  * `values`: Column to analyze
  * `index`: Key (or list of keys) that will be used as **row index**
  * `columns`: Key (or list of keys) that will be used as **column index**
  * `aggfunc`: Function to be applied
  * `fill_value`: Value to replace missing values with
  * `margins`: Add all row / columns (e.g. for subtotal / grand totals)
  * `margins_name`: name of the margins 

#### Example

In [54]:
df2.pivot_table("data1", "key1", "key2", margins=True, margins_name="Avg")

key2,one,two,Avg
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,3,2,2.666667
B,3,4,3.5
Avg,3,3,3.0


#### Multi-index table

In [22]:
df2.pivot_table("data1", index=["key1", "key2"])

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
A,one,3
A,two,2
B,one,3
B,two,4


## 7. Time Series

### 7.1. `DatetimeIndex`
* Data type for Datetime
* Generate with `pd.to_datetime` or `pd.date_range`

#### `pd.to_datetime`

In [55]:
date_str = ["2018, 1, 1", "2018, 1, 4", "2018, 1, 5", "2018, 1, 6"]
idx = pd.to_datetime(date_str)
idx

DatetimeIndex(['2018-01-01', '2018-01-04', '2018-01-05', '2018-01-06'], dtype='datetime64[ns]', freq=None)

In [2]:
np.random.seed(0)
s = pd.Series(np.random.randn(4), index=idx)
s

2018-01-01    1.764052
2018-01-04    0.400157
2018-01-05    0.978738
2018-01-06    2.240893
dtype: float64

#### `pd.date_range`
* First date & Last date $\rightarrow$ range of dates
* `freq` parameter: specify how to generate data_range
  * `s`: second
  * `T`: minute
  * `H`: hour
  * `D`: day
  * `B`: weekday
  * `W`: week(Sunday)
  * `W-MON`: week(Monday)
  * `M`: last day of each month
  * `MS`: first day of each month
  * `BM`: last weekday of each month
  * `BMS`: first weekday of each month
  * `WOM-2THU`: second Thursday of each month
  * `Q-JAN`: last day of the first month of each quarter
  * `Q-DEC`: last day of the last month of each quarter
  * [more info](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases)

In [56]:
pd.date_range("2018-4-1", "2018-4-30")

# or equivalently
pd.date_range(start="2018-4-1", periods=30)

# or equivalently
pd.date_range("2018-4-1", "2018-4-30", freq="D")

DatetimeIndex(['2018-04-01', '2018-04-02', '2018-04-03', '2018-04-04',
               '2018-04-05', '2018-04-06', '2018-04-07', '2018-04-08',
               '2018-04-09', '2018-04-10', '2018-04-11', '2018-04-12',
               '2018-04-13', '2018-04-14', '2018-04-15', '2018-04-16',
               '2018-04-17', '2018-04-18', '2018-04-19', '2018-04-20',
               '2018-04-21', '2018-04-22', '2018-04-23', '2018-04-24',
               '2018-04-25', '2018-04-26', '2018-04-27', '2018-04-28',
               '2018-04-29', '2018-04-30'],
              dtype='datetime64[ns]', freq='D')

### 7.2. `shift`
* shift data
* shift datetime (`freq` should be specified)

In [64]:
np.random.seed(0)
ts = pd.Series(np.random.randn(4), index=pd.date_range("2018-1-1", periods=4, freq="M"))
ts

2018-01-31    1.764052
2018-02-28    0.400157
2018-03-31    0.978738
2018-04-30    2.240893
Freq: M, dtype: float64

In [65]:
ts.shift(2) #shift data-only

2018-01-31         NaN
2018-02-28         NaN
2018-03-31    1.764052
2018-04-30    0.400157
Freq: M, dtype: float64

In [66]:
ts.shift(1, freq="M") #shift date-only

2018-02-28    1.764052
2018-03-31    0.400157
2018-04-30    0.978738
2018-05-31    2.240893
Freq: M, dtype: float64

### 7.3. `resample`
* down-sampling: data decrease $\rightarrow$ group operation required
* up-sampling: data increase $\rightarrow$ data augmentation required

In [80]:
np.random.seed(0)
ts = pd.Series(np.random.randn(100), index=pd.date_range("2018-1-1", periods=100, freq="D"))
ts.tail(20)

2018-03-22   -1.165150
2018-03-23    0.900826
2018-03-24    0.465662
2018-03-25   -1.536244
2018-03-26    1.488252
2018-03-27    1.895889
2018-03-28    1.178780
2018-03-29   -0.179925
2018-03-30   -1.070753
2018-03-31    1.054452
2018-04-01   -0.403177
2018-04-02    1.222445
2018-04-03    0.208275
2018-04-04    0.976639
2018-04-05    0.356366
2018-04-06    0.706573
2018-04-07    0.010500
2018-04-08    1.785870
2018-04-09    0.126912
2018-04-10    0.401989
Freq: D, dtype: float64

#### down-sampling
* group operation required

In [87]:
ts.resample('10D').first()
#ts.resample('10D').mean()

2018-01-01    1.764052
2018-01-11    0.144044
2018-01-21   -2.552990
2018-01-31    0.154947
2018-02-10   -1.048553
2018-02-20   -0.895467
2018-03-02   -0.672460
2018-03-12    0.729091
2018-03-22   -1.165150
2018-04-01   -0.403177
dtype: float64

#### set `closed='right'` to include right limit in each group

In [86]:
ts.resample('10D', closed="right").first()

2017-12-22    1.764052
2018-01-01    0.400157
2018-01-11    1.454274
2018-01-21    0.653619
2018-01-31    0.378163
2018-02-10   -1.420018
2018-02-20    0.386902
2018-03-02   -0.359553
2018-03-12    0.128983
2018-03-22    0.900826
2018-04-01    1.222445
dtype: float64

#### `ohlc` method - return open, high, low, close

In [89]:
ts.resample('10D').ohlc()

Unnamed: 0,open,high,low,close
2018-01-01,1.764052,2.240893,-0.977278,0.410599
2018-01-11,0.144044,1.494079,-0.854096,-0.854096
2018-01-21,-2.55299,2.269755,-2.55299,1.469359
2018-01-31,0.154947,1.230291,-1.980796,-0.302303
2018-02-10,-1.048553,1.950775,-1.70627,-0.21274
2018-02-20,-0.895467,0.428332,-1.180632,-0.362741
2018-03-02,-0.67246,0.462782,-1.726283,0.051945
2018-03-12,0.729091,1.139401,-1.234826,0.056165
2018-03-22,-1.16515,1.895889,-1.536244,1.054452
2018-04-01,-0.403177,1.78587,-0.403177,0.401989


#### up-sampling
* forward filling: propagate LAST valid observation forward / **`ffill` method**
* backward filling: use NEXT valid observation to fill gap / **`bfill` method**

In [90]:
ts.resample('30s').ffill().head(10)

2018-01-01 00:00:00    1.764052
2018-01-01 00:00:30    1.764052
2018-01-01 00:01:00    1.764052
2018-01-01 00:01:30    1.764052
2018-01-01 00:02:00    1.764052
2018-01-01 00:02:30    1.764052
2018-01-01 00:03:00    1.764052
2018-01-01 00:03:30    1.764052
2018-01-01 00:04:00    1.764052
2018-01-01 00:04:30    1.764052
Freq: 30S, dtype: float64

In [91]:
ts.resample('30s').bfill().head(10)

2018-01-01 00:00:00    1.764052
2018-01-01 00:00:30    0.400157
2018-01-01 00:01:00    0.400157
2018-01-01 00:01:30    0.400157
2018-01-01 00:02:00    0.400157
2018-01-01 00:02:30    0.400157
2018-01-01 00:03:00    0.400157
2018-01-01 00:03:30    0.400157
2018-01-01 00:04:00    0.400157
2018-01-01 00:04:30    0.400157
Freq: 30S, dtype: float64

## 8. Read & Write Files

### 8.1. Supported File Formats
* CSV
* Excel
* HTML
* JSON
* HDF5
* SAS
* STATA
* SQL

### 8.2. Read CSV

#### Create a CSV file using `%%writefile`

In [34]:
%%writefile sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Overwriting sample1.csv


#### Read from CSV

In [35]:
pd.read_csv('sample1.csv')

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### Specify column index

In [36]:
%%writefile sample2.csv
1, 1.11, one
2, 2.22, two
3, 3.33, three

Overwriting sample2.csv


In [37]:
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### Choose a column & Use it as row index

In [38]:
pd.read_csv('sample1.csv', index_col='c1')

Unnamed: 0_level_0,c2,c3
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.11,one
2,2.22,two
3,3.33,three


### 8.3. Read Text
* Use `sep` parameter
* `\s+`: blank of varying length

In [39]:
%%writefile sample3.txt
c1        c2        c3        c4 
0.179181 -1.538472  1.347553  0.43381
1.024209  0.087307 -1.281997  0.49265
0.417899 -2.002308  0.255245 -1.10515

Writing sample3.txt


In [40]:
pd.read_table('sample3.txt', sep='\s+')

Unnamed: 0,c1,c2,c3,c4
0,0.179181,-1.538472,1.347553,0.43381
1,1.024209,0.087307,-1.281997,0.49265
2,0.417899,-2.002308,0.255245,-1.10515


#### Skip Rows

In [41]:
%%writefile sample4.txt
파일 제목: sample4.txt
데이터 포맷의 설명: 
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample4.txt


In [42]:
pd.read_csv('sample4.txt', skiprows=[0, 1])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### NaN Values

In [44]:
%%writefile sample5.csv
c1, c2, c3
1, 1.11, one
2,, two
N, 3.33, three

Writing sample5.csv


In [45]:
df = pd.read_csv('sample5.csv', na_values=['N'])
df

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


### 8.4 Write CSV or Text

In [53]:
df.to_csv('sample6.csv')

#### `sep` parameter can be customized

In [49]:
df.to_csv('sample7.txt', sep='|')

In [50]:
!cat sample7.txt

|c1| c2| c3
0|1.0|1.11| one
1|2.0|| two
2||3.33| three


#### Check the file content using `!cat` (use `!type` in Windows)

In [48]:
!cat sample6.csv

,c1, c2, c3
0,1.0,1.11, one
1,2.0,, two
2,,3.33, three


#### NaN values

In [51]:
df.to_csv('sample8.csv', na_rep='NN')

In [52]:
!cat sample8.csv

,c1, c2, c3
0,1.0,1.11, one
1,2.0,NN, two
2,NN,3.33, three


#### `index` & `header` parameters

In [19]:
df.index = ["a", "b", "c"]
df

Unnamed: 0,c1,c2,c3
a,1.0,1.11,one
b,2.0,,two
c,,3.33,three


In [20]:
df.to_csv('sample9.csv', index=False, header=False)

In [21]:
!cat sample9.csv

1.0,1.11, one
2.0,, two
,3.33, three
