In [2]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Table of Contents
>## 1. `Series` class
* 1.1. Generate `Series`
* 1.2. `Series` - Update
* 1.3. `Series` - Indexing
* 1.4. `Series` - Slicing
* 1.5. `Series` - Treat as `Dictionary`
* 1.6. `Series` - Operations

>## 2. `DataFrame` class
* 2.1. Generate `DataFrame`
* 2.2. `DataFrame` - Indexing
* 2.3. `DataFrame` - Update
* 2.4. `DataFrame` - Indexer

>## 3. `Series` and `DataFrame` - Operations
* 3.1. Count
* 3.2. Count Categorical Values
* 3.3. Sort
* 3.4. Sum
* 3.5. `apply`
* 3.6. Real values $\rightarrow$ Categorical values

>## 4. `index` - Operations
* 4.1. `set_index` and `reset_index`
* 4.2. Multi-index
* 4.3. Column index $\leftrightarrow$ Row index
* 4.4. Indexing Multi-index
* 4.5. swaplevel
* 4.6. sort_index

>## 5. Read & Write Files
* 5.1. Supported File Formats
* 5.2. Read CSV
* 5.3. Read Text
* 5.4. Write CSV or Text

## 1. `Series` class
* Series = value + index

### 1.1. Generate `Series`
* `pd.Series()` - generates series
* `s.index` & `s.values` - return the index and the values of the series
* `s.index.name` & `s.name` - set names
* `index`
  * default = 0, 1, ...
  * it can be string, datetime, integer, ...
  * its length should match the length of data

In [32]:
# Population per city: the values and their city labels are given as parallel lists.
s1 = pd.Series(
    data=[9904312, 3448737, 2890451, 2466052],
    index=["Seoul", "Busan", "Incheon", "Daegu"],
)
# Show the Series, its index and its raw values, each separated by a blank line.
print('\n\n'.join(str(part) for part in (s1, s1.index, s1.values)))

Seoul      9904312
Busan      3448737
Incheon    2890451
Daegu      2466052
dtype: int64

Index(['Seoul', 'Busan', 'Incheon', 'Daegu'], dtype='object')

[9904312 3448737 2890451 2466052]


In [33]:
s1.name = "Population"
s1.index.name = "City"
s1

City
Seoul      9904312
Busan      3448737
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.2. `Series` - Update

In [52]:
# Update a copy: the following sections still index and slice the original
# `s1` and show Busan with its original population.
s1_updated = s1.copy()
# NOTE(review): `s1` holds int64 values and the recorded output shows 1.63
# stored as 1; newer pandas releases may upcast or reject this assignment
# instead — confirm for the pandas version in use.
s1_updated["Busan"] = 1.63
s1_updated

City
Seoul      9904312
Busan            1
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

In [54]:
# Delete from a copy: the cells in sections 1.3-1.6 still look up "Busan"
# in `s1` and would raise KeyError if the entry were removed here.
s1_without_busan = s1.copy()
del s1_without_busan['Busan']
s1_without_busan

City
Seoul      9904312
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.3. `Series` - Indexing

In [35]:
s1.iloc[1]   # by integer position (explicit; plain s1[1] would be looked up as a label first)
s1["Busan"]  # by label
s1.Busan     # attribute access: only when the label is a valid Python identifier

3448737

In [36]:
s1.iloc[[0, 3, 1]]                # list of integer positions
s1[['Seoul', 'Daegu', 'Busan']]  # list of labels

City
Seoul    9904312
Daegu    2466052
Busan    3448737
Name: Population, dtype: int64

In [37]:
s1[(250e4 < s1) & (s1 < 500e4)]

City
Busan      3448737
Incheon    2890451
Name: Population, dtype: int64

### 1.4. `Series` - Slicing
* Slicing by **integer position** $\rightarrow$ the end point is **excluded**
* Slicing by **string label** $\rightarrow$ the end point is **included**

In [38]:
s1[1:3]

City
Busan      3448737
Incheon    2890451
Name: Population, dtype: int64

In [39]:
s1["Busan":"Daegu"]

City
Busan      3448737
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.5. `Series` - Treat as `Dictionary`

시리즈 객체는 라벨 값에 의해 인덱싱이 가능하므로 실질적으로 라벨 값을 키(key)로 가지는 딕셔너리 자료형과 같다고 볼 수 있다. 따라서 딕셔너리 자료형에서 제공하는 ``in`` 연산도 가능하고 ``items`` 메서드를 사용하면 ``for`` 루프를 통해 각 원소의 키(key)와 값(value)을 접근할 수도 있다.

In [40]:
"Seoul" in s1

True

In [41]:
for k, v in s1.items():
    print("%s = %d" % (k, v))

Seoul = 9904312
Busan = 3448737
Incheon = 2890451
Daegu = 2466052


#### Generate `Series` from `Dictionary`

In [42]:
s2 = pd.Series({"Seoul": 9631482, "Busan": 3393191, "Incheon": 2632035, "Daejeon": 1490158})
# The recorded output lists the keys alphabetically rather than in the order
# written above, so do not rely on the dict's order; pass `index=` to fix the
# order explicitly.
# NOTE(review): newer Python/pandas versions presumably keep insertion order — confirm.
s2

Busan      3393191
Daejeon    1490158
Incheon    2632035
Seoul      9631482
dtype: int64

In [43]:
s2 = pd.Series({"Seoul": 9631482, "Busan": 3393191, "Incheon": 2632035, "Daejeon": 1490158}, 
               index=["Busan", "Seoul", "Incheon", "Daejeon"])
s2

Busan      3393191
Seoul      9631482
Incheon    2632035
Daejeon    1490158
dtype: int64

### 1.6. `Series` - Operations
* Operations are applied only to the **values** (**index** is invariant)

In [34]:
s1 / 1000000

City
Seoul      9.904312
Busan      3.448737
Incheon    2.890451
Daegu      2.466052
Name: Population, dtype: float64

#### Operation between multiple `Series` $\rightarrow$ Operation applied only to the data with the same index

In [48]:
s1.values - s2.values #not something we need

array([ 6511121, -6182745,   258416,   975894])

In [47]:
ds = s1 - s2
ds

Busan       55546.0
Daegu           NaN
Daejeon         NaN
Incheon    258416.0
Seoul      272830.0
dtype: float64

In [49]:
ds.notnull()

Busan       True
Daegu      False
Daejeon    False
Incheon     True
Seoul       True
dtype: bool

In [50]:
ds[ds.notnull()]

Busan       55546.0
Incheon    258416.0
Seoul      272830.0
dtype: float64

## 2. `DataFrame` class

### 2.1. Generate  `DataFrame`

In [4]:
data = {
    "2015": [9904312, 3448737, 2890451, 2466052],
    "2010": [9631482, 3393191, 2632035, 2431774],
    "2005": [9762546, 3512547, 2517680, 2456016],
    "2000": [9853972, 3655437, 2466338, 2473990],
    "지역": ["수도권", "경상권", "수도권", "경상권"],
    "2010-2015 증가율": [0.0283, 0.0163, 0.0982, 0.0141]
}

columns = ["지역", "2015", "2010", "2005", "2000", "2010-2015 증가율"]

index = ["서울", "부산", "인천", "대구"]

df = pd.DataFrame(data, index=index, columns=columns)

df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


#### Access data

In [5]:
df.values

array([['수도권', 9904312, 9631482, 9762546, 9853972, 0.0283],
       ['경상권', 3448737, 3393191, 3512547, 3655437, 0.0163],
       ['수도권', 2890451, 2632035, 2517680, 2466338, 0.0982],
       ['경상권', 2466052, 2431774, 2456016, 2473990, 0.0141]], dtype=object)

In [6]:
df.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object')

In [7]:
df.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

#### Set names

In [8]:
df.index.name = "도시"
df.columns.name = "특성"
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


#### Transpose

In [9]:
df.T

도시,서울,부산,인천,대구
특성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2431774
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


### 2.2. `DataFrame` - Indexing

In [10]:
df["지역"] #Series

도시
서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

#### To preserve `DataFrame` datatype, use list

In [12]:
df[["지역"]] #DataFrame

특성,지역
도시,Unnamed: 1_level_1
서울,수도권
부산,경상권
인천,수도권
대구,경상권


In [11]:
df[["2010", "2015"]]

특성,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,9631482,9904312
부산,3393191,3448737
인천,2632035,2890451
대구,2431774,2466052


#### Integer Indexing - cannot be used when column index has string label

In [13]:
# An integer key is looked up as a column *label*; this frame only has string
# labels, so the lookup fails. Catch the error so that Run All can continue.
try:
    df[0]
except KeyError as err:
    print("KeyError: %s" % err)

KeyError: 0

#### Integer Indexing - can be used when column index has integer label

In [16]:
df2 = pd.DataFrame(np.arange(12).reshape(3, 4))
df2

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [17]:
df2[2]

0     2
1     6
2    10
Name: 2, dtype: int64

#### Single Data Indexing - indexing with column label & indexing with row label

In [21]:
df["2015"]["서울"]

9904312

#### Row Indexing - use slicing

In [22]:
df[:1]

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283


In [51]:
df[1:3]

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982


In [52]:
df["서울":"부산"]

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163


### 2.3. `DataFrame` - Update

In [18]:
df["2010-2015 증가율"] = df["2010-2015 증가율"] * 100
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83
부산,경상권,3448737,3393191,3512547,3655437,1.63
인천,수도권,2890451,2632035,2517680,2466338,9.82
대구,경상권,2466052,2431774,2456016,2473990,1.41


In [19]:
df["2005-2010 증가율"] = ((df["2010"] - df["2005"]) / df["2005"] * 100).round(2)
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83,-1.34
부산,경상권,3448737,3393191,3512547,3655437,1.63,-3.4
인천,수도권,2890451,2632035,2517680,2466338,9.82,4.54
대구,경상권,2466052,2431774,2456016,2473990,1.41,-0.99


In [20]:
del df["2010-2015 증가율"]
df

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54
대구,경상권,2466052,2431774,2456016,2473990,-0.99


### 2.4. `DataFrame` - Indexer
* Enables 2D indexing (row index, column index)
* Indexers:
  * `loc` : Label-based 2D indexing
  * `iloc` : Order-based 2D integer indexing
  * `at`: Label-based 2D indexing (single scalar value)
  * `iat` : Order-based 2D integer indexing (single scalar value)

#### `loc` indexer

In [55]:
df = pd.DataFrame(np.arange(10, 22).reshape(3, 4),
                  index=["a", "b", "c"],
                  columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [58]:
df.loc["a", "A"]

10

In [7]:
df.loc["a", :] #extract one row as Series
#df[:1] returns DataFrame

A    10
B    11
C    12
D    13
Name: a, dtype: int64

In [3]:
df.loc["b":, "A"] #slicing

b    14
c    18
Name: A, dtype: int64

In [5]:
df.loc[["a", "b"], ["B", "D"]]

Unnamed: 0,B,D
a,11,13
b,15,17


In [6]:
df.loc[df.A > 10, :]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [60]:
def select_rows(df, threshold=12):
    """Return a boolean mask marking the rows whose ``A`` value exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a column named ``A``.
    threshold : scalar, default 12
        A row is selected when ``df.A > threshold``.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df.index``; it can be passed to ``loc``
        as the row selector.
    """
    return df.A > threshold

select_rows(df)

a    False
b     True
c     True
Name: A, dtype: bool

In [61]:
df.loc[select_rows(df), ["B"]]

Unnamed: 0,B
b,15
c,19


In [62]:
# df.loc[:, df[:1] <= 11]  # Wrong
df.loc[:, df.loc["a", :] <= 11] # Right

Unnamed: 0,A,B
a,10,11
b,14,15
c,18,19


In [14]:
df.loc["e"] = [90, 91, 92, 93]
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21
e,90,91,92,93


#### `loc` indexer - when indices are integers
* In such a case, the integers are treated as labels, not positions - e.g. `df2.loc[1:2, :]` includes the row labeled `2`

In [63]:
df2 = pd.DataFrame(np.arange(10, 26).reshape(4, 4), columns=np.arange(1, 8, 2))
df2

Unnamed: 0,1,3,5,7
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [64]:
df2.loc[1, 1]

14

In [65]:
df2.loc[1:2, :]

Unnamed: 0,1,3,5,7
1,14,15,16,17
2,18,19,20,21


#### `iloc` indexer

In [18]:
df.iloc[0, 1]

11

In [19]:
df.iloc[:2, 2]

a    12
b    16
Name: C, dtype: int64

In [20]:
df.iloc[0, -2:]

C    12
D    13
Name: a, dtype: int64

In [21]:
df.iloc[2:3, 1:3]

Unnamed: 0,B,C
c,19,20


In [22]:
df.iloc[-1]

A    90
B    91
C    92
D    93
Name: e, dtype: int64

In [23]:
df.iloc[-1] = df.iloc[-1] * 2
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21
e,180,182,184,186


#### `at` & `iat` indexer
* Used to extract single scalar value
* Faster than `loc` & `iloc`

In [66]:
%timeit df.loc["a", "A"]

5.78 µs ± 329 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [67]:
%timeit df.at["a", "A"]

4.36 µs ± 435 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [68]:
%timeit df.iloc[0, 0]

7.52 µs ± 336 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [69]:
%timeit df.iat[0, 0]

4.76 µs ± 314 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## 3. `Series` and `DataFrame` - Operations

### 3.1. Count
* Disregard NaN

#### Series

In [1]:
# Create the Series as float from the start: NaN cannot be stored in an
# integer Series, so assigning it would otherwise force a dtype change.
s = pd.Series(range(10), dtype=float)
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [2]:
s.count()

9

#### DataFrame

In [3]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(5, size=(4, 4)), dtype=float)
df.iloc[2,3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [4]:
df.count()

0    4
1    4
2    4
3    3
dtype: int64

### 3.2. Count Categorical Values
* Use for integer, string, categorical values

#### Series - use `value_counts`

In [5]:
np.random.seed(1)
s2 = pd.Series(np.random.randint(6, size=100))
s2.tail()

95    4
96    5
97    2
98    4
99    3
dtype: int64

In [6]:
s2.value_counts()

1    22
0    18
4    17
5    16
3    14
2    13
dtype: int64

#### DataFrame - use `value_counts` for each column

In [7]:
df[0].value_counts()

3.0    2
4.0    1
0.0    1
Name: 0, dtype: int64

### 3.3. Sort
* `sort_index` - sort by index
* `sort_values` - sort by values
* In both cases `NaN` values go to the end

In [8]:
s2.value_counts().sort_index()

0    18
1    22
2    13
3    14
4    17
5    16
dtype: int64

In [9]:
s.sort_values()

0    0.0
1    1.0
2    2.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
3    NaN
dtype: float64

#### `ascending=False` to sort in opposite direction

In [10]:
s.sort_values(ascending=False)

9    9.0
8    8.0
7    7.0
6    6.0
5    5.0
4    4.0
2    2.0
1    1.0
0    0.0
3    NaN
dtype: float64

#### `sort_values` - specify the column by which the data should be sorted

In [11]:
df.sort_values(by=1)

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


#### `by` parameter can be list - priority of sort

In [12]:
df.sort_values(by=[1, 2])

Unnamed: 0,0,1,2,3
1,3.0,0.0,2.0,1.0
0,0.0,0.0,3.0,2.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


### 3.4. Sum
* Use `sum(axis)`
  * `axis=1` for row sum
  * `axis=0` for column sum (default)

In [13]:
np.random.seed(1)
df2 = pd.DataFrame(np.random.randint(10, size=(4, 8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,5,8,9,5,0,0,1,7
1,6,9,2,4,5,2,4,2
2,4,7,7,9,1,7,0,6
3,9,9,7,6,9,1,0,1


In [14]:
df2.sum(axis=1)

0    35
1    34
2    41
3    42
dtype: int64

In [15]:
df2["RowSum"] = df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,5,8,9,5,0,0,1,7,35
1,6,9,2,4,5,2,4,2,34
2,4,7,7,9,1,7,0,6,41
3,9,9,7,6,9,1,0,1,42


In [16]:
df2.sum()

0          24
1          33
2          25
3          24
4          15
5          10
6           5
7          16
RowSum    152
dtype: int64

In [17]:
df2.loc["ColTotal", :] = df2.sum()
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,5.0,8.0,9.0,5.0,0.0,0.0,1.0,7.0,35.0
1,6.0,9.0,2.0,4.0,5.0,2.0,4.0,2.0,34.0
2,4.0,7.0,7.0,9.0,1.0,7.0,0.0,6.0,41.0
3,9.0,9.0,7.0,6.0,9.0,1.0,0.0,1.0,42.0
ColTotal,24.0,33.0,25.0,24.0,15.0,10.0,5.0,16.0,152.0


### 3.5. `apply`

In [3]:
df3 = pd.DataFrame({
        'A': [1, 3, 4, 3, 4],
        'B': [2, 3, 1, 2, 3],
        'C': [1, 5, 2, 4, 4]
    })
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


#### use `lambda` function

In [19]:
df3.apply(lambda x: x.max() - x.min())

A    3
B    2
C    4
dtype: int64

#### set `axis=1` to apply to rows

In [20]:
df3.apply(lambda x: x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

#### use `value_counts`

In [21]:
# Count the occurrences of each value per column; values missing from a
# column show up as NaN. (`pd.Series.value_counts` replaces the deprecated
# top-level `pd.value_counts`.)
df3.apply(pd.Series.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


#### use `fillna` to fill NaN values & use `astype` to change data type

In [22]:
# Per-column value counts, with absent values counted as 0 and the counts
# converted back to integers. (`pd.Series.value_counts` replaces the
# deprecated top-level `pd.value_counts`.)
df3.apply(pd.Series.value_counts).fillna(0).astype(int)

Unnamed: 0,A,B,C
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


### 3.6. Real values $\rightarrow$ Categorical values
#### `cut`: set threshold
  * `bins` parameter can be:
    * **int** : Defines the number of equal-width bins in the range of x 
      * The range of x is extended by .1% on each side to include the minimum and maximum values of x
    * **sequence of scalars** : Defines the bin edges allowing for non-uniform width
      * No extension of the range of x is done.
    * **IntervalIndex** : Defines the exact bins to be used.

In [5]:
ages = [0, 2, 10, 21, 23, 37, 31, 61, 20, 41, 32, 100]

In [7]:
bins = [1, 30, 60, 99]
labels = ["Young", "Middle-Age", "Old"]
cats = pd.cut(ages, bins, labels=labels)
cats

[NaN, Young, Young, Young, Young, ..., Old, Young, Middle-Age, Middle-Age, NaN]
Length: 12
Categories (3, object): [Young < Middle-Age < Old]

#### `cut` method returns `Categorical` class object

In [12]:
type(cats)

pandas.core.categorical.Categorical

In [13]:
cats.categories

Index(['Young', 'Middle-Age', 'Old'], dtype='object')

In [14]:
cats.codes

array([-1,  0,  0,  0,  0,  1,  1,  2,  0,  1,  1, -1], dtype=int8)

In [15]:
df4 = pd.DataFrame(ages, columns=["ages"])
df4["age_cat"] = pd.cut(df4.ages, bins, labels=labels)
df4

Unnamed: 0,ages,age_cat
0,0,
1,2,Young
2,10,Young
3,21,Young
4,23,Young
5,37,Middle-Age
6,31,Middle-Age
7,61,Old
8,20,Young
9,41,Middle-Age


#### `qcut`: discretize variable into equal-sized buckets

In [16]:
np.random.seed(0)  # fixed seed so that the random sample is reproducible
data = np.random.randn(1000)
# Split the sample into four buckets holding the same number of observations.
cats = pd.qcut(data, 4, labels=["Q1", "Q2", "Q3", "Q4"])
cats

[Q3, Q2, Q1, Q1, Q3, ..., Q4, Q2, Q2, Q2, Q2]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [17]:
# Count the members of each bucket via the Series method
# (the top-level `pd.value_counts` function is deprecated).
pd.Series(cats).value_counts()

Q4    250
Q3    250
Q2    250
Q1    250
dtype: int64

## 4. `index` - Operations

### 4.1. `set_index` and `reset_index`

#### `set_index`
* delete original index
* one of columns $\rightarrow$ index

In [19]:
np.random.seed(0)
# Build the numeric columns first and add the letter column afterwards.
# Stacking letters and numbers into one array would turn every number into
# a string, leaving C2-C4 as text columns.
df1 = pd.DataFrame(np.round(np.random.rand(3, 5), 2).T,
                   columns=["C2", "C3", "C4"])
df1.insert(0, "C1", list('ABCDE'))
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [20]:
df2 = df1.set_index("C1")
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


#### `reset_index`
* original index $\rightarrow$ first column
* `drop=True` to drop the original index

In [21]:
df2.reset_index()

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [5]:
df2.reset_index(drop=True)

Unnamed: 0,C2,C3,C4
0,0.55,0.65,0.79
1,0.72,0.44,0.53
2,0.6,0.89,0.57
3,0.54,0.96,0.93
4,0.42,0.38,0.07


### 4.2. Multi-index
#### Column index: assign "list of lists" to `columns`

In [22]:
np.random.seed(0)
df3 = pd.DataFrame(np.round(np.random.randn(5, 4), 2),
                   columns=[["A", "A", "B", "B"],
                            ["C1", "C2", "C1", "C2"]])
df3

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


#### use `names` attribute of `columns` object to set names

In [23]:
df3.columns.names = ["Cidx1", "Cidx2"]
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


#### Row index: assign "list of lists" to `index`
* use `names` attribute of `index` object to set names

In [24]:
np.random.seed(0)
df4 = pd.DataFrame(np.round(np.random.randn(6, 4), 2),
                   columns=[["A", "A", "B", "B"],
                            ["C", "D", "C", "D"]],
                   index=[["M", "M", "M", "F", "F", "F"],
                          ["id_" + str(i + 1) for i in range(3)] * 2])
df4.columns.names = ["Cidx1", "Cidx2"]
df4.index.names = ["Ridx1", "Ridx2"]
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


### 4.3. Column index $\leftrightarrow$ Row index
* `stack()`: Column index $\rightarrow$ Row index
* `unstack()`: Row index $\rightarrow$ Column index

In [25]:
df4.stack("Cidx1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Cidx2,C,D
Ridx1,Ridx2,Cidx1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,id_1,A,1.76,0.4
M,id_1,B,0.98,2.24
M,id_2,A,1.87,-0.98
M,id_2,B,0.95,-0.15
M,id_3,A,-0.1,0.41
M,id_3,B,0.14,1.45
F,id_1,A,0.76,0.12
F,id_1,B,0.44,0.33
F,id_2,A,1.49,-0.21
F,id_2,B,0.31,-0.85


In [27]:
df4.unstack("Ridx2")

Cidx1,A,A,A,A,A,A,B,B,B,B,B,B
Cidx2,C,C,C,D,D,D,C,C,C,D,D,D
Ridx2,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3,id_1,id_2,id_3
Ridx1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
F,0.76,1.49,-2.55,0.12,-0.21,0.65,0.44,0.31,0.86,0.33,-0.85,-0.74
M,1.76,1.87,-0.1,0.4,-0.98,0.41,0.98,0.95,0.14,2.24,-0.15,1.45


### 4.4. Indexing Multi-index
* Use `tuple`

In [29]:
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,0.98,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


In [30]:
df3[("B", "C1")]

0    0.98
1    0.95
2    0.14
3    0.44
4    0.31
Name: (B, C1), dtype: float64

In [31]:
df3.loc[0, ("B", "C1")]

0.98

In [32]:
df3.loc[0, ("B", "C1")] = 100
df3

Cidx1,A,A,B,B
Cidx2,C1,C2,C1,C2
0,1.76,0.4,100.0,2.24
1,1.87,-0.98,0.95,-0.15
2,-0.1,0.41,0.14,1.45
3,0.76,0.12,0.44,0.33
4,1.49,-0.21,0.31,-0.85


#### `iloc` indexer $\rightarrow$ tuple multi-indexing is unavailable

In [17]:
df3.iloc[0, 2]

100.0

In [19]:
df4

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


In [20]:
df4.loc[("M", "id_1"), ("A", "C")]

1.76

In [21]:
df4.loc[:, ("A", "C")]

Ridx1  Ridx2
M      id_1     1.76
       id_2     1.87
       id_3    -0.10
F      id_1     0.76
       id_2     1.49
       id_3    -2.55
Name: (A, C), dtype: float64

In [22]:
df4.loc[("M", "id_1"), :]

Cidx1  Cidx2
A      C        1.76
       D        0.40
B      C        0.98
       D        2.24
Name: (M, id_1), dtype: float64

In [23]:
# Append the column totals to a copy: the later `swaplevel` / `sort_index`
# cells still work on the original six rows of `df4`.
df4_total = df4.copy()
df4_total.loc[("All", "All"), :] = df4_total.sum()
df4_total

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74
All,All,3.23,0.39,3.68,2.28


### 4.5. `swaplevel`

* `swaplevel(i, j, axis)`
  * swap `i`th & `j`th index
  * `axis=0` for row index (default)
  * `axis=1` for column index

In [33]:
df5 = df4.swaplevel("Ridx1", "Ridx2")
df5

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,M,1.76,0.4,0.98,2.24
id_2,M,1.87,-0.98,0.95,-0.15
id_3,M,-0.1,0.41,0.14,1.45
id_1,F,0.76,0.12,0.44,0.33
id_2,F,1.49,-0.21,0.31,-0.85
id_3,F,-2.55,0.65,0.86,-0.74


In [34]:
df6 = df4.swaplevel("Cidx1", "Cidx2", 1)
df6

Unnamed: 0_level_0,Cidx2,C,D,C,D
Unnamed: 0_level_1,Cidx1,A,A,B,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.4,0.98,2.24
M,id_2,1.87,-0.98,0.95,-0.15
M,id_3,-0.1,0.41,0.14,1.45
F,id_1,0.76,0.12,0.44,0.33
F,id_2,1.49,-0.21,0.31,-0.85
F,id_3,-2.55,0.65,0.86,-0.74


### 4.6. `sort_index`
* set `axis` & `level`

In [35]:
df5.sort_index(level=0)

Unnamed: 0_level_0,Cidx1,A,A,B,B
Unnamed: 0_level_1,Cidx2,C,D,C,D
Ridx2,Ridx1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
id_1,F,0.76,0.12,0.44,0.33
id_1,M,1.76,0.4,0.98,2.24
id_2,F,1.49,-0.21,0.31,-0.85
id_2,M,1.87,-0.98,0.95,-0.15
id_3,F,-2.55,0.65,0.86,-0.74
id_3,M,-0.1,0.41,0.14,1.45


In [36]:
df6.sort_index(axis=1, level=0)

Unnamed: 0_level_0,Cidx2,C,C,D,D
Unnamed: 0_level_1,Cidx1,A,B,A,B
Ridx1,Ridx2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
M,id_1,1.76,0.98,0.4,2.24
M,id_2,1.87,0.95,-0.98,-0.15
M,id_3,-0.1,0.14,0.41,1.45
F,id_1,0.76,0.44,0.12,0.33
F,id_2,1.49,0.31,-0.21,-0.85
F,id_3,-2.55,0.86,0.65,-0.74


## 5. Read & Write Files

### 5.1. Supported File Formats
* CSV
* Excel
* HTML
* JSON
* HDF5
* SAS
* STATA
* SQL

### 5.2. Read CSV

#### Create a CSV file using `%%writefile`

In [34]:
%%writefile sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Overwriting sample1.csv


#### Read from CSV

In [35]:
# The sample file puts a space after every comma; skip it so that headers
# and string values are not read with a leading blank (" c2", " one").
pd.read_csv('sample1.csv', skipinitialspace=True)

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### Specify column index

In [36]:
%%writefile sample2.csv
1, 1.11, one
2, 2.22, two
3, 3.33, three

Overwriting sample2.csv


In [37]:
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### Choose a column & Use it as row index

In [38]:
pd.read_csv('sample1.csv', index_col='c1')

Unnamed: 0_level_0,c2,c3
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.11,one
2,2.22,two
3,3.33,three


### 5.3. Read Text
* Use `sep` parameter
* `\s+`: blank of varying length

In [39]:
%%writefile sample3.txt
c1        c2        c3        c4 
0.179181 -1.538472  1.347553  0.43381
1.024209  0.087307 -1.281997  0.49265
0.417899 -2.002308  0.255245 -1.10515

Writing sample3.txt


In [40]:
# Raw string: `\s` is a regular-expression token, not a Python string escape.
pd.read_table('sample3.txt', sep=r'\s+')

Unnamed: 0,c1,c2,c3,c4
0,0.179181,-1.538472,1.347553,0.43381
1,1.024209,0.087307,-1.281997,0.49265
2,0.417899,-2.002308,0.255245,-1.10515


#### Skip Rows

In [41]:
%%writefile sample4.txt
파일 제목: sample4.txt
데이터 포맷의 설명: 
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample4.txt


In [42]:
pd.read_csv('sample4.txt', skiprows=[0, 1])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### NaN Values

In [44]:
%%writefile sample5.csv
c1, c2, c3
1, 1.11, one
2,, two
N, 3.33, three

Writing sample5.csv


In [45]:
df = pd.read_csv('sample5.csv', na_values=['N'])
df

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


### 5.4. Write CSV or Text

In [53]:
df.to_csv('sample6.csv')

#### `sep` parameter can be customized

In [49]:
df.to_csv('sample7.txt', sep='|')

In [50]:
!cat sample7.txt

|c1| c2| c3
0|1.0|1.11| one
1|2.0|| two
2||3.33| three


#### Check the file content using `!cat` (use `!type` in Windows)

In [48]:
!cat sample6.csv

,c1, c2, c3
0,1.0,1.11, one
1,2.0,, two
2,,3.33, three


#### NaN values

In [51]:
df.to_csv('sample8.csv', na_rep='NN')

In [52]:
!cat sample8.csv

,c1, c2, c3
0,1.0,1.11, one
1,2.0,NN, two
2,NN,3.33, three


#### `index` & `header` parameters

In [19]:
df.index = ["a", "b", "c"]
df

Unnamed: 0,c1,c2,c3
a,1.0,1.11,one
b,2.0,,two
c,,3.33,three


In [20]:
df.to_csv('sample9.csv', index=False, header=False)

In [21]:
!cat sample9.csv

1.0,1.11, one
2.0,, two
,3.33, three
