In [2]:
import warnings
warnings.simplefilter('ignore')

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Table of Contents
>## 1. `Series` class
* 1.1. Generate `Series`
* 1.2. `Series` - Update
* 1.3. `Series` - Indexing
* 1.4. `Series` - Slicing
* 1.5. `Series` - Treat as `Dictionary`
* 1.6. `Series` - Operations

>## 2. `DataFrame` class
* 2.1. Generate `DataFrame`
* 2.2. `DataFrame` - Indexing
* 2.3. `DataFrame` - Update
* 2.4. `DataFrame` - Indexer

>## 3. Read & Write Files
* 3.1. Supported File Formats
* 3.2. Read CSV
* 3.3. Read Text
* 3.4. Write CSV or Text

## 1. `Series` class
* Series = value + index

### 1.1. Generate `Series`
* `pd.Series()` - generates series
* `s.index` & `s.values` - return the index labels and the underlying array of values
* `s.index.name` & `s.name` - set names
* `index`
  * default = 0, 1, ...
  * it can be string, datetime, integer, ...
  * its length should match the length of data

In [32]:
# Population per city; the city names become the index labels.
s1 = pd.Series(
    data=[9904312, 3448737, 2890451, 2466052],
    index=["Seoul", "Busan", "Incheon", "Daegu"],
)
# Show the Series, its index and its raw values, each separated by a blank line.
print("\n\n".join(str(part) for part in (s1, s1.index, s1.values)))

Seoul      9904312
Busan      3448737
Incheon    2890451
Daegu      2466052
dtype: int64

Index(['Seoul', 'Busan', 'Incheon', 'Daegu'], dtype='object')

[9904312 3448737 2890451 2466052]


In [33]:
s1.name = "Population"
s1.index.name = "City"
s1

City
Seoul      9904312
Busan      3448737
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.2. `Series` - Update

In [52]:
# Work on a copy so the original `s1` keeps its values for the sections
# below, which still read the "Busan" entry.
s1_updated = s1.copy()
# NOTE(review): 1.63 is a float assigned into an int64 Series; the recorded
# output shows it stored as 1. Other pandas versions may upcast or reject
# this assignment instead -- TODO confirm against the installed version.
s1_updated["Busan"] = 1.63
s1_updated

City
Seoul      9904312
Busan            1
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

In [54]:
# Delete from a copy: the following sections still index `s1` by "Busan",
# so removing the entry from `s1` itself breaks a top-to-bottom run.
s1_trimmed = s1.copy()
del s1_trimmed['Busan']
s1_trimmed

City
Seoul      9904312
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.3. `Series` - Indexing

In [35]:
# Three ways to read one element (only the last expression is displayed).
s1.iloc[1]    # by position; plain s1[1] on a string-labelled index is deprecated in recent pandas
s1["Busan"]   # by label
s1.Busan      # attribute access: only when the label is a valid Python identifier

3448737

In [36]:
# Select several elements at once (only the last expression is displayed).
s1.iloc[[0, 3, 1]]                # by position; s1[[0, 3, 1]] on a string-labelled index is deprecated
s1[['Seoul', 'Daegu', 'Busan']]   # the same elements by label

City
Seoul    9904312
Daegu    2466052
Busan    3448737
Name: Population, dtype: int64

In [37]:
# Boolean-mask indexing: keep the entries strictly between 2.5M and 5M.
s1[s1.gt(250e4) & s1.lt(500e4)]

City
Busan      3448737
Incheon    2890451
Name: Population, dtype: int64

### 1.4. `Series` - Slicing
* Slicing with **integer positions** $\rightarrow$ the end position is **excluded**
* Slicing with **string labels** $\rightarrow$ the end label is **included**

In [38]:
s1[1:3]

City
Busan      3448737
Incheon    2890451
Name: Population, dtype: int64

In [39]:
s1["Busan":"Daegu"]

City
Busan      3448737
Incheon    2890451
Daegu      2466052
Name: Population, dtype: int64

### 1.5. `Series` - Treat as `Dictionary`

시리즈 객체는 라벨 값에 의해 인덱싱이 가능하므로 실질적으로 라벨 값을 키(key)로 가지는 딕셔너리 자료형과 같다고 볼 수 있다. 따라서 딕셔너리 자료형에서 제공하는 ``in`` 연산도 가능하고 ``items`` 메서드를 사용하면 ``for`` 루프를 통해 각 원소의 키(key)와 값(value)을 접근할 수도 있다.

In [40]:
"Seoul" in s1

True

In [41]:
# Walk the (label, value) pairs the same way as a dict's items().
for city, population in s1.items():
    print("%s = %d" % (city, population))

Seoul = 9904312
Busan = 3448737
Incheon = 2890451
Daegu = 2466052


#### Generate `Series` from `Dictionary`

In [42]:
# The labels come from the dict keys; pass `index=` (next cell) to fix their order explicitly.
s2 = pd.Series(dict(Seoul=9631482, Busan=3393191, Incheon=2632035, Daejeon=1490158))
s2

Busan      3393191
Daejeon    1490158
Incheon    2632035
Seoul      9631482
dtype: int64

In [43]:
# Same data as before, but `index=` decides the order of the labels.
s2 = pd.Series(
    data=dict(Seoul=9631482, Busan=3393191, Incheon=2632035, Daejeon=1490158),
    index=["Busan", "Seoul", "Incheon", "Daejeon"],
)
s2

Busan      3393191
Seoul      9631482
Incheon    2632035
Daejeon    1490158
dtype: int64

### 1.6. `Series` - Operations
* Operations are applied only to the **values** (**index** is invariant)

In [34]:
s1 / 1000000

City
Seoul      9.904312
Busan      3.448737
Incheon    2.890451
Daegu      2.466052
Name: Population, dtype: float64

#### Operation between multiple `Series` $\rightarrow$ Operation applied only to the data with the same index

In [48]:
s1.values - s2.values # subtracts the raw arrays position by position, ignoring the index labels - not what we want

array([ 6511121, -6182745,   258416,   975894])

In [47]:
ds = s1 - s2
ds

Busan       55546.0
Daegu           NaN
Daejeon         NaN
Incheon    258416.0
Seoul      272830.0
dtype: float64

In [49]:
ds.notnull()

Busan       True
Daegu      False
Daejeon    False
Incheon     True
Seoul       True
dtype: bool

In [50]:
# Keep only the entries that have a value (labels present in both Series).
ds.dropna()

Busan       55546.0
Incheon    258416.0
Seoul      272830.0
dtype: float64

## 2. `DataFrame` class

### 2.1. Generate  `DataFrame`

In [4]:
# Row labels (cities) and the order in which the columns should appear.
index = ["서울", "부산", "인천", "대구"]
columns = ["지역", "2015", "2010", "2005", "2000", "2010-2015 증가율"]

# Column-oriented raw data: each key is a column, each list holds one value per city.
data = {
    "2015": [9904312, 3448737, 2890451, 2466052],
    "2010": [9631482, 3393191, 2632035, 2431774],
    "2005": [9762546, 3512547, 2517680, 2456016],
    "2000": [9853972, 3655437, 2466338, 2473990],
    "지역": ["수도권", "경상권", "수도권", "경상권"],
    "2010-2015 증가율": [0.0283, 0.0163, 0.0982, 0.0141],
}

# `columns=` picks and orders the dict keys; `index=` labels the rows.
df = pd.DataFrame(data=data, columns=columns, index=index)
df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


#### Access data

In [5]:
df.values

array([['수도권', 9904312, 9631482, 9762546, 9853972, 0.0283],
       ['경상권', 3448737, 3393191, 3512547, 3655437, 0.0163],
       ['수도권', 2890451, 2632035, 2517680, 2466338, 0.0982],
       ['경상권', 2466052, 2431774, 2456016, 2473990, 0.0141]], dtype=object)

In [6]:
df.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object')

In [7]:
df.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

#### Set names

In [8]:
df.index.name = "도시"
df.columns.name = "특성"
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


#### Transpose

In [9]:
df.T

도시,서울,부산,인천,대구
특성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2431774
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


### 2.2. `DataFrame` - Indexing

In [10]:
df["지역"] #Series

도시
서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

#### To preserve `DataFrame` datatype, use list

In [12]:
df[["지역"]] #DataFrame

특성,지역
도시,Unnamed: 1_level_1
서울,수도권
부산,경상권
인천,수도권
대구,경상권


In [11]:
df[["2010", "2015"]]

특성,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,9631482,9904312
부산,3393191,3448737
인천,2632035,2890451
대구,2431774,2466052


#### Integer Indexing - cannot be used when column index has string label

In [13]:
# This lookup is expected to fail: the column labels are strings, so the
# integer 0 is not a valid key. Catch the error and show it so that
# "Restart & Run All" does not stop at this cell.
try:
    df[0]
except KeyError as err:
    print("KeyError: %s" % err)

KeyError: 0

#### Integer Indexing - can be used when column index has integer label

In [16]:
# A 3x4 frame of 0..11 with the default integer row and column labels.
df2 = pd.DataFrame(np.reshape(np.arange(12), (3, 4)))
df2

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [17]:
df2[2]

0     2
1     6
2    10
Name: 2, dtype: int64

#### Single Data Indexing - indexing with column label & indexing with row label

In [21]:
df["2015"]["서울"]  # df["2015"] yields a Series, which is then indexed by the row label

9904312

#### Row Indexing - use slicing

In [22]:
df[:1]

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,-1.34


In [51]:
df[1:3]

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54


In [52]:
df["서울":"부산"]

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4


### 2.3. `DataFrame` - Update

In [18]:
# Convert the growth rate from a fraction to a percentage.
# NOTE: the column is overwritten in place, so running this cell again
# multiplies the values by 100 a second time.
df["2010-2015 증가율"] = df["2010-2015 증가율"] * 100
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83
부산,경상권,3448737,3393191,3512547,3655437,1.63
인천,수도권,2890451,2632035,2517680,2466338,9.82
대구,경상권,2466052,2431774,2456016,2473990,1.41


In [19]:
df["2005-2010 증가율"] = ((df["2010"] - df["2005"]) / df["2005"] * 100).round(2)
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83,-1.34
부산,경상권,3448737,3393191,3512547,3655437,1.63,-3.4
인천,수도권,2890451,2632035,2517680,2466338,9.82,4.54
대구,경상권,2466052,2431774,2456016,2473990,1.41,-0.99


In [20]:
del df["2010-2015 증가율"]
df

특성,지역,2015,2010,2005,2000,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54
대구,경상권,2466052,2431774,2456016,2473990,-0.99


### 2.4. `DataFrame` - Indexer
* Enables 2D indexing (row index, column index)
* Indexers:
  * `loc` : Label-based 2D indexing
  * `iloc` : Order-based 2D integer indexing
  * `at`: Label-based 2D indexing (single scalar value)
  * `iat` : Order-based 2D integer indexing (single scalar value)

#### `loc` indexer

In [55]:
# 3x4 frame holding 10..21, with letter labels on both axes.
df = pd.DataFrame(
    data=np.arange(10, 22).reshape(3, 4),
    index=list("abc"),
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [58]:
df.loc["a", "A"]

10

In [7]:
df.loc["a", :] #extract one row as Series
#df[:1] returns DataFrame

A    10
B    11
C    12
D    13
Name: a, dtype: int64

In [3]:
df.loc["b":, "A"] #slicing

b    14
c    18
Name: A, dtype: int64

In [5]:
df.loc[["a", "b"], ["B", "D"]]

Unnamed: 0,B,D
a,11,13
b,15,17


In [6]:
df.loc[df.A > 10, :]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [60]:
def select_rows(df): #use function
    """Return a boolean mask that is True for rows whose "A" value exceeds 12.

    The result is a Series aligned with the rows of ``df``, so it can be
    passed directly as the row selector of ``df.loc``.
    """
    return df["A"] > 12

select_rows(df)

a    False
b     True
c     True
Name: A, dtype: bool

In [61]:
df.loc[select_rows(df), ["B"]]

Unnamed: 0,B
b,15
c,19


In [62]:
# df.loc[:, df[:1] <= 11]  # Wrong: df[:1] is a one-row DataFrame, so the comparison is a DataFrame rather than one boolean per column
df.loc[:, df.loc["a", :] <= 11] # Right: df.loc["a", :] is a Series indexed by column label, giving one boolean per column

Unnamed: 0,A,B
a,10,11
b,14,15
c,18,19


In [14]:
df.loc["e"] = [90, 91, 92, 93]
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21
e,90,91,92,93


#### `loc` indexer - when indices are integers
* In such a case, the integers are treated as **labels**, not positions (so the end of a slice is **included**)

In [63]:
df2 = pd.DataFrame(np.arange(10, 26).reshape(4, 4), columns=np.arange(1, 8, 2))
df2

Unnamed: 0,1,3,5,7
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [64]:
df2.loc[1, 1]

14

In [65]:
df2.loc[1:2, :]

Unnamed: 0,1,3,5,7
1,14,15,16,17
2,18,19,20,21


#### `iloc` indexer

In [18]:
df.iloc[0, 1]

11

In [19]:
df.iloc[:2, 2]

a    12
b    16
Name: C, dtype: int64

In [20]:
df.iloc[0, -2:]

C    12
D    13
Name: a, dtype: int64

In [21]:
df.iloc[2:3, 1:3]

Unnamed: 0,B,C
c,19,20


In [22]:
df.iloc[-1]

A    90
B    91
C    92
D    93
Name: e, dtype: int64

In [23]:
# Assigning through iloc writes back into the frame: the last row is doubled in place.
# NOTE: not idempotent - every additional run of this cell doubles the row again.
df.iloc[-1] = df.iloc[-1] * 2
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21
e,180,182,184,186


#### 연습 문제 1

1. 모든 행과 열에 라벨을 가지는 5 x 5 이상의 크기를 가지는 데이터프레임을 만든다. 
2. 10가지 이상의 방법으로 특정한 행과 열을 선택한다.


#### `at`, `iat` indexer

`at`, `iat` 인덱서는 `loc`, `iloc` 인덱서와 비슷하지만 하나의 스칼라 값을 뽑을 때만 사용한다. 빠른 인덱싱 속도가 요구되는 경우에 사용한다.

In [24]:
%timeit df.loc["a", "A"]

8.06 µs ± 1.84 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [25]:
%timeit df.at["a", "A"]

5.91 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [26]:
%timeit df.iloc[0, 0]

8.91 µs ± 839 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [27]:
%timeit df.iat[0, 0]

5.27 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## 3. Read & Write Files

### 3.1. Supported File Formats
* CSV
* Excel
* HTML
* JSON
* HDF5
* SAS
* STATA
* SQL

### 3.2. Read CSV

#### Create a CSV file using `%%writefile`

In [34]:
%%writefile sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Overwriting sample1.csv


#### Read from CSV

In [35]:
pd.read_csv('sample1.csv')

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### Specify column index

In [36]:
%%writefile sample2.csv
1, 1.11, one
2, 2.22, two
3, 3.33, three

Overwriting sample2.csv


In [37]:
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### Choose a column & Use it as row index

In [38]:
pd.read_csv('sample1.csv', index_col='c1')

Unnamed: 0_level_0,c2,c3
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.11,one
2,2.22,two
3,3.33,three


### 3.3. Read Text
* Use `sep` parameter
* `\s+`: regular expression matching one or more whitespace characters (columns separated by blanks of varying length)

In [39]:
%%writefile sample3.txt
c1        c2        c3        c4 
0.179181 -1.538472  1.347553  0.43381
1.024209  0.087307 -1.281997  0.49265
0.417899 -2.002308  0.255245 -1.10515

Writing sample3.txt


In [40]:
# Raw string: "\s" is not a valid Python escape sequence, so the regular
# expression must reach pandas without escape processing.
pd.read_table('sample3.txt', sep=r'\s+')

Unnamed: 0,c1,c2,c3,c4
0,0.179181,-1.538472,1.347553,0.43381
1,1.024209,0.087307,-1.281997,0.49265
2,0.417899,-2.002308,0.255245,-1.10515


#### Skip Rows

In [41]:
%%writefile sample4.txt
파일 제목: sample4.txt
데이터 포맷의 설명: 
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample4.txt


In [42]:
pd.read_csv('sample4.txt', skiprows=[0, 1])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


#### NaN Values

In [44]:
%%writefile sample5.csv
c1, c2, c3
1, 1.11, one
2,, two
N, 3.33, three

Writing sample5.csv


In [45]:
# na_values=['N'] makes the literal "N" in column c1 parse as a missing value.
# NOTE: the file separates fields with ", ", so the header names and string
# fields after the first column keep a leading space (visible in the files
# written from this frame below); skipinitialspace=True would strip it.
df = pd.read_csv('sample5.csv', na_values=['N'])
df

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


### 3.4. Write CSV or Text

In [53]:
df.to_csv('sample6.csv')

#### `sep` parameter can be customized

In [49]:
df.to_csv('sample7.txt', sep='|')

In [50]:
!cat sample7.txt

|c1| c2| c3
0|1.0|1.11| one
1|2.0|| two
2||3.33| three


#### Check the file content using `!cat` (use `!type` in Windows)

In [48]:
!cat sample6.csv

,c1, c2, c3
0,1.0,1.11, one
1,2.0,, two
2,,3.33, three


#### NaN values

In [51]:
df.to_csv('sample8.csv', na_rep='NN')

In [52]:
!cat sample8.csv

,c1, c2, c3
0,1.0,1.11, one
1,2.0,NN, two
2,NN,3.33, three


#### `index` & `header` parameters

In [19]:
df.index = ["a", "b", "c"]
df

Unnamed: 0,c1,c2,c3
a,1.0,1.11,one
b,2.0,,two
c,,3.33,three


In [20]:
df.to_csv('sample9.csv', index=False, header=False)

In [21]:
!cat sample9.csv

1.0,1.11, one
2.0,, two
,3.33, three
