# Pandas

https://pandas.pydata.org

- Pandas는 파이썬을 이용한 데이터 분석을 수행하는 데 사용되는 가장 대표적인 라이브러리이다.
- R의 DataFrame을 파이썬 버전으로 만든 라이브러리라 생각해도 된다.
- Pandas는 NumPy를 기반으로 작성된 라이브러리이다.
- Pandas는 NumPy의 ndarray객체를 이용한 배열처럼 동일 자료형만으로 1/2차원 배열을 생성하는 것이 아니고 다양한 자료형을 저장할 수 있는 1/2차원 배열을 제공한다.
- Pandas에서 1차원 배열은 Series 객체를 이용하여 생성하고, 2차원 배열은 DataFrame을 이용하여 생성한다.

#### Pandas import

In [1]:
import numpy as np
import pandas as pd

### Series 객체   

- 1차원 형태의 배열을 생성하는 객체

#### Series 객체 생성

In [69]:
s = pd.Series( [ 10, 20, 30, 40, 50 ] )
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

#### Series 객체 기본 속성

In [70]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [71]:
s.values

array([10, 20, 30, 40, 50], dtype=int64)

In [72]:
s = pd.Series( [ 'a', 'b', 'c', 1, 2, 3 ] )
s

0    a
1    b
2    c
3    1
4    2
5    3
dtype: object

In [73]:
s.index

RangeIndex(start=0, stop=6, step=1)

In [74]:
s.values

array(['a', 'b', 'c', 1, 2, 3], dtype=object)

In [80]:
value = [ 1, 2, 3, 4, 5 ]
index = [ '짱', '둘짱', '하나졸개', '둘졸개', '셋졸개' ]
s = pd.Series( value, index = index )
s

짱       1
둘짱      2
하나졸개    3
둘졸개     4
셋졸개     5
dtype: int64

In [81]:
s.index

Index(['짱', '둘짱', '하나졸개', '둘졸개', '셋졸개'], dtype='object')

In [82]:
s.values

array([1, 2, 3, 4, 5], dtype=int64)

In [88]:
d = { '국어': 50, '영어': 50, '수학': 50 }
s = pd.Series( d )
s

국어    50
영어    50
수학    50
dtype: int64

In [89]:
s.index

Index(['국어', '영어', '수학'], dtype='object')

In [90]:
s.values

array([50, 50, 50], dtype=int64)

In [91]:
s = pd.Series( [ np.nan, 10, 30 ] )
s

0     NaN
1    10.0
2    30.0
dtype: float64

In [92]:
# Show the default RangeIndex of the Series (stray space in the attribute access removed).
s.index

RangeIndex(start=0, stop=3, step=1)

In [93]:
s.values

array([nan, 10., 30.])

In [94]:
index_date = [ '2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04' ]
values = [ 200, 195, np.nan, 205 ]
s = pd.Series( values, index = index_date )
s

2020-06-01    200.0
2020-06-02    195.0
2020-06-03      NaN
2020-06-04    205.0
dtype: float64

In [95]:
s.index

Index(['2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04'], dtype='object')

In [96]:
s.values

array([200., 195.,  nan, 205.])

#### 날짜 자동 생성

In [98]:
pd.date_range( start = '2020-05-12', end = '2020-08-03' )

DatetimeIndex(['2020-05-12', '2020-05-13', '2020-05-14', '2020-05-15',
               '2020-05-16', '2020-05-17', '2020-05-18', '2020-05-19',
               '2020-05-20', '2020-05-21', '2020-05-22', '2020-05-23',
               '2020-05-24', '2020-05-25', '2020-05-26', '2020-05-27',
               '2020-05-28', '2020-05-29', '2020-05-30', '2020-05-31',
               '2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04',
               '2020-06-05', '2020-06-06', '2020-06-07', '2020-06-08',
               '2020-06-09', '2020-06-10', '2020-06-11', '2020-06-12',
               '2020-06-13', '2020-06-14', '2020-06-15', '2020-06-16',
               '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20',
               '2020-06-21', '2020-06-22', '2020-06-23', '2020-06-24',
               '2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28',
               '2020-06-29', '2020-06-30', '2020-07-01', '2020-07-02',
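               '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-06',
               '2020-07-07', '2020-07-08', '2020-07-09', '2020-07-10',
               '2020-07-11', '2020-07-12', '2020-07-13', '2020-07-14',
               '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18',
               '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22',
               '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26',
               '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30',
               '2020-07-31', '2020-08-01', '2020-08-02', '2020-08-03'],
              dtype='datetime64[ns]', freq='D')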

In [99]:
pd.date_range( start = '2020/05/12', end = '2020.08.03' )

DatetimeIndex(['2020-05-12', '2020-05-13', '2020-05-14', '2020-05-15',
               '2020-05-16', '2020-05-17', '2020-05-18', '2020-05-19',
               '2020-05-20', '2020-05-21', '2020-05-22', '2020-05-23',
               '2020-05-24', '2020-05-25', '2020-05-26', '2020-05-27',
               '2020-05-28', '2020-05-29', '2020-05-30', '2020-05-31',
               '2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04',
               '2020-06-05', '2020-06-06', '2020-06-07', '2020-06-08',
               '2020-06-09', '2020-06-10', '2020-06-11', '2020-06-12',
               '2020-06-13', '2020-06-14', '2020-06-15', '2020-06-16',
               '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20',
               '2020-06-21', '2020-06-22', '2020-06-23', '2020-06-24',
               '2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28',
               '2020-06-29', '2020-06-30', '2020-07-01', '2020-07-02',
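               '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-06',
               '2020-07-07', '2020-07-08', '2020-07-09', '2020-07-10',
               '2020-07-11', '2020-07-12', '2020-07-13', '2020-07-14',
               '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18',
               '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22',
               '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26',
               '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30',
               '2020-07-31', '2020-08-01', '2020-08-02', '2020-08-03'],
              dtype='datetime64[ns]', freq='D')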

In [100]:
pd.date_range( start = '05-12-2020', end = '08/03/2020' )

DatetimeIndex(['2020-05-12', '2020-05-13', '2020-05-14', '2020-05-15',
               '2020-05-16', '2020-05-17', '2020-05-18', '2020-05-19',
               '2020-05-20', '2020-05-21', '2020-05-22', '2020-05-23',
               '2020-05-24', '2020-05-25', '2020-05-26', '2020-05-27',
               '2020-05-28', '2020-05-29', '2020-05-30', '2020-05-31',
               '2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04',
               '2020-06-05', '2020-06-06', '2020-06-07', '2020-06-08',
               '2020-06-09', '2020-06-10', '2020-06-11', '2020-06-12',
               '2020-06-13', '2020-06-14', '2020-06-15', '2020-06-16',
               '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20',
               '2020-06-21', '2020-06-22', '2020-06-23', '2020-06-24',
               '2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28',
               '2020-06-29', '2020-06-30', '2020-07-01', '2020-07-02',
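               '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-06',
               '2020-07-07', '2020-07-08', '2020-07-09', '2020-07-10',
               '2020-07-11', '2020-07-12', '2020-07-13', '2020-07-14',
               '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18',
               '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22',
               '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26',
               '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30',
               '2020-07-31', '2020-08-01', '2020-08-02', '2020-08-03'],
              dtype='datetime64[ns]', freq='D')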

In [101]:
pd.date_range( start = '2020-05-12', end = '08.03.2020' )

DatetimeIndex(['2020-05-12', '2020-05-13', '2020-05-14', '2020-05-15',
               '2020-05-16', '2020-05-17', '2020-05-18', '2020-05-19',
               '2020-05-20', '2020-05-21', '2020-05-22', '2020-05-23',
               '2020-05-24', '2020-05-25', '2020-05-26', '2020-05-27',
               '2020-05-28', '2020-05-29', '2020-05-30', '2020-05-31',
               '2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04',
               '2020-06-05', '2020-06-06', '2020-06-07', '2020-06-08',
               '2020-06-09', '2020-06-10', '2020-06-11', '2020-06-12',
               '2020-06-13', '2020-06-14', '2020-06-15', '2020-06-16',
               '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20',
               '2020-06-21', '2020-06-22', '2020-06-23', '2020-06-24',
               '2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28',
               '2020-06-29', '2020-06-30', '2020-07-01', '2020-07-02',
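               '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-06',
               '2020-07-07', '2020-07-08', '2020-07-09', '2020-07-10',
               '2020-07-11', '2020-07-12', '2020-07-13', '2020-07-14',
               '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18',
               '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22',
               '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26',
               '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30',
               '2020-07-31', '2020-08-01', '2020-08-02', '2020-08-03'],
              dtype='datetime64[ns]', freq='D')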

In [102]:
pd.date_range( start = '2020-06-15', periods = 7 )

DatetimeIndex(['2020-06-15', '2020-06-16', '2020-06-17', '2020-06-18',
               '2020-06-19', '2020-06-20', '2020-06-21'],
              dtype='datetime64[ns]', freq='D')

In [103]:
pd.date_range( start = '2020-06-15', periods = 4, freq = '2D' )

DatetimeIndex(['2020-06-15', '2020-06-17', '2020-06-19', '2020-06-21'], dtype='datetime64[ns]', freq='2D')

In [104]:
pd.date_range( start = '2020-06-14', periods = 4, freq = 'W' )

DatetimeIndex(['2020-06-14', '2020-06-21', '2020-06-28', '2020-07-05'], dtype='datetime64[ns]', freq='W-SUN')

In [105]:
# Twelve dates stepping by two business-month-ends ('2BM'), starting from 2020-01-01.
# NOTE(review): newer pandas releases (2.2+) reportedly deprecate the 'BM' alias in favor of 'BME' — confirm against the installed version.
pd.date_range( start = '2020-01-01', periods = 12, freq = '2BM' )

DatetimeIndex(['2020-01-31', '2020-03-31', '2020-05-29', '2020-07-31',
               '2020-09-30', '2020-11-30', '2021-01-29', '2021-03-31',
               '2021-05-31', '2021-07-30', '2021-09-30', '2021-11-30'],
              dtype='datetime64[ns]', freq='2BM')

In [106]:
pd.date_range( start = '2020-01-01', periods = 4, freq = 'QS' )

DatetimeIndex(['2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01'], dtype='datetime64[ns]', freq='QS-JAN')

In [107]:
# Three year-start dates ('AS'); the first one falls on the first January 1st after the start date.
# NOTE(review): newer pandas releases (2.2+) reportedly deprecate the 'AS' alias in favor of 'YS' — confirm against the installed version.
pd.date_range( start = '2020-06-15', periods = 3, freq = 'AS' )

DatetimeIndex(['2021-01-01', '2022-01-01', '2023-01-01'], dtype='datetime64[ns]', freq='AS-JAN')

In [108]:
# Ten hourly timestamps ('H') starting at 08:00.
# NOTE(review): newer pandas releases (2.2+) reportedly deprecate the upper-case 'H' alias in favor of 'h' — confirm against the installed version.
pd.date_range( start = '2020-01-01 08:00', periods = 10, freq = 'H' )

DatetimeIndex(['2020-01-01 08:00:00', '2020-01-01 09:00:00',
               '2020-01-01 10:00:00', '2020-01-01 11:00:00',
               '2020-01-01 12:00:00', '2020-01-01 13:00:00',
               '2020-01-01 14:00:00', '2020-01-01 15:00:00',
               '2020-01-01 16:00:00', '2020-01-01 17:00:00'],
              dtype='datetime64[ns]', freq='H')

In [109]:
# Ten business-hour timestamps ('BH'); the result starts at 09:00 and rolls over to the next day after 16:00.
# NOTE(review): newer pandas releases (2.2+) reportedly deprecate the 'BH' alias in favor of 'bh' — confirm against the installed version.
pd.date_range( start = '2020-01-01 08:00', periods = 10, freq = 'BH' )

DatetimeIndex(['2020-01-01 09:00:00', '2020-01-01 10:00:00',
               '2020-01-01 11:00:00', '2020-01-01 12:00:00',
               '2020-01-01 13:00:00', '2020-01-01 14:00:00',
               '2020-01-01 15:00:00', '2020-01-01 16:00:00',
               '2020-01-02 09:00:00', '2020-01-02 10:00:00'],
              dtype='datetime64[ns]', freq='BH')

In [110]:
pd.date_range( start = '2020-01-01 10:00', periods = 4, freq = '30min' )

DatetimeIndex(['2020-01-01 10:00:00', '2020-01-01 10:30:00',
               '2020-01-01 11:00:00', '2020-01-01 11:30:00'],
              dtype='datetime64[ns]', freq='30T')

In [111]:
# Four timestamps 30 minutes apart; 'T' is the minute alias.
# NOTE(review): newer pandas releases (2.2+) reportedly deprecate the 'T' alias in favor of 'min' — confirm against the installed version.
pd.date_range( start = '2020-01-01 08:00', periods = 4, freq = '30T' )

DatetimeIndex(['2020-01-01 08:00:00', '2020-01-01 08:30:00',
               '2020-01-01 09:00:00', '2020-01-01 09:30:00'],
              dtype='datetime64[ns]', freq='30T')

In [112]:
# Four timestamps 10 seconds apart ('10S').
# NOTE(review): newer pandas releases (2.2+) reportedly deprecate the upper-case 'S' alias in favor of 's' — confirm against the installed version.
pd.date_range( start = '2020-01-01 08:00:00', periods = 4, freq = '10S' )

DatetimeIndex(['2020-01-01 08:00:00', '2020-01-01 08:00:10',
               '2020-01-01 08:00:20', '2020-01-01 08:00:30'],
              dtype='datetime64[ns]', freq='10S')

In [113]:
index_date = pd.date_range( start = '2020-06-15', periods = 5, freq = 'D' )
s = pd.Series( np.array( [ 1, 1, 1, 2, 1 ] ), index = index_date )
s

2020-06-15    1
2020-06-16    1
2020-06-17    1
2020-06-18    2
2020-06-19    1
Freq: D, dtype: int32

In [114]:
s.index

DatetimeIndex(['2020-06-15', '2020-06-16', '2020-06-17', '2020-06-18',
               '2020-06-19'],
              dtype='datetime64[ns]', freq='D')

In [115]:
s.values

array([1, 1, 1, 2, 1])

### Series 객체 요소 지정

In [116]:
# Six integers labelled 'a'..'f'; this Series is reused by the element-selection cells below.
indexs = list( 'abcdef' )
values = [ 10, 20, 30, 40, 50, 60 ]
s = pd.Series( data = values, index = indexs )
s

a    10
b    20
c    30
d    40
e    50
f    60
dtype: int64

In [117]:
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [118]:
s.values

array([10, 20, 30, 40, 50, 60], dtype=int64)

In [119]:
# Select the first element by position. .iloc is explicit about positional
# access; plain s[ 0 ] on a string-labelled Series relies on a deprecated
# fallback that treats the integer key as a position.
s.iloc[ 0 ]

10

In [120]:
s[ 'a' ]

10

In [121]:
# Select the first and third elements by position. .iloc keeps the integers
# from being interpreted as labels (plain s[ [ 0, 2 ] ] on a string-labelled
# Series relies on a deprecated positional fallback).
s.iloc[ [ 0, 2 ] ]

a    10
c    30
dtype: int64

In [122]:
s[ [ 'a', 'c' ] ]

a    10
c    30
dtype: int64

In [123]:
s[ 1:3 ]

b    20
c    30
dtype: int64

In [124]:
s[ 'b':'d' ]

b    20
c    30
d    40
dtype: int64

## DataFrame   

- DataFrame은 2차원 배열 구조이다.
- DataFrame은 Series 객체가 하나의 열을 구성하는 형태이다.
- R의 DataFrame과 같은 구조를 갖는다.

### DataFrame 생성

In [126]:
d = pd.DataFrame( [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 7, 8, 9 ] ] )
d

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [127]:
d.index

RangeIndex(start=0, stop=3, step=1)

In [128]:
d.columns

RangeIndex(start=0, stop=3, step=1)

In [129]:
d.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

In [130]:
values = np.array( [ [ 10, 20, 30 ], [ 40, 50, 60 ], [ 70, 80, 90 ] ] )
columns = [ 'A', 'B', 'C' ]
d = pd.DataFrame( values, columns = columns )
d

Unnamed: 0,A,B,C
0,10,20,30
1,40,50,60
2,70,80,90


In [132]:
d.index

RangeIndex(start=0, stop=3, step=1)

In [131]:
d.columns

Index(['A', 'B', 'C'], dtype='object')

In [133]:
d.values

array([[10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [134]:
d_list = { '이름' : [ 'hong', 'kim', 'lee' ],
           '국어' : [ 50, 90, 70 ],
           '영어' : [ 50, 90, 70 ],
           '수학' : [ 50, 90, 70 ] }
d = pd.DataFrame( d_list )
d

Unnamed: 0,이름,국어,영어,수학
0,hong,50,50,50
1,kim,90,90,90
2,lee,70,70,70


In [135]:
d.index

RangeIndex(start=0, stop=3, step=1)

In [136]:
d.columns

Index(['이름', '국어', '영어', '수학'], dtype='object')

In [137]:
d.values

array([['hong', 50, 50, 50],
       ['kim', 90, 90, 90],
       ['lee', 70, 70, 70]], dtype=object)

#### Series 객체를 이용한 DataFrame 생성

In [5]:
# Daily prices for three fruits. Every Series carries the same date labels,
# and each Series name becomes a row label of the resulting DataFrame.
price_dates = ['2020-03-03', '2020-03-04', '2020-03-05']
s1 = pd.Series([4000, 20000, 10000], index=price_dates, name='apple')
s2 = pd.Series([5000, 8000, 11000], index=price_dates, name='bannana')
s3 = pd.Series([6000, 9000, 12000], index=price_dates, name='cherry')
df = pd.DataFrame([s1, s2, s3])
df

Unnamed: 0,2020-03-03,2020-03-04,2020-03-05
apple,4000,20000,10000
bannana,5000,8000,11000
cherry,6000,9000,12000


In [9]:
# One Series per student, all labelled with the same field names. Stacking
# them gives one row per student; the index argument labels the rows 1..3.
fields = [ 'name', 'kor', 'eng', 'math' ]
hong = pd.Series( [ 'hong', 50, 50, 50 ], index = fields )
kim = pd.Series( [ 'kim', 90, 90, 90 ], index = fields )
lee = pd.Series( [ 'lee', 70, 70, 70 ], index = fields )
students = [ hong, kim, lee ]
d = pd.DataFrame( students, index = [ 1, 2, 3 ] )
d

Unnamed: 0,name,kor,eng,math
1,hong,50,50,50
2,kim,90,90,90
3,lee,70,70,70


In [2]:
data = np.array( [ [ 10, 20, 30, 40 ], [ 100, 200, 300, 400 ] ] )
index_list = pd.date_range( '2020-06-15', periods = 2 )
col_list = [ 'A', 'B', 'C', 'D' ]
d = pd.DataFrame( data, index = index_list, columns = col_list )
d

Unnamed: 0,A,B,C,D
2020-06-15,10,20,30,40
2020-06-16,100,200,300,400


In [3]:
d.index

DatetimeIndex(['2020-06-15', '2020-06-16'], dtype='datetime64[ns]', freq='D')

In [4]:
d.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [5]:
d.values

array([[ 10,  20,  30,  40],
       [100, 200, 300, 400]])

#### DataFrame의 행 인덱스 또는 열 이름 일부 변경   

- rename() 메서드 사용
- 원본 객체 변경시에는 inplace = True 옵션 사용

In [48]:
df = pd.DataFrame( [ [ 50, 50, 50 ], [ 90, 90, 90 ], [ 70, 70, 70 ] ],
                   index = [ 'hong', 'kim', 'lee' ],
                   columns = [ 'kor', 'eng', 'sci' ] )
df

Unnamed: 0,kor,eng,sci
hong,50,50,50
kim,90,90,90
lee,70,70,70


In [49]:
# Work on an independent copy: df[ : ] merely slices df and may still share
# its underlying data, whereas copy() guarantees df itself is left untouched.
df2 = df.copy()
# Rename all three subject columns in place on the copy.
df2.rename( columns = { 'kor': 'subject1', 'eng': 'subject2', 'sci': 'subject3' }, inplace = True )
df2

Unnamed: 0,subject1,subject2,subject3
hong,50,50,50
kim,90,90,90
lee,70,70,70


#### 행/열 삭제    

- DataFrame의 행/열 삭제는 drop() 메서드 사용
- 행을 삭제할 때는 축( axis ) 옵션으로 axis = 0을 입력하거나, 별도로 입력하지 않는다.
- 축( axis ) 옵션으로 axis = 1은 열을 삭제
- 동시에 여러 개의 행 또는 열을 삭제하려면 리스트 형태로 입력
- 원본 객체 변경시에는 inplace = True 옵션 사용

In [50]:
df2.drop( 'lee', inplace = True )
df2

Unnamed: 0,subject1,subject2,subject3
hong,50,50,50
kim,90,90,90


In [51]:
df2.drop( 'subject3', axis = 1, inplace = True )
df2

Unnamed: 0,subject1,subject2
hong,50,50
kim,90,90


#### 행 선택   

- DataFrame의 행 데이터를 선택하기 위해서는 loc와 iloc 인덱서를 사용
- 인덱스 이름을 기준으로 행을 선택할 때는 loc사용
- 정수형 위치 인덱스를 사용할 때는 iloc사용

In [52]:
# Start again from an independent copy of df (df[ : ] is a slice, not a copy).
df2 = df.copy()
df2

Unnamed: 0,kor,eng,sci
hong,50,50,50
kim,90,90,90
lee,70,70,70


In [53]:
df2.iloc[ 0 ]

kor    50
eng    50
sci    50
Name: hong, dtype: int64

In [54]:
df2.loc[ 'hong' ]

kor    50
eng    50
sci    50
Name: hong, dtype: int64

In [29]:
df2.iloc[ [ 0, 2 ] ]

Unnamed: 0,kor,eng,sci
hong,50,50,50
lee,70,70,70


In [55]:
df2.loc[ [ 'hong', 'lee' ] ]

Unnamed: 0,kor,eng,sci
hong,50,50,50
lee,70,70,70


In [56]:
df2.iloc[ 0:2 ]

Unnamed: 0,kor,eng,sci
hong,50,50,50
kim,90,90,90


In [57]:
df2.loc[ 'hong':'kim' ]

Unnamed: 0,kor,eng,sci
hong,50,50,50
kim,90,90,90


#### 열 선택   

- DataFrame의 열 데이터를 한 개만 선택할 때는, []안에 열이름을 '/"와 함께 입력하거나 도트( . ) 다음에 열 이름을 입력하는 두 가지 방식이 있다.
- 도트( . )를 사용하는 방식은 열 이름이 문자열일 때만 사용할 수 있다. (열 이름에 공백이 없고 파이썬 식별자로 쓸 수 있어야 하며, DataFrame의 기존 속성·메서드 이름과 겹치지 않아야 한다.)
- 열 한 개만 선택하면 시리즈 객체가 반환된다.
- []안에 열 이름의 리스트를 입력하면 리스트의 원소인 열을 모두 선택하여 DataFrame으로 반환한다.

In [62]:
df2[ 'kor' ] # Series 출력

hong    50
kim     90
lee     70
Name: kor, dtype: int64

In [63]:
df2.kor # Series 출력

hong    50
kim     90
lee     70
Name: kor, dtype: int64

In [64]:
df2[ [ 'kor', 'sci' ] ] # DataFrame 출력

Unnamed: 0,kor,sci
hong,50,50
kim,90,90
lee,70,70


In [65]:
df2[ [ 'sci' ] ] # DataFrame 출력

Unnamed: 0,sci
hong,50
kim,90
lee,70
