In [2]:
import numpy as np
import pandas as pd

### Series

In [2]:
# Pandas의 자료구조 중 Series
obj = pd.Series([10, 30, 20, 40])
obj

0    10
1    30
2    20
3    40
dtype: int64

In [5]:
# Series의 값만
obj.values

array([10, 30, 20, 40])

In [6]:
# Series의 인덱스만
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# 인덱스 바꾸기
obj2 = pd.Series([10, 30, 20, 40], index=['a', 'c', 'b', 'd'])
obj2

a    10
c    30
b    20
d    40
dtype: int64

In [8]:
# python의 dictionary -> Series data
# dictionary의 key -> Series data.index
sdata = {'I':1000, 'Want':3000, 'to':2000, 'go':5000, 'sleep':1500}
obj3 = pd.Series(sdata)
obj3

I        1000
Want     3000
to       2000
go       5000
sleep    1500
dtype: int64

In [9]:
obj3.name = 'Feeling'
obj3.index.name = 'Word'
obj3

Word
I        1000
Want     3000
to       2000
go       5000
sleep    1500
Name: Feeling, dtype: int64

### Data Frame

In [3]:
# Pandas의 자료구조 중 Data Frame
# Data Frame은 들어갈 데이터를 정의해야 됨
# Python의 dictionary or numpy의 array를 사용하여 정의
data = {'word': ['I', 'want', 'to', 'go', 'sleep'],
       'many': [1, 4, 2, 2, 5],
       'points': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data)
df

Unnamed: 0,word,many,points
0,I,1,1.5
1,want,4,1.7
2,to,2,3.6
3,go,2,2.4
4,sleep,5,2.9


In [4]:
# 행 방향의 index
df.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
# 열 방향의 index
df.columns

Index(['word', 'many', 'points'], dtype='object')

In [6]:
# 값 얻기
df.values

array([['I', 1, 1.5],
       ['want', 4, 1.7],
       ['to', 2, 3.6],
       ['go', 2, 2.4],
       ['sleep', 5, 2.9]], dtype=object)

In [7]:
# 각 인덱스에 대한 이름 설정
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,word,many,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,I,1,1.5
1,want,4,1.7
2,to,2,3.6
3,go,2,2.4
4,sleep,5,2.9


In [8]:
# DataFrame을 만들면서 columns, index 설정 가능
df2 = pd.DataFrame(data, columns=['word', 'many', 'points', 'penalty'],
                  index=['0', '1', '2', '3', '4'])
df2

Unnamed: 0,word,many,points,penalty
0,I,1,1.5,
1,want,4,1.7,
2,to,2,3.6,
3,go,2,2.4,
4,sleep,5,2.9,


In [9]:
# describe() 함수는 DataFrame의 다양한 계산 값 도출
df2.describe()

Unnamed: 0,many,points
count,5.0,5.0
mean,2.8,2.42
std,1.643168,0.864292
min,1.0,1.5
25%,2.0,1.7
50%,2.0,2.4
75%,4.0,2.9
max,5.0,3.6


### DataFrame Indexing

In [10]:
# data 추가
data = {"word": ["I", "want", "to", "go", "sleep"],
        "many": [1, 4, 2, 2, 5],
       "points": [3.2, 3.4, 2.1, 5.2, 3.3]}
df = pd.DataFrame(data, columns=["word", "points", "many", "nanTest"],
                 index=[0, 1, 2, 3, 4])
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,
1,want,3.4,4,
2,to,2.1,2,
3,go,5.2,2,
4,sleep,3.3,5,


In [11]:
# 열 선택, 조작
df['many']

0    1
1    4
2    2
3    2
4    5
Name: many, dtype: int64

In [12]:
# 열 선택, 조작
df.many

0    1
1    4
2    2
3    2
4    5
Name: many, dtype: int64

In [13]:
df[['word', 'many']]

Unnamed: 0,word,many
0,I,1
1,want,4
2,to,2
3,go,2
4,sleep,5


In [14]:
# 특정 열 선택하고 대입 가능
df['nanTest'] = 10

In [15]:
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,10
1,want,3.4,4,10
2,to,2.1,2,10
3,go,5.2,2,10
4,sleep,3.3,5,10


In [16]:
# 특정 열에 각각으로 대입
df['nanTest'] = [10, 30, 20, 40, 50] 

In [17]:
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,10
1,want,3.4,4,30
2,to,2.1,2,20
3,go,5.2,2,40
4,sleep,3.3,5,50


In [18]:
df['more'] = np.arange(5)

In [19]:
df

Unnamed: 0,word,points,many,nanTest,more
0,I,3.2,1,10,0
1,want,3.4,4,30,1
2,to,2.1,2,20,2
3,go,5.2,2,40,3
4,sleep,3.3,5,50,4


In [20]:
# Series도 추가 가능
add = pd.Series([1, 2, 0], index=[0, 1, 2])

In [21]:
df['more'] = add

In [22]:
df

Unnamed: 0,word,points,many,nanTest,more
0,I,3.2,1,10,1.0
1,want,3.4,4,30,2.0
2,to,2.1,2,20,0.0
3,go,5.2,2,40,
4,sleep,3.3,5,50,


In [23]:
moreAdd = pd.Series([2, 1.4, 2.3, 2, 1], index=[0, 1, 2, 3, 4])
df['more'] = moreAdd
df

Unnamed: 0,word,points,many,nanTest,more
0,I,3.2,1,10,2.0
1,want,3.4,4,30,1.4
2,to,2.1,2,20,2.3
3,go,5.2,2,40,2.0
4,sleep,3.3,5,50,1.0


In [24]:
# 새로운 열을 만들 때 다른 열과 관계를 만들 수 있음
df['new_point'] = df['points'] - df['more']
df['high_point'] = df['new_point'] > df['more']
df

Unnamed: 0,word,points,many,nanTest,more,new_point,high_point
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False
3,go,5.2,2,40,2.0,3.2,True
4,sleep,3.3,5,50,1.0,2.3,True


In [43]:
#열 삭제
del df['nanTest']
df

Unnamed: 0,word,points,many,more
0,I,3.2,1,2.0
1,want,3.4,4,1.4
2,to,2.1,2,2.3
3,go,5.2,2,2.0
4,sleep,3.3,5,1.0


In [26]:
df.columns

Index(['word', 'points', 'many', 'nanTest', 'more', 'new_point', 'high_point'], dtype='object')

In [27]:
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False
3,go,5.2,2,40,2.0,3.2,True
4,sleep,3.3,5,50,1.0,2.3,True


### DataFrame에서 행 선택, 조작

In [28]:
# [::] 슬라이싱 사용
df[0:3]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False


In [29]:
# 하지만 loc, iloc 함수를 더 사용
df.loc[3] # 반환 형태는 Series

Info
word            go
points         5.2
many             2
nanTest         40
more           2.0
new_point      3.2
high_point    True
Name: 3, dtype: object

In [30]:
df.loc[:3]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False
3,go,5.2,2,40,2.0,3.2,True


In [31]:
df.loc[:3,'points']

Num
0    3.2
1    3.4
2    2.1
3    5.2
Name: points, dtype: float64

In [32]:
df.loc[:,'word']

Num
0        I
1     want
2       to
3       go
4    sleep
Name: word, dtype: object

In [33]:
df.loc[:,['word', 'many']] #전체 열과 두 행 출력

Info,word,many
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I,1
1,want,4
2,to,2
3,go,2
4,sleep,5


In [34]:
df.loc[1:4, "word":"more"]

Info,word,points,many,nanTest,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,want,3.4,4,30,1.4
2,to,2.1,2,20,2.3
3,go,5.2,2,40,2.0
4,sleep,3.3,5,50,1.0


In [None]:
# 새로운 행 삽입
df.loc['5',:] = ['!!', 4.2, 3, 3.2]
df

In [37]:
# index 번호를 사용하는 ilc 함수
df.iloc[3]

Info
word            go
points         5.2
many           2.0
nanTest       40.0
more           2.0
new_point      3.2
high_point    True
Name: 3, dtype: object

In [38]:
df.iloc[[0, 3, 5], [1, 2, 0]]

Info,points,many,word
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.2,1.0,I
3,5.2,2.0,go
5,,,


In [39]:
df.iloc[:, [2, 3]]

Info,many,nanTest
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,10.0
1,4.0,30.0
2,2.0,20.0
3,2.0,40.0
4,5.0,50.0
5,,


In [40]:
df.iloc[1, 3]

30.0

### DataFrame의 Boolean Indexing

In [41]:
df

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1.0,10.0,2.0,1.2,False
1,want,3.4,4.0,30.0,1.4,2.0,True
2,to,2.1,2.0,20.0,2.3,-0.2,False
3,go,5.2,2.0,40.0,2.0,3.2,True
4,sleep,3.3,5.0,50.0,1.0,2.3,True
5,,,,,,,


In [42]:
# nanTest와 비교하여 boolean data
df['nanTest'] > 35

Num
0    False
1    False
2    False
3     True
4     True
5    False
Name: nanTest, dtype: bool

In [43]:
# nanTest와 비교하여 True인 모든 행의 값
df.loc[df['nanTest'] > 35, :]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,go,5.2,2.0,40.0,2.0,3.2,True
4,sleep,3.3,5.0,50.0,1.0,2.3,True


In [45]:
df.loc[df['many'] == 2, :]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,to,2.1,2.0,20.0,2.3,-0.2,False
3,go,5.2,2.0,40.0,2.0,3.2,True


In [46]:
# 논리 연산도 가능
df.loc[(df['many'] > 3) & (df['points'] < 3), :]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [47]:
# 새로운 값 대입
df.loc[df['word'] == 'sleep', 'word'] = 'home'

In [48]:
df

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1.0,10.0,2.0,1.2,False
1,want,3.4,4.0,30.0,1.4,2.0,True
2,to,2.1,2.0,20.0,2.3,-0.2,False
3,go,5.2,2.0,40.0,2.0,3.2,True
4,home,3.3,5.0,50.0,1.0,2.3,True
5,,,,,,,


### Data

In [49]:
# DataFrame을 만들 때 index, column을 설정하지 않을 때 0부터 시작하는 정수형 숫자로 입력
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,1.540378,2.107857,0.015991,0.266569
1,-0.350205,0.011101,-0.615892,-0.978691
2,-1.5257,0.201026,-0.59293,-0.066056
3,1.916667,2.284871,0.104085,1.146015
4,0.071847,-1.286016,-1.612054,-0.506494
5,-0.387793,1.197102,0.784722,0.292135


In [50]:
df.columns = ['A', 'B', 'C', 'D']
df.index = pd.date_range('20211124', periods=6)
# pandas의 함수 date_range는 datetime 자료형으로 구성된 날짜, 시각 등을 알 수 있는 자료형을 만드는 함수
df.index

DatetimeIndex(['2021-11-24', '2021-11-25', '2021-11-26', '2021-11-27',
               '2021-11-28', '2021-11-29'],
              dtype='datetime64[ns]', freq='D')

In [51]:
df

Unnamed: 0,A,B,C,D
2021-11-24,1.540378,2.107857,0.015991,0.266569
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056
2021-11-27,1.916667,2.284871,0.104085,1.146015
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494
2021-11-29,-0.387793,1.197102,0.784722,0.292135


In [52]:
# np.nan은 NaN값을 의미
df['F'] = [1.0, np.nan, 3.5, 4.2, np.nan, 2.8]
df

Unnamed: 0,A,B,C,D,F
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494,
2021-11-29,-0.387793,1.197102,0.784722,0.292135,2.8
