In [2]:
import numpy as np
import pandas as pd

### Series

In [2]:
# Series: one of pandas' core data structures (a 1-D labeled array).
# No index is passed, so the default integer index is used.
obj = pd.Series(data=[10, 30, 20, 40])
obj

0    10
1    30
2    20
3    40
dtype: int64

In [5]:
# Only the values of the Series (shown as a NumPy array)
obj.values

array([10, 30, 20, 40])

In [6]:
# Only the index of the Series (a default RangeIndex here)
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Supply custom index labels instead of the default 0..n-1 integers.
obj2 = pd.Series(data=[10, 30, 20, 40], index=list('acbd'))
obj2

a    10
c    30
b    20
d    40
dtype: int64

In [8]:
# A Python dict converts directly into a Series:
# the dict keys become the Series index, the dict values become the data.
sdata = dict(I=1000, Want=3000, to=2000, go=5000, sleep=1500)
obj3 = pd.Series(data=sdata)
obj3

I        1000
Want     3000
to       2000
go       5000
sleep    1500
dtype: int64

In [9]:
# Name the Series itself and its index; both names appear when it is displayed
obj3.name = 'Feeling'
obj3.index.name = 'Word'
obj3

Word
I        1000
Want     3000
to       2000
go       5000
sleep    1500
Name: Feeling, dtype: int64

### DataFrame

In [3]:
# DataFrame: pandas' table-like data structure.
# The data it holds has to be supplied when it is created, typically as a
# Python dict (one key per column) or a NumPy array.
data = dict(
    word=['I', 'want', 'to', 'go', 'sleep'],
    many=[1, 4, 2, 2, 5],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
df = pd.DataFrame(data)
df

Unnamed: 0,word,many,points
0,I,1,1.5
1,want,4,1.7
2,to,2,3.6
3,go,2,2.4
4,sleep,5,2.9


In [4]:
# Row index (labels along the row direction)
df.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
# Column index (labels along the column direction)
df.columns

Index(['word', 'many', 'points'], dtype='object')

In [6]:
# Get the underlying values as an array
# (dtype=object here, since the columns hold str, int and float values)
df.values

array([['I', 1, 1.5],
       ['want', 4, 1.7],
       ['to', 2, 3.6],
       ['go', 2, 2.4],
       ['sleep', 5, 2.9]], dtype=object)

In [7]:
# Give a name to each axis index: 'Num' for the rows, 'Info' for the columns
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,word,many,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,I,1,1.5
1,want,4,1.7
2,to,2,3.6
3,go,2,2.4
4,sleep,5,2.9


In [8]:
# Columns and index can be specified while constructing the DataFrame.
# 'penalty' is not a key of `data`, so that column comes out empty (NaN).
df2 = pd.DataFrame(
    data,
    index=[str(i) for i in range(5)],
    columns=['word', 'many', 'points', 'penalty'],
)
df2

Unnamed: 0,word,many,points,penalty
0,I,1,1.5,
1,want,4,1.7,
2,to,2,3.6,
3,go,2,2.4,
4,sleep,5,2.9,


In [9]:
# describe() computes summary statistics (count, mean, std, min, quartiles, max);
# in the output below only the numeric columns 'many' and 'points' are summarized
df2.describe()

Unnamed: 0,many,points
count,5.0,5.0
mean,2.8,2.42
std,1.643168,0.864292
min,1.0,1.5
25%,2.0,1.7
50%,2.0,2.4
75%,4.0,2.9
max,5.0,3.6


### DataFrame Indexing

In [11]:
# Build a fresh dataset for the indexing examples.
# 'nanTest' has no matching key in `data`, so it starts out as all-NaN.
data = dict(
    word=["I", "want", "to", "go", "sleep"],
    many=[1, 4, 2, 2, 5],
    points=[3.2, 3.4, 2.1, 5.2, 3.3],
)
df = pd.DataFrame(
    data,
    index=list(range(5)),
    columns=["word", "points", "many", "nanTest"],
)
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,
1,want,3.4,4,
2,to,2.1,2,
3,go,5.2,2,
4,sleep,3.3,5,


In [13]:
# Select a single column with bracket notation; the result is a Series
df['many']

0    1
1    4
2    2
3    2
4    5
Name: many, dtype: int64

In [15]:
# Select the same column via attribute access
df.many

0    1
1    4
2    2
3    2
4    5
Name: many, dtype: int64

In [17]:
# Select several columns at once by passing a list of column names
df[['word', 'many']]

Unnamed: 0,word,many
0,I,1
1,want,4
2,to,2
3,go,2
4,sleep,5


In [18]:
# A selected column can be assigned to; a scalar is applied to every row
df['nanTest'] = 10

In [19]:
# Show the result: every row of 'nanTest' is now 10
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,10
1,want,3.4,4,10
2,to,2.1,2,10
3,go,5.2,2,10
4,sleep,3.3,5,10


In [20]:
# Assign a different value to each row of the column
df['nanTest'] = [10, 30, 20, 40, 50]

In [21]:
# Show the result: 'nanTest' now holds the list values in row order
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,10
1,want,3.4,4,30
2,to,2.1,2,20
3,go,5.2,2,40
4,sleep,3.3,5,50


In [30]:
# Add a new column 'more' from a NumPy array (0 through 4)
df['more'] = np.arange(5)

In [23]:
# Show the frame with the new 'more' column
df

Unnamed: 0,word,points,many,nanTest,more
0,I,3.2,1,10,0
1,want,3.4,4,30,1
2,to,2.1,2,20,2
3,go,5.2,2,40,3
4,sleep,3.3,5,50,4


In [24]:
# A Series can be used as column data too.
# This one only carries values for index labels 0, 1 and 2.
add = pd.Series(data=[1, 2, 0], index=list(range(3)))

In [28]:
# Store the Series as a new column 'add'. Assignment aligns on index labels,
# so rows whose label is missing from the Series (3 and 4) are filled with NaN.
# Assigning to 'more' here would overwrite that column and leave no 'add'
# column for the later `del df['add']` cell, which fails on Restart & Run All.
df['add'] = add

In [31]:
# Show the result: labels missing from the assigned Series (3 and 4) get NaN
df

Unnamed: 0,word,points,many,nanTest,more,add
0,I,3.2,1,10,0,1.0
1,want,3.4,4,30,1,2.0
2,to,2.1,2,20,2,0.0
3,go,5.2,2,40,3,
4,sleep,3.3,5,50,4,


In [34]:
# Overwrite 'more' with a Series that has a value for every index label 0-4.
moreAdd = pd.Series(data=[2, 1.4, 2.3, 2, 1], index=list(range(5)))
df['more'] = moreAdd
df

Unnamed: 0,word,points,many,nanTest,more,add
0,I,3.2,1,10,2.0,1.0
1,want,3.4,4,30,1.4,2.0
2,to,2.1,2,20,2.3,0.0
3,go,5.2,2,40,2.0,
4,sleep,3.3,5,50,1.0,


In [37]:
# New columns can be derived from relationships between existing columns:
# an element-wise difference, then an element-wise comparison (boolean column).
df['new_point'] = df['points'].sub(df['more'])
df['high_point'] = df['new_point'].gt(df['more'])
df

Unnamed: 0,word,points,many,nanTest,more,add,new_point,high_point
0,I,3.2,1,10,2.0,1.0,1.2,False
1,want,3.4,4,30,1.4,2.0,2.0,True
2,to,2.1,2,20,2.3,0.0,-0.2,False
3,go,5.2,2,40,2.0,,3.2,True
4,sleep,3.3,5,50,1.0,,2.3,True


In [38]:
# Delete a column (removes 'add' from df in place)
del df['add']
df

Unnamed: 0,word,points,many,nanTest,more,new_point,high_point
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False
3,go,5.2,2,40,2.0,3.2,True
4,sleep,3.3,5,50,1.0,2.3,True


In [43]:
# Remove every helper column so that only word / points / many / more remain.
# Deleting 'nanTest' alone leaves 'new_point' and 'high_point' in the frame on a
# fresh Restart & Run All; the output below and the later 4-value row insert
# both expect exactly four columns.
df = df.drop(columns=['nanTest', 'new_point', 'high_point'])
df

Unnamed: 0,word,points,many,more
0,I,3.2,1,2.0
1,want,3.4,4,1.4
2,to,2.1,2,2.3
3,go,5.2,2,2.0
4,sleep,3.3,5,1.0


In [44]:
# Column labels that remain after the deletions
df.columns

Index(['word', 'points', 'many', 'more'], dtype='object')

In [45]:
# Name the row index 'Num' and the column index 'Info'
# (df was rebuilt in the "DataFrame Indexing" section, so the names are set again)
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,word,points,many,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,I,3.2,1,2.0
1,want,3.4,4,1.4
2,to,2.1,2,2.3
3,go,5.2,2,2.0
4,sleep,3.3,5,1.0


### DataFrame에서 행 선택, 조작

In [46]:
# Row selection with [start:stop] slicing: the stop position is excluded (rows 0-2)
df[0:3]

Info,word,points,many,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,I,3.2,1,2.0
1,want,3.4,4,1.4
2,to,2.1,2,2.3


In [47]:
# In practice, however, loc / iloc are used more often for row selection
df.loc[3] # the row labeled 3, returned as a Series

Info
word       go
points    5.2
many        2
more      2.0
Name: 3, dtype: object

In [48]:
# Label-based slice: unlike df[0:3], the end label 3 is included
df.loc[:3]

Info,word,points,many,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,I,3.2,1,2.0
1,want,3.4,4,1.4
2,to,2.1,2,2.3
3,go,5.2,2,2.0


In [49]:
# Rows up to label 3 (inclusive), column 'points' only; the result is a Series
df.loc[:3,'points']

Num
0    3.2
1    3.4
2    2.1
3    5.2
Name: points, dtype: float64

In [50]:
# All rows of the single column 'word'; the result is a Series
df.loc[:,'word']

Num
0        I
1     want
2       to
3       go
4    sleep
Name: word, dtype: object

In [51]:
df.loc[:,['word', 'many']] # all rows, only the two columns 'word' and 'many'

Info,word,many
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I,1
1,want,4
2,to,2
3,go,2
4,sleep,5


In [52]:
# Row and column slices are both label-based and include their end labels
df.loc[1:4, "word":"more"]

Info,word,points,many,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,want,3.4,4,1.4
2,to,2.1,2,2.3
3,go,5.2,2,2.0
4,sleep,3.3,5,1.0


In [53]:
# Insert a new row by assigning to a label that does not exist yet.
# The integer label 5 is used (not the string '5') so the index stays
# homogeneous with the existing integer labels 0-4; a mixed int/str index
# breaks label slicing and sorting.
# Note: in the output shown below, 'many' is displayed as float after the insert.
df.loc[5, :] = ['!!', 4.2, 3, 3.2]
df

Info,word,points,many,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,I,3.2,1.0,2.0
1,want,3.4,4.0,1.4
2,to,2.1,2.0,2.3
3,go,5.2,2.0,2.0
4,sleep,3.3,5.0,1.0
5,!!,4.2,3.0,3.2


In [54]:
# iloc selects by integer position rather than by label
df.iloc[3]

Info
word       go
points    5.2
many      2.0
more      2.0
Name: 3, dtype: object

In [57]:
# Lists of positions: rows 0, 3, 5 and columns 1, 2, 0, returned in the requested order
df.iloc[[0, 3, 5], [1, 2, 0]]

Info,points,many,word
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.2,1.0,I
3,5.2,2.0,go
5,4.2,3.0,!!


In [58]:
# All rows, columns at positions 2 and 3 ('many' and 'more')
df.iloc[:, [2, 3]]

Info,many,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,2.0
1,4.0,1.4
2,2.0,2.3
3,2.0,2.0
4,5.0,1.0
5,3.0,3.2


In [59]:
# A single scalar: row position 1, column position 3
df.iloc[1, 3]

1.4

### DataFrame의 Boolean Indexing