In [2]:
import numpy as np
import pandas as pd

### Series

In [2]:
# Pandas의 자료구조 중 Series
obj = pd.Series([10, 30, 20, 40])
obj

0    10
1    30
2    20
3    40
dtype: int64

In [5]:
# Series의 값만
obj.values

array([10, 30, 20, 40])

In [6]:
# Series의 인덱스만
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# 인덱스 바꾸기
obj2 = pd.Series([10, 30, 20, 40], index=['a', 'c', 'b', 'd'])
obj2

a    10
c    30
b    20
d    40
dtype: int64

In [8]:
# python의 dictionary -> Series data
# dictionary의 key -> Series data.index
sdata = {'I':1000, 'Want':3000, 'to':2000, 'go':5000, 'sleep':1500}
obj3 = pd.Series(sdata)
obj3

I        1000
Want     3000
to       2000
go       5000
sleep    1500
dtype: int64

In [9]:
obj3.name = 'Feeling'
obj3.index.name = 'Word'
obj3

Word
I        1000
Want     3000
to       2000
go       5000
sleep    1500
Name: Feeling, dtype: int64

### Data Frame

In [3]:
# Pandas의 자료구조 중 Data Frame
# Data Frame은 들어갈 데이터를 정의해야 됨
# Python의 dictionary or numpy의 array를 사용하여 정의
data = {'word': ['I', 'want', 'to', 'go', 'sleep'],
       'many': [1, 4, 2, 2, 5],
       'points': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data)
df

Unnamed: 0,word,many,points
0,I,1,1.5
1,want,4,1.7
2,to,2,3.6
3,go,2,2.4
4,sleep,5,2.9


In [4]:
# 행 방향의 index
df.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
# 열 방향의 index
df.columns

Index(['word', 'many', 'points'], dtype='object')

In [6]:
# 값 얻기
df.values

array([['I', 1, 1.5],
       ['want', 4, 1.7],
       ['to', 2, 3.6],
       ['go', 2, 2.4],
       ['sleep', 5, 2.9]], dtype=object)

In [7]:
# 각 인덱스에 대한 이름 설정
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,word,many,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,I,1,1.5
1,want,4,1.7
2,to,2,3.6
3,go,2,2.4
4,sleep,5,2.9


In [8]:
# DataFrame을 만들면서 columns, index 설정 가능
df2 = pd.DataFrame(data, columns=['word', 'many', 'points', 'penalty'],
                  index=['0', '1', '2', '3', '4'])
df2

Unnamed: 0,word,many,points,penalty
0,I,1,1.5,
1,want,4,1.7,
2,to,2,3.6,
3,go,2,2.4,
4,sleep,5,2.9,


In [9]:
# describe() 함수는 DataFrame의 다양한 계산 값 도출
df2.describe()

Unnamed: 0,many,points
count,5.0,5.0
mean,2.8,2.42
std,1.643168,0.864292
min,1.0,1.5
25%,2.0,1.7
50%,2.0,2.4
75%,4.0,2.9
max,5.0,3.6


### DataFrame Indexing

In [10]:
# data 추가
data = {"word": ["I", "want", "to", "go", "sleep"],
        "many": [1, 4, 2, 2, 5],
       "points": [3.2, 3.4, 2.1, 5.2, 3.3]}
df = pd.DataFrame(data, columns=["word", "points", "many", "nanTest"],
                 index=[0, 1, 2, 3, 4])
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,
1,want,3.4,4,
2,to,2.1,2,
3,go,5.2,2,
4,sleep,3.3,5,


In [11]:
# 열 선택, 조작
df['many']

0    1
1    4
2    2
3    2
4    5
Name: many, dtype: int64

In [12]:
# 열 선택, 조작
df.many

0    1
1    4
2    2
3    2
4    5
Name: many, dtype: int64

In [13]:
df[['word', 'many']]

Unnamed: 0,word,many
0,I,1
1,want,4
2,to,2
3,go,2
4,sleep,5


In [14]:
# 특정 열 선택하고 대입 가능
df['nanTest'] = 10

In [15]:
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,10
1,want,3.4,4,10
2,to,2.1,2,10
3,go,5.2,2,10
4,sleep,3.3,5,10


In [16]:
# 특정 열에 각각으로 대입
df['nanTest'] = [10, 30, 20, 40, 50] 

In [17]:
df

Unnamed: 0,word,points,many,nanTest
0,I,3.2,1,10
1,want,3.4,4,30
2,to,2.1,2,20
3,go,5.2,2,40
4,sleep,3.3,5,50


In [18]:
df['more'] = np.arange(5)

In [19]:
df

Unnamed: 0,word,points,many,nanTest,more
0,I,3.2,1,10,0
1,want,3.4,4,30,1
2,to,2.1,2,20,2
3,go,5.2,2,40,3
4,sleep,3.3,5,50,4


In [20]:
# Series도 추가 가능
add = pd.Series([1, 2, 0], index=[0, 1, 2])

In [21]:
df['more'] = add

In [22]:
df

Unnamed: 0,word,points,many,nanTest,more
0,I,3.2,1,10,1.0
1,want,3.4,4,30,2.0
2,to,2.1,2,20,0.0
3,go,5.2,2,40,
4,sleep,3.3,5,50,


In [23]:
moreAdd = pd.Series([2, 1.4, 2.3, 2, 1], index=[0, 1, 2, 3, 4])
df['more'] = moreAdd
df

Unnamed: 0,word,points,many,nanTest,more
0,I,3.2,1,10,2.0
1,want,3.4,4,30,1.4
2,to,2.1,2,20,2.3
3,go,5.2,2,40,2.0
4,sleep,3.3,5,50,1.0


In [24]:
# 새로운 열을 만들 때 다른 열과 관계를 만들 수 있음
df['new_point'] = df['points'] - df['more']
df['high_point'] = df['new_point'] > df['more']
df

Unnamed: 0,word,points,many,nanTest,more,new_point,high_point
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False
3,go,5.2,2,40,2.0,3.2,True
4,sleep,3.3,5,50,1.0,2.3,True


In [43]:
#열 삭제
del df['nanTest']
df

Unnamed: 0,word,points,many,more
0,I,3.2,1,2.0
1,want,3.4,4,1.4
2,to,2.1,2,2.3
3,go,5.2,2,2.0
4,sleep,3.3,5,1.0


In [26]:
df.columns

Index(['word', 'points', 'many', 'nanTest', 'more', 'new_point', 'high_point'], dtype='object')

In [27]:
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False
3,go,5.2,2,40,2.0,3.2,True
4,sleep,3.3,5,50,1.0,2.3,True


### DataFrame에서 행 선택, 조작

In [28]:
# [::] 슬라이싱 사용
df[0:3]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False


In [29]:
# 하지만 loc, iloc 함수를 더 사용
df.loc[3] # 반환 형태는 Series

Info
word            go
points         5.2
many             2
nanTest         40
more           2.0
new_point      3.2
high_point    True
Name: 3, dtype: object

In [30]:
df.loc[:3]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1,10,2.0,1.2,False
1,want,3.4,4,30,1.4,2.0,True
2,to,2.1,2,20,2.3,-0.2,False
3,go,5.2,2,40,2.0,3.2,True


In [31]:
df.loc[:3,'points']

Num
0    3.2
1    3.4
2    2.1
3    5.2
Name: points, dtype: float64

In [32]:
df.loc[:,'word']

Num
0        I
1     want
2       to
3       go
4    sleep
Name: word, dtype: object

In [33]:
df.loc[:,['word', 'many']] #전체 열과 두 행 출력

Info,word,many
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I,1
1,want,4
2,to,2
3,go,2
4,sleep,5


In [34]:
df.loc[1:4, "word":"more"]

Info,word,points,many,nanTest,more
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,want,3.4,4,30,1.4
2,to,2.1,2,20,2.3
3,go,5.2,2,40,2.0
4,sleep,3.3,5,50,1.0


In [None]:
# 새로운 행 삽입
df.loc['5',:] = ['!!', 4.2, 3, 3.2]
df

In [37]:
# index 번호를 사용하는 ilc 함수
df.iloc[3]

Info
word            go
points         5.2
many           2.0
nanTest       40.0
more           2.0
new_point      3.2
high_point    True
Name: 3, dtype: object

In [38]:
df.iloc[[0, 3, 5], [1, 2, 0]]

Info,points,many,word
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.2,1.0,I
3,5.2,2.0,go
5,,,


In [39]:
df.iloc[:, [2, 3]]

Info,many,nanTest
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,10.0
1,4.0,30.0
2,2.0,20.0
3,2.0,40.0
4,5.0,50.0
5,,


In [40]:
df.iloc[1, 3]

30.0

### DataFrame의 Boolean Indexing

In [41]:
df

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1.0,10.0,2.0,1.2,False
1,want,3.4,4.0,30.0,1.4,2.0,True
2,to,2.1,2.0,20.0,2.3,-0.2,False
3,go,5.2,2.0,40.0,2.0,3.2,True
4,sleep,3.3,5.0,50.0,1.0,2.3,True
5,,,,,,,


In [42]:
# nanTest와 비교하여 boolean data
df['nanTest'] > 35

Num
0    False
1    False
2    False
3     True
4     True
5    False
Name: nanTest, dtype: bool

In [43]:
# nanTest와 비교하여 True인 모든 행의 값
df.loc[df['nanTest'] > 35, :]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,go,5.2,2.0,40.0,2.0,3.2,True
4,sleep,3.3,5.0,50.0,1.0,2.3,True


In [45]:
df.loc[df['many'] == 2, :]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,to,2.1,2.0,20.0,2.3,-0.2,False
3,go,5.2,2.0,40.0,2.0,3.2,True


In [46]:
# 논리 연산도 가능
df.loc[(df['many'] > 3) & (df['points'] < 3), :]

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [47]:
# 새로운 값 대입
df.loc[df['word'] == 'sleep', 'word'] = 'home'

In [48]:
df

Info,word,points,many,nanTest,more,new_point,high_point
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,I,3.2,1.0,10.0,2.0,1.2,False
1,want,3.4,4.0,30.0,1.4,2.0,True
2,to,2.1,2.0,20.0,2.3,-0.2,False
3,go,5.2,2.0,40.0,2.0,3.2,True
4,home,3.3,5.0,50.0,1.0,2.3,True
5,,,,,,,


### Data

In [49]:
# DataFrame을 만들 때 index, column을 설정하지 않을 때 0부터 시작하는 정수형 숫자로 입력
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,1.540378,2.107857,0.015991,0.266569
1,-0.350205,0.011101,-0.615892,-0.978691
2,-1.5257,0.201026,-0.59293,-0.066056
3,1.916667,2.284871,0.104085,1.146015
4,0.071847,-1.286016,-1.612054,-0.506494
5,-0.387793,1.197102,0.784722,0.292135


In [50]:
df.columns = ['A', 'B', 'C', 'D']
df.index = pd.date_range('20211124', periods=6)
# pandas의 함수 date_range는 datetime 자료형으로 구성된 날짜, 시각 등을 알 수 있는 자료형을 만드는 함수
df.index

DatetimeIndex(['2021-11-24', '2021-11-25', '2021-11-26', '2021-11-27',
               '2021-11-28', '2021-11-29'],
              dtype='datetime64[ns]', freq='D')

In [51]:
df

Unnamed: 0,A,B,C,D
2021-11-24,1.540378,2.107857,0.015991,0.266569
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056
2021-11-27,1.916667,2.284871,0.104085,1.146015
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494
2021-11-29,-0.387793,1.197102,0.784722,0.292135


In [52]:
# np.nan은 NaN값을 의미
df['F'] = [1.0, np.nan, 3.5, 4.2, np.nan, 2.8]
df

Unnamed: 0,A,B,C,D,F
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494,
2021-11-29,-0.387793,1.197102,0.784722,0.292135,2.8


In [65]:
# NaN 없애기
# 행의 값 중 하나라도 nan이면 그 행을 바로 drop
df.dropna(how='any')

Unnamed: 0,A,B,C,D,F,G


In [56]:
df['G'] = np.nan
df

Unnamed: 0,A,B,C,D,F,G
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0,
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,,
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5,
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2,
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494,,
2021-11-29,-0.387793,1.197102,0.784722,0.292135,2.8,


In [61]:
# 행의 값이 모두 nan일 때 그 행을 바로 drop
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F,G
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0,
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,,
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5,
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2,
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494,,
2021-11-29,-0.387793,1.197102,0.784722,0.292135,2.8,


In [66]:
# nan값에 값 넣기
df.fillna(value=2.3)

Unnamed: 0,A,B,C,D,F,G
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0,2.3
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,2.3,2.3
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5,2.3
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2,2.3
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494,2.3,2.3
2021-11-29,-0.387793,1.197102,0.784722,0.292135,2.8,2.3


In [68]:
df.isnull()

Unnamed: 0,A,B,C,D,F,G
2021-11-24,False,False,False,False,False,True
2021-11-25,False,False,False,False,True,True
2021-11-26,False,False,False,False,False,True
2021-11-27,False,False,False,False,False,True
2021-11-28,False,False,False,False,True,True
2021-11-29,False,False,False,False,False,True


In [69]:
# F열에서 nan값을 포함한 행만 추출
df.loc[df.isnull()['F'], :]

Unnamed: 0,A,B,C,D,F,G
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,,
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494,,


In [72]:
pd.to_datetime("20211125")

Timestamp('2021-11-25 00:00:00')

In [73]:
# 특정 행 drop
df.drop(pd.to_datetime('20211128'))

Unnamed: 0,A,B,C,D,F,G
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0,
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,,
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5,
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2,
2021-11-29,-0.387793,1.197102,0.784722,0.292135,2.8,


In [77]:
# 특정 행 2개 drop
df.drop([pd.to_datetime('20211128'), pd.to_datetime('20211129')])

Unnamed: 0,A,B,C,D,F,G
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0,
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,,
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5,
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2,


In [None]:
# 특정 열 삭제
df.drop('G', axis = 1)

In [82]:
df

Unnamed: 0,A,B,C,D,F
2021-11-24,1.540378,2.107857,0.015991,0.266569,1.0
2021-11-25,-0.350205,0.011101,-0.615892,-0.978691,
2021-11-26,-1.5257,0.201026,-0.59293,-0.066056,3.5
2021-11-27,1.916667,2.284871,0.104085,1.146015,4.2
2021-11-28,0.071847,-1.286016,-1.612054,-0.506494,
2021-11-29,-0.387793,1.197102,0.784722,0.292135,2.8


In [85]:
# 특정 열 2개 삭제
df = df.drop(['D', 'F'], axis = 1)

In [86]:
df

Unnamed: 0,A,B,C
2021-11-24,1.540378,2.107857,0.015991
2021-11-25,-0.350205,0.011101,-0.615892
2021-11-26,-1.5257,0.201026,-0.59293
2021-11-27,1.916667,2.284871,0.104085
2021-11-28,0.071847,-1.286016,-1.612054
2021-11-29,-0.387793,1.197102,0.784722


### Data 분석용 함수들

In [87]:
data = [[1.3, np.nan],
       [4.3, 1.3],
       [np.nan, np.nan],
       [3.3, 1.2]]
df = pd.DataFrame(data, columns=[0, 1], index=['a', 'b', 'c', 'd'])

In [89]:
df

Unnamed: 0,0,1
a,1.3,
b,4.3,1.3
c,,
d,3.3,1.2


In [88]:
# 행방향으로의 합(각 열의 합)
df.sum(axis=0)

0    8.9
1    2.5
dtype: float64

In [90]:
# 열방향으로의 합(각 행의 합)
df.sum(axis=1)

a    1.3
b    5.6
c    0.0
d    4.5
dtype: float64

In [91]:
# skipna; NaN값을 배제하는가
df.sum(axis=1, skipna=False)

a    NaN
b    5.6
c    NaN
d    4.5
dtype: float64

In [92]:
df.loc['b'].sum()

5.6

### pandas에서 DataFrame 적용되는 함수들
#### sum() - 총합 반환
#### count() - NaN이 아닌 전체 성분 갯수 반환
#### min, max() - 최솟, 최댓값 반환
#### argmin, argmax() - 최솟, 최댓값의 인덱스 반환
#### idxmin, idxmax() - 전체 인덱스 중 최솟, 최댓값 반환
#### quantile() - 전체 성분의 특정 사분위수에 해당하는 값 반환(0~1)
#### mean() - 평균값 반환
#### median() - 중간값 반환
#### mad() - 절대편차의 평균값 반환
#### std, var() - 전체 성분의 표준편차, 분산 반환
#### cumsum() - 첫 번째 성분부터 각 성분까지의 누적합 반환
#### cumprod() - 첫 번째 성분부터 각 성분까지의 누적곱 반환

In [100]:
df2 = pd.DataFrame(np.random.randn(6, 4),
                 columns=['A', 'B', 'C', 'D'],
                 index=pd.date_range("20211125", periods=6))
df2

Unnamed: 0,A,B,C,D
2021-11-25,-1.056307,-0.326885,-0.458411,-1.14342
2021-11-26,-1.890858,0.913156,-0.609975,1.311395
2021-11-27,-1.815141,-1.219211,-1.393182,-1.712434
2021-11-28,-1.4705,-0.270682,0.590788,0.105109
2021-11-29,0.673874,1.354477,-0.153637,-1.056006
2021-11-30,-1.138169,2.113754,-0.258566,-0.443114


In [101]:
# A열과 B열의 상관계수 구하기
df2['A'].corr(df2['B'])

0.4452151887705657

In [102]:
# B열과 C열의 공분산 구하기
df2['B'].cov(df2['C'])

0.27053934802290425

#### 정렬함수 및 기타함수

In [103]:
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates, columns=["D", "B", "C", "A"])
df2

Unnamed: 0,D,B,C,A
2021-11-30,-0.443114,2.113754,-0.258566,-1.138169
2021-11-28,0.105109,-0.270682,0.590788,-1.4705
2021-11-27,-1.712434,-1.219211,-1.393182,-1.815141
2021-11-26,1.311395,0.913156,-0.609975,-1.890858
2021-11-29,-1.056006,1.354477,-0.153637,0.673874
2021-11-25,-1.14342,-0.326885,-0.458411,-1.056307


In [104]:
# index, column의 순서가 섞여있을 때
# index를 오름차순으로 정렬
df2.sort_index(axis=0)

Unnamed: 0,D,B,C,A
2021-11-25,-1.14342,-0.326885,-0.458411,-1.056307
2021-11-26,1.311395,0.913156,-0.609975,-1.890858
2021-11-27,-1.712434,-1.219211,-1.393182,-1.815141
2021-11-28,0.105109,-0.270682,0.590788,-1.4705
2021-11-29,-1.056006,1.354477,-0.153637,0.673874
2021-11-30,-0.443114,2.113754,-0.258566,-1.138169


In [105]:
# column을 기준으로
df2.sort_index(axis=1) 

Unnamed: 0,A,B,C,D
2021-11-30,-1.138169,2.113754,-0.258566,-0.443114
2021-11-28,-1.4705,-0.270682,0.590788,0.105109
2021-11-27,-1.815141,-1.219211,-1.393182,-1.712434
2021-11-26,-1.890858,0.913156,-0.609975,1.311395
2021-11-29,0.673874,1.354477,-0.153637,-1.056006
2021-11-25,-1.056307,-0.326885,-0.458411,-1.14342


In [106]:
# 내림차순으로
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,D,B,C,A
2021-11-30,-0.443114,2.113754,-0.258566,-1.138169
2021-11-29,-1.056006,1.354477,-0.153637,0.673874
2021-11-28,0.105109,-0.270682,0.590788,-1.4705
2021-11-27,-1.712434,-1.219211,-1.393182,-1.815141
2021-11-26,1.311395,0.913156,-0.609975,-1.890858
2021-11-25,-1.14342,-0.326885,-0.458411,-1.056307


In [113]:
# 값 기준 정렬
# A열 값들을 오름차순 정렬
df2.sort_values(by='A')

Unnamed: 0,D,B,C,A
2021-11-26,1.311395,0.913156,-0.609975,-1.890858
2021-11-27,-1.712434,-1.219211,-1.393182,-1.815141
2021-11-28,0.105109,-0.270682,0.590788,-1.4705
2021-11-30,-0.443114,2.113754,-0.258566,-1.138169
2021-11-25,-1.14342,-0.326885,-0.458411,-1.056307
2021-11-29,-1.056006,1.354477,-0.153637,0.673874


In [114]:
df2["E"] = np.random.randint(0, 6, size=6)
df2["F"] = ["a", 'b', 'c', 'd', 'e', 'f']
df2 

Unnamed: 0,D,B,C,A,E,F
2021-11-30,-0.443114,2.113754,-0.258566,-1.138169,0,a
2021-11-28,0.105109,-0.270682,0.590788,-1.4705,2,b
2021-11-27,-1.712434,-1.219211,-1.393182,-1.815141,2,c
2021-11-26,1.311395,0.913156,-0.609975,-1.890858,3,d
2021-11-29,-1.056006,1.354477,-0.153637,0.673874,5,e
2021-11-25,-1.14342,-0.326885,-0.458411,-1.056307,5,f


In [115]:
# E, F열을 고려하여 오름차순으로
df2.sort_values(by=['E', 'F'])

Unnamed: 0,D,B,C,A,E,F
2021-11-30,-0.443114,2.113754,-0.258566,-1.138169,0,a
2021-11-28,0.105109,-0.270682,0.590788,-1.4705,2,b
2021-11-27,-1.712434,-1.219211,-1.393182,-1.815141,2,c
2021-11-26,1.311395,0.913156,-0.609975,-1.890858,3,d
2021-11-29,-1.056006,1.354477,-0.153637,0.673874,5,e
2021-11-25,-1.14342,-0.326885,-0.458411,-1.056307,5,f


In [119]:
# 지정한 행 or 열에서 중복 없는 값만 얻기
df2['F'].unique()

array(['a', 'b', 'c', 'd', 'e', 'f'], dtype=object)

In [120]:
#지정한 헹 or 열에서 값에 따른 갯수
df2['E'].value_counts()

2    2
5    2
0    1
3    1
Name: E, dtype: int64

In [126]:
# 지정한 행 or 열에서 특정 값이 있는지 확인
df2['E'].isin([2])

2021-11-30    False
2021-11-28     True
2021-11-27     True
2021-11-26    False
2021-11-29    False
2021-11-25    False
Name: E, dtype: bool

In [127]:
df2.loc[df2['E'].isin([2, 5]),:]

Unnamed: 0,D,B,C,A,E,F
2021-11-28,0.105109,-0.270682,0.590788,-1.4705,2,b
2021-11-27,-1.712434,-1.219211,-1.393182,-1.815141,2,c
2021-11-29,-1.056006,1.354477,-0.153637,0.673874,5,e
2021-11-25,-1.14342,-0.326885,-0.458411,-1.056307,5,f


In [129]:
df3 = pd.DataFrame(np.random.randn(4, 3), 
                   columns=['a', 'c', 'b'], 
                   index=['today', 'I', 'go', 'home'])
df3

Unnamed: 0,a,c,b
today,0.378107,-0.236226,0.542216
I,-0.392722,0.512885,-0.616175
go,-0.203758,-0.183565,-0.109103
home,2.382386,-1.852909,1.094763


In [130]:
# lamnda : 런타임에서 생성하고 사용할 수 있는 함수
func = lambda x: x.max() - x.min()

In [131]:
# apply : 함수 적용
df3.apply(func, axis=0)

a    2.775108
c    2.365794
b    1.710938
dtype: float64