In [1]:
import pandas as pd
import numpy as np

## 1. 조건 (Mask) 생성 및 적용 결과 확인하기
- DataFrame 레벨 조건 생성 및 적용 결과
- Series 레벨 조건 생성 및 적용 결과

In [24]:
# Build a 6x6 demo frame of uniform random values in [0, 1).
# A fixed seed makes "Restart Kernel -> Run All" reproduce the same numbers,
# so the outputs of the later mask cells stay consistent with this table.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.random((6, 6)),
    columns=['A', 'B', 'C', 'D', 'E', 'F'],
)
df

Unnamed: 0,A,B,C,D,E,F
0,0.490002,0.236682,0.62043,0.713736,0.146052,0.980088
1,0.644666,0.902812,0.911763,0.278795,0.94643,0.131201
2,0.688285,0.093478,0.782924,0.602689,0.314115,0.269043
3,0.254622,0.536576,0.486748,0.674103,0.656383,0.407015
4,0.243396,0.70364,0.328009,0.955008,0.104341,0.23575
5,0.560185,0.478441,0.83167,0.232693,0.771186,0.60476


### DataFrame 레벨 조건 생성 및 적용 결과

In [25]:
# Element-wise comparison: yields a boolean DataFrame (mask) with the same shape as df.
df.gt(0.5)

Unnamed: 0,A,B,C,D,E,F
0,False,False,True,True,False,True
1,True,True,True,False,True,False
2,True,False,True,True,False,False
3,False,True,False,True,True,False
4,False,True,False,True,False,False
5,True,False,True,False,True,True


In [19]:
# Apply the DataFrame-level mask: cells where the mask is False become NaN.
df.where(df > 0.5)

Unnamed: 0,A,B,C,D,E,F
0,,0.922102,,,0.732188,
1,,,0.538316,0.549558,,0.76686
2,0.66497,,,0.586038,,0.892312
3,,,,,0.596678,0.645663
4,0.814416,,0.917453,0.548274,,0.565958
5,,,,,,0.639728


In [20]:
# Applying a DataFrame-level mask to a Series is not supported: pandas raises TypeError.
# The error is the point of this cell, so catch it and show the message instead of
# letting the exception stop a "Run All" execution.
try:
    df['B'][df > 0.5]
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Indexing a Series with DataFrame is not supported, use the appropriate DataFrame column

### Series 레벨 조건 생성 및 적용 결과

In [21]:
# Series-level mask: one boolean per row, computed from column 'A' only.
df['A'].gt(0.5)

0    False
1    False
2     True
3    False
4     True
5    False
Name: A, dtype: bool

In [22]:
# A Series-level mask applied to the DataFrame selects whole rows where it is True.
df.loc[df['A'] > 0.5]

Unnamed: 0,A,B,C,D,E,F
2,0.66497,0.013002,0.427867,0.586038,0.44611,0.892312
4,0.814416,0.429033,0.917453,0.548274,0.191362,0.565958


In [23]:
# Values of column 'B' for the rows where column 'A' exceeds 0.5.
df.loc[df['A'] > 0.5, 'B']

2    0.013002
4    0.429033
Name: B, dtype: float64

## 2. 여러 조건을 결합하여 Boolean Indexing 하기

In [27]:
# Load the baseball dataset; from here on `df` refers to this frame
# (it replaces the random demo frame used in section 1).
df = pd.read_csv(filepath_or_buffer='data_raw/baseball.csv')
df

Unnamed: 0,id,player,year,stint,team,lg,g,ab,r,h,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,88641,womacto01,2006,2,CHN,NL,19,50,6,14,...,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0
1,88643,schilcu01,2006,1,BOS,AL,31,2,0,1,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,88645,myersmi01,2006,1,NYA,AL,62,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,88649,helliri01,2006,1,MIL,NL,20,3,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
4,88650,johnsra05,2006,1,NYA,AL,33,6,0,1,...,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,89525,benitar01,2007,2,FLO,NL,34,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
96,89526,benitar01,2007,1,SFN,NL,19,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
97,89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,...,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0
98,89533,aloumo01,2007,1,NYN,NL,87,328,51,112,...,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0


In [34]:
# Only batters with at least 100 at-bats (ab) and at least 100 hits (h):
#   (ab >= 100) & (h >= 100)
# `&` combines the two Series-level masks element-wise.
df['ab'].ge(100) & df['h'].ge(100)

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98     True
99    False
Length: 100, dtype: bool

In [35]:
# Rows with ab >= 100 and h >= 100, restricted to the key columns; preview the first five.
df.loc[
    df['ab'].ge(100) & df['h'].ge(100),
    ['id', 'player', 'year', 'team', 'ab', 'h'],
].head()

Unnamed: 0,id,player,year,team,ab,h
5,88652,finlest01,2006,SFN,426,105
6,88653,gonzalu01,2006,ARI,586,159
22,89347,vizquom01,2007,SFN,513,126
28,89360,thomeji01,2007,CHA,432,119
29,89361,thomafr04,2007,TOR,531,147


In [50]:
# Players on the Boston Red Sox ('BOS') who hit
# 20 or more home runs ('hr').
df['team'].eq('BOS') & df['hr'].ge(20)

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Length: 100, dtype: bool

In [52]:
# Rows for team 'BOS' with hr >= 20, restricted to the columns of interest.
df.loc[
    df['team'].eq('BOS') & df['hr'].ge(20),
    ['id', 'player', 'year', 'team', 'ab', 'h', 'X2b', 'hr'],
]

Unnamed: 0,id,player,year,team,ab,h,X2b,hr
48,89396,ramirma02,2007,BOS,483,143,33,20
