In [1]:
import pandas as pd
import numpy as np

### 데이터 갯수 세기
판다스 패키지의 `Series`, `DataFrame` 객체의 `Count()` 메서드로 데이터의 갯수를 셀 수 있음  
단, `NaN` 값은 포함하지 않음

In [6]:
s = pd.Series(np.arange(10))
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [7]:
s.count()

9

In [8]:
df = pd.DataFrame(np.random.randint(5, size=(4,4)), dtype=float)
df.iloc[2, 3] = np.nan
df

Unnamed: 0,0,1,2,3
0,4.0,1.0,4.0,0.0
1,0.0,0.0,4.0,4.0
2,0.0,2.0,1.0,
3,2.0,4.0,0.0,4.0


In [10]:
# 열 기준으로 그룹화하여 카운트
df.count()

0    4
1    4
2    4
3    3
dtype: int64

In [11]:
import seaborn

In [12]:
titanic = seaborn.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [13]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

### 카테고리 값 세기
`Series` 객체에 값이 정수, 문자열, 카테고리 값인 경우에 `value_counts()` 메서드로 각 값의 개수를 셀 수 있음  
`DataFrame` 객체에는 `value_counts()` 메서드가 존재하지 않아 각 열마다 따로 개수를 세야함

In [14]:
s2 = pd.Series(np.random.randint(6, size=250))
s2.tail()

245    2
246    5
247    0
248    2
249    2
dtype: int32

In [15]:
s2.value_counts()

2    51
1    45
0    44
3    40
5    37
4    33
Name: count, dtype: int64

In [18]:
df[0].value_counts()

0
0.0    2
4.0    1
2.0    1
Name: count, dtype: int64

### 정렬
`sort_index()`, `sort_value()` 메서드로 정렬 가능  
`sort_index()` : 인덱스 기준으로 정렬   
`sort_value()` : 값 기준으로 정렬, 만약 NaN값이 있다면 무조껀 맨뒤로 지정됨  
(기본적으로 오름차순 정렬, 만약 내림차순 정렬을 하고 싶으면 `ascending = False`를 지정)

In [20]:
s2.value_counts().sort_index() # 인덱스 기준으로 정렬됨

0    44
1    45
2    51
3    40
4    33
5    37
Name: count, dtype: int64

In [22]:
s.sort_values() # 값 기준으로 정렬, NaN 값은 마지막으로 이동

0    0.0
1    1.0
2    2.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
3    NaN
dtype: float64

In [23]:
s.sort_values(ascending=False) # 오름차순 정렬 됨

9    9.0
8    8.0
7    7.0
6    6.0
5    5.0
4    4.0
2    2.0
1    1.0
0    0.0
3    NaN
dtype: float64

데이터프레임에서 `sort_values()` 메서드를 사용할 땐 `by`인수로 정렬 기준이 될 열을 지정  
`by` 인수에 리스트를 전달하면 복수 정렬을 하게 됨

In [27]:
df

Unnamed: 0,0,1,2,3
0,4.0,1.0,4.0,0.0
1,0.0,0.0,4.0,4.0
2,0.0,2.0,1.0,
3,2.0,4.0,0.0,4.0


In [25]:
df.sort_values(by=2)

Unnamed: 0,0,1,2,3
3,2.0,4.0,0.0,4.0
2,0.0,2.0,1.0,
0,4.0,1.0,4.0,0.0
1,0.0,0.0,4.0,4.0


In [26]:
df.sort_values(by=[2, 0]) # 첫번째 열 값 기준으로 오름차순, 뒤에 오는 열은 앞에 열이 중복일 경우 오름차순함

Unnamed: 0,0,1,2,3
3,2.0,4.0,0.0,4.0
2,0.0,2.0,1.0,
1,0.0,0.0,4.0,4.0
0,4.0,1.0,4.0,0.0


##### 파이썬으로 다음 연산을 수행한다.

`sort_values` 메서드를 사용하여 타이타닉호 승객에 대해  
성별(sex) 인원수,   
나이별(age) 인원수,   
선실별(class) 인원수,   
사망/생존(alive) 인원수를 구하라.  

In [34]:
titanic['sex'].value_counts().sort_values()

sex
female    314
male      577
Name: count, dtype: int64

In [35]:
titanic['age'].value_counts().sort_values()

age
74.00     1
34.50     1
0.42      1
0.67      1
66.00     1
         ..
28.00    25
19.00    25
18.00    26
22.00    27
24.00    30
Name: count, Length: 88, dtype: int64

In [36]:
titanic['class'].value_counts().sort_values()

class
Second    184
First     216
Third     491
Name: count, dtype: int64

In [37]:
titanic['alive'].value_counts().sort_values()

alive
yes    342
no     549
Name: count, dtype: int64

### 행 / 열 합계  
`sum()` 메서드로 행 또는 열의 합계를 구할 수 있음  
`axis` 인수로 1을 지정하면 행의 합, 0(default)을 지정하면 열의 합

In [41]:
df2 = pd.DataFrame(np.random.randint(10, size=(4, 6)))
df2

Unnamed: 0,0,1,2,3,4,5
0,9,7,2,4,9,2
1,0,9,7,1,9,2
2,0,7,1,8,4,2
3,4,4,2,8,3,1


In [42]:
df2.sum(axis=1)

0    33
1    28
2    22
3    22
dtype: int64

In [43]:
df2['RowSum'] = df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,RowSum
0,9,7,2,4,9,2,33
1,0,9,7,1,9,2,28
2,0,7,1,8,4,2,22
3,4,4,2,8,3,1,22


In [44]:
df2.sum()

0          13
1          27
2          12
3          21
4          25
5           7
RowSum    105
dtype: int64

In [45]:
df2.loc['ColTotal', :] = df2.sum() # 행추가
df2

Unnamed: 0,0,1,2,3,4,5,RowSum
0,9.0,7.0,2.0,4.0,9.0,2.0,33.0
1,0.0,9.0,7.0,1.0,9.0,2.0,28.0
2,0.0,7.0,1.0,8.0,4.0,2.0,22.0
3,4.0,4.0,2.0,8.0,3.0,1.0,22.0
ColTotal,13.0,27.0,12.0,21.0,25.0,7.0,105.0


In [47]:
df2.mean(axis=1)

0            9.428571
1            8.000000
2            6.285714
3            6.285714
ColTotal    30.000000
dtype: float64

In [48]:
df2.mean()

0          5.2
1         10.8
2          4.8
3          8.4
4         10.0
5          2.8
RowSum    42.0
dtype: float64

In [62]:
titanic['age'].mean()

29.69911764705882

In [61]:
titanic.loc[titanic.sex == 'female', 'age'].mean()

27.915708812260537

In [71]:
titanic.loc[(titanic.pclass == 1) & (titanic.sex == 'female'), 'age'].mean()

34.61176470588235

In [86]:
df3 = pd.DataFrame({
    'A': [1, 3, 4, 3, 4],
    'B': [2, 3, 1, 2, 3],
    'C': [1, 5, 2, 4, 4]
})
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [84]:
def getMean(x):
  return x.max() - x.min()

df3.apply(getMean)

A    3
B    2
C    4
dtype: int64

In [87]:
df3.apply(getMean, axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [88]:
df3.apply(pd.value_counts)

  df3.apply(pd.value_counts)


Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [91]:
def checkAge (row):
  result = None
  if row.age >= 20:
    result = 'adult'
  else:
    result = 'child'
  return result


titanic['adult/child'] = titanic.apply(checkAge, axis=1)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult/child
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,adult
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,adult
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,adult
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,adult
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,adult
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,child
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,child
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,adult
