# 본문

In [1]:
# 모듈 import
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# 데이터셋
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df.describe() # 수치형 컬럼에 대한 통계를 보여줌

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# 문자형 컬럼에 대한 통계표
df.describe(include='object')

Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


In [7]:
# 데이터의 개수
df.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [8]:
# 단일컬럼의 개수를 구하는 경우
df['age'].count()

714

In [9]:
# 전체평균
df.mean()

  df.mean()


survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [10]:
# 컬럼에 대한 평균
df['age'].mean()

29.69911764705882

In [11]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [12]:
# 조건별 평균
# 성인 남성의 나이 평균
df.loc[df['adult_male']==True,'age'].mean()

33.17312348668281

In [None]:
# skipna=True 옵션
# 만약, skipna=False로 설정하게 된다면, NaN 값이 있는 column은 NaN 값으로 출력 됩니다.

In [13]:
df.mean(skipna=False)

  df.mean(skipna=False)


survived       0.383838
pclass         2.308642
age                 NaN
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [14]:
df.mean(skipna=True)

  df.mean(skipna=True)


survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [23]:
print(f'나이평균: {df.age.mean():.2f}')
print(f'나이중앙값 : {df.age.median():.2f}') 
print(f'나이 중앙값과 평균의 차이 : {df.age.mean()-df.age.median():.2f}')

나이평균: 29.70
나이중앙값 : 28.00
나이 중앙값과 평균의 차이 : 1.70


In [24]:
# sum() : 합계
df.loc[:,['age','fare']].sum()

age     21205.1700
fare    28693.9493
dtype: float64

In [25]:
# 단일 column에 대한 합계 출력
df['fare'].sum()

28693.9493

In [26]:
# 누적합, 누적곱
# cumsum(), cumprod()
df['age'].cumsum()

0         22.00
1         60.00
2         86.00
3        121.00
4        156.00
         ...   
886    21128.17
887    21147.17
888         NaN
889    21173.17
890    21205.17
Name: age, Length: 891, dtype: float64

In [27]:
df['age'].cumprod()

0            22.0
1           836.0
2         21736.0
3        760760.0
4      26626600.0
          ...    
886           inf
887           inf
888           NaN
889           inf
890           inf
Name: age, Length: 891, dtype: float64

In [36]:
# 분산
((df['fare'].values-df['fare'].values.mean())**2).sum()/((df['fare'].count()) - 1)

2469.436845743116

In [37]:
df['fare'].var()

2469.436845743117

In [38]:
# 표준편차
np.sqrt(df['fare'].var())

49.693428597180905

In [39]:
# 최소값, 최대값
df['age'].min(), df['age'].max()

(0.42, 80.0)

In [40]:
# agg - aggregation: 통합 통계 적용 
df['age'].agg(['min','max','count','mean'])

min        0.420000
max       80.000000
count    714.000000
mean      29.699118
Name: age, dtype: float64

In [43]:
# 복수의 컬럼에 agg 적용 - 컬럼에 대괄호 하나 더추가
df[['age','fare']].agg(['min','max','count','mean'])

Unnamed: 0,age,fare
min,0.42,0.0
max,80.0,512.3292
count,714.0,891.0
mean,29.699118,32.204208


In [None]:
# quantile()

In [44]:
df['age'].quantile(0.1)

14.0

In [45]:
df['age'].quantile(0.8)

41.0

In [46]:
# unique() - 고유값, nunique() - 고유값 개수
# 고유값과 고유값의 개수를 구하고자 할 때 사용합니다.

In [47]:
df['who'].unique()

array(['man', 'woman', 'child'], dtype=object)

In [48]:
df['who'].nunique()

3

In [None]:
# mode() : 최빈값

In [49]:
df['who'].mode()

0    man
Name: who, dtype: object

In [50]:
df['who'].value_counts()

man      537
woman    271
child     83
Name: who, dtype: int64

In [51]:
df['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [52]:
df['deck'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64

In [None]:
# corr() : 상관관계

In [53]:
df.corr()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,-0.55708,-0.203367
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.094035,0.135207
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,0.280328,0.19827
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.584471
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.349943,-0.583398
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.182024,-0.271832
adult_male,-0.55708,0.094035,0.280328,-0.253586,-0.349943,-0.182024,1.0,0.404744
alone,-0.203367,0.135207,0.19827,-0.584471,-0.583398,-0.271832,0.404744,1.0


In [54]:
df.corr()['survived']

survived      1.000000
pclass       -0.338481
age          -0.077221
sibsp        -0.035322
parch         0.081629
fare          0.257307
adult_male   -0.557080
alone        -0.203367
Name: survived, dtype: float64

# 연습문제

- 다음 조건을 만족하는 승객의 나이 평균과 조건을 만족하는 데이터의 개수를 구하세요.
- fare를 30 이상 40 미만 지불한 승객
- pclass는 1등급

In [55]:
df.loc[(df['fare']>=30)&(df['fare']<40)&(df['pclass']==1),['age']].mean()

age    44.095238
dtype: float64

In [56]:
df.loc[(df['fare']>=30)&(df['fare']<40)&(df['pclass']==1),['age']].count()

age    21
dtype: int64

- 다이아몬드 데이터를 활용하여 다음의 문제를 풀어주세요

In [57]:
diamond = sns.load_dataset('diamonds')
diamond

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


- depth의 최소값을 구하세요
- carat에 대한 평균과 분산을 동시에 출력하세요

In [58]:
diamond['depth'].min()

43.0

In [59]:
diamond['carat'].mean(), diamond['carat'].var()

(0.7979397478679852, 0.22468665982273753)

- x, y에 대한 합계와 표준편차를 출력하세요

In [61]:
diamond[['x','y']].agg(['sum','std'])

Unnamed: 0,x,y
sum,309138.62,309320.33
std,1.121761,1.142135


- 펭귄 데이터를 활용하여 다음의 문제를 풀어주세요
- species 컬럼의 고유값을 출력해 주세요
- island 컬럼의 최빈값을 출력해 주세요
- body_mass_g 컬럼의 10% 분위수 값(하위 10%)을 출력해 주세요
- body_mass_g 컬럼의 80% 분위수 값(상위 20%)을 출력해 주세요

In [63]:
penguin = sns.load_dataset('penguins')
penguin.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [64]:
penguin['species'].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [65]:
penguin['island'].mode()

0    Biscoe
Name: island, dtype: object

In [66]:
penguin['body_mass_g'].quantile(0.1)

3300.0

In [67]:
penguin['body_mass_g'].quantile(0.8)

4950.0