# Lec 3 – 데이터통계량

***Reference***

*1. 연세대학교 컴퓨팅사고 강의자료*

*2. http://pythonstudy.xyz/python/article/402-numpy-%EC%82%AC%EC%9A%A9%ED%95%98%EA%B8%B0*

*3. 위키피디아*

*4. https://numpy.org/doc/stable/index.html*

*5. https://pandas.pydata.org/docs/index.html*

## 0. 모듈 import

In [2]:
# Requires Python >= 3.5
import sys
assert sys.version_info >= (3, 5)

# Common module imports
import numpy as np
import pandas as pd

## 1. 라이브러리 활용

### 1.1. numpy

- **NumPy 특징**
    
    - 과학 계산을 위한 기본적인 라이브러리 패키지
    - 행렬, 벡터 계산을 고속화 하기 위한 함수들을 포함
    - array 함수를 이용하여 ndarray 객체로 1차원 배열(벡터), 2차원 배열(행렬), 3차원 이상의 배열(텐서) 생성 및 계산 가능

#### 1.1.1 numpy 생성

In [3]:
# Create 1-D arrays from plain Python lists.
first_values = [1, 2, 3, 4, 5]
second_values = [6, 7, 8, 9, 10]
arrA = np.array(first_values)
arrB = np.array(second_values)

In [4]:
# Create arrays with the constructor functions NumPy provides.
arrC = np.zeros((2, 2))     # 2x2 array filled with 0.
arrD = np.ones((3, 2))      # 3x2 array filled with 1.
arrE = np.full((2, 3), 5)   # 2x3 array filled with the value 5
arrF = np.eye(5)            # 5x5 identity matrix
arrG = np.arange(5)         # integers 0..4

# Print each array under the name of the function that produced it.
for label, created in (
    ('np.zeros', arrC),
    ('np.ones', arrD),
    ('np.full', arrE),
    ('np.eye', arrF),
    ('np.arange', arrG),
):
    print(label)
    print(created)

np.zeros
[[0. 0.]
 [0. 0.]]
np.ones
[[1. 1.]
 [1. 1.]
 [1. 1.]]
np.full
[[5 5 5]
 [5 5 5]]
np.eye
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
np.arange
[0 1 2 3 4]


#### 1.1.2 numpy 슬라이싱

In [5]:
# Build a 3x3 array from a nested list.
lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
arr = np.array(lst)

# Slice: rows 0-1 and columns 0-1 (the stop index is exclusive).
a = arr[:2, :2]
print(a)

[[1 2]
 [4 5]]


In [6]:
# Slice from index 1 to the end along both axes.
a = arr[1:, 1:]
print(a)

[[5 6]
 [8 9]]


#### 1.1.3 numpy 인덱싱

In [7]:
# Build a 3x4 array from a nested list.
lst = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
a = np.array(lst)

# Integer indexing: row and column index lists are paired element-wise,
# so this picks a[0, 1] and a[2, 3].
row_idx = [0, 2]
col_idx = [1, 3]
s = a[row_idx, col_idx]

print(s)

[ 2 12]


In [8]:
# Build a 3x3 array from a nested list.
lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
a = np.array(lst)

# Boolean indexing: a mask with the same shape as `a`; the elements at
# True positions are returned as a flat 1-D array.
bool_indexing_array = np.array([
    [False, True, False],
    [True, False, True],
    [False, True, False],
])

n = a[bool_indexing_array]
print(n)

[2 4 6 8]


#### 1.1.4 numpy 연산

In [9]:
arrA = np.array([1, 2, 3, 4, 5])
arrB = np.array([6, 7, 8, 9, 10])

# Element-wise arithmetic on two arrays of the same shape.
# Addition
print(np.add(arrA, arrB))
# Subtraction
print(np.subtract(arrA, arrB))
# Multiplication
print(np.multiply(arrA, arrB))
# Division (true division, so the result is floating point)
print(np.divide(arrA, arrB))

[ 7  9 11 13 15]
[-5 -5 -5 -5 -5]
[ 6 14 24 36 50]
[0.16666667 0.28571429 0.375      0.44444444 0.5       ]


In [10]:
# Matrix operation: product of two 2x2 matrices.
lst1 = [[1, 2], [3, 4]]
lst2 = [[5, 6], [7, 8]]

a = np.array(lst1)
b = np.array(lst2)

# Matrix product of a and b (not the element-wise product).
c = a.dot(b)
print(c)

[[19 22]
 [43 50]]


In [11]:
# Reductions over the elements of an array.
a = np.array([[1, 2], [3, 4]])

# Sum of every element.
s = a.sum()
print(s)   # 10

# axis=0 adds down each column (one result per column);
# axis=1 adds across each row (one result per row).
s = a.sum(axis=0)
print(s)   # [4 6]

s = a.sum(axis=1)
print(s)   # [3 7]

# Product of every element.
s = a.prod()
print(s)   # 24

10
[4 6]
[3 7]
24


### 1.2. pandas

Pandas는 파이썬에서 사용하는 데이터분석 라이브러리로, 행과 열로 이루어진 데이터 객체를 만들어 다룰 수 있게 되며 보다 안정적으로 대용량의 데이터들을 처리하는데 매우 편리한 도구임.

#### 1.2.1 pandas DataFrame 생성

In [12]:
# Create a DataFrame directly from in-memory data: one row, five named columns.
column_names = ['1c', '2c', '3c', '4c', '5c']
single_row = [1, 2, 3, 4, 5]
df = pd.DataFrame(data=[single_row], columns=column_names)
df

Unnamed: 0,1c,2c,3c,4c,5c
0,1,2,3,4,5


In [13]:
# Create a DataFrame by loading a CSV file with read_csv().
# NOTE(review): the output recorded for this cell is a FileNotFoundError —
# 'titanic.csv' must exist in the current working directory for this to run.
df = pd.read_csv('./titanic.csv')
df

FileNotFoundError: [Errno 2] No such file or directory: './titanic.csv'

#### 1.2.2 pandas DataFrame 인덱싱

In [13]:
# Column indexing: pass a list of column names to select those columns.
df[['Survived', 'Pclass']]

Unnamed: 0,Survived,Pclass
0,0,3
1,1,1
2,1,3
3,1,1
4,0,3
...,...,...
886,0,2
887,1,1
888,0,3
889,1,1


In [14]:
# Row slicing: take rows from the start up to (not including) position 500.
df[:500]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C
496,497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54.0,1,0,36947,78.2667,D20,C
497,498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1000,,S
498,499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.5500,C22 C26,S


In [15]:
# Conditional indexing: keep only the rows whose 'Sex' column equals 'male'.
df.loc[df['Sex']=='male']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### 1.2.3 pandas DataFrame 요약

In [16]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 2. 데이터통계량

In [18]:
# Select the four numeric columns used for the statistics in this section,
# and keep the same data as a NumPy array for the NumPy-based calls.
df_num = df[['Age', 'SibSp', 'Parch', 'Fare']]
np_num = df_num.values

In [19]:
# Display the numeric-column DataFrame.
df_num

Unnamed: 0,Age,SibSp,Parch,Fare
0,22.0,1,0,7.2500
1,38.0,1,0,71.2833
2,26.0,0,0,7.9250
3,35.0,1,0,53.1000
4,35.0,0,0,8.0500
...,...,...,...,...
886,27.0,0,0,13.0000
887,19.0,0,0,30.0000
888,,1,2,23.4500
889,26.0,0,0,30.0000


In [20]:
# Display the same data as a NumPy array (missing values appear as nan).
np_num

array([[22.    ,  1.    ,  0.    ,  7.25  ],
       [38.    ,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [    nan,  1.    ,  2.    , 23.45  ],
       [26.    ,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  0.    ,  7.75  ]])

### 2.1. 데이터의 중심적 경향을 나타내는 통계량

#### 2.1.1. 평균(Mean)

- 평균(Mean)은 관측치 절대 크기의 중앙을 의미하며, 산술평균(Arithmetic average)과 동일한 개념임.
- 중앙의 위치를 나타내는 대표적인 방법론임.
$$\bar{x}=\frac{1}{n}\sum_{i=1}^nx_i$$

In [21]:
# Column-wise mean, computed with NumPy and then with pandas.
# NOTE(review): in the recorded output the NumPy result for the first column
# (Age) is nan while pandas reports 29.699118 — ndarray.mean propagates NaN,
# whereas DataFrame.mean skips missing values by default. np.nanmean is the
# NaN-ignoring NumPy equivalent.
print(np_num.mean(axis=0))
print(df_num.mean())

[        nan  0.52300786  0.38159371 32.20420797]
Age      29.699118
SibSp     0.523008
Parch     0.381594
Fare     32.204208
dtype: float64


#### 2.1.2. 중앙값(Median)

- 중앙값(Median)은 자료 순서의 중심을 의미함.
$$M=x_{\{(n+1)/2\}}\,\,\,\,if\,\,n\,\,is\,\,odd$$
$$M=\frac{x_{\{n/2\}}+x_{\{(n+2)/2\}}}{2}\,\,\,\,if\,\,n\,\,is\,\,even$$

In [22]:
# Column-wise median, computed with NumPy and then with pandas.
# NOTE(review): np.median returns nan for the Age column because it contains
# missing values; DataFrame.median skips them by default. np.nanmedian is the
# NaN-ignoring NumPy equivalent.
print(np.median(np_num, axis=0))
print(df_num.median())

[    nan  0.      0.     14.4542]
Age      28.0000
SibSp     0.0000
Parch     0.0000
Fare     14.4542
dtype: float64


### 2.2. 데이터의 산포하는 정도를 나타내는 통계량

#### 2.2.1. 분산(Variance)

- 분산(Variance)은 각 측정치들이 평균으로부터 떨어진 정도(차이)를 제곱한 값들을 합한 후 자료의 수로 나눈 값을 의미함.


- 모분산
$$\sigma^2=var(X)=E[(X-E(X))^2]=E[X^2]-E[X]^2$$
- 표본분산
$$s^2=\sum_{i=1}^n\frac{(x_i-\bar{x})^2}{n-1}$$

In [23]:
# Column-wise variance, computed with NumPy and then with pandas.
# NOTE(review): the two results differ even for NaN-free columns (e.g. SibSp:
# 1.2147 vs 1.2160). ndarray.var defaults to ddof=0 (population variance,
# divide by n); DataFrame.var defaults to ddof=1 (sample variance, divide by
# n-1). The NumPy value for Age is nan because NaN propagates.
print(np_num.var(axis=0))
print(df_num.var())

[           nan 1.21467827e+00 6.48999031e-01 2.46666531e+03]
Age       211.019125
SibSp       1.216043
Parch       0.649728
Fare     2469.436846
dtype: float64


#### 2.2.2. 표준편차(Standard deviation)

- 표준편차(Standard deviation)는 분산의 양의 제곱근임.
- 자료의 산포도를 나타내는 대표적인 방법론임.


- 모표준편차
$$\sigma=std(X)=\sqrt{E[(X-E(X))^2]}=\sqrt{E[X^2]-E[X]^2}$$
- 표본표준편차
$$s=\sqrt{s^2}=\sqrt{\sum_{i=1}^n\frac{(x_i-\bar{x})^2}{n-1}}$$

In [24]:
# Column-wise standard deviation, computed with NumPy and then with pandas.
# NOTE(review): as with the variance, ndarray.std defaults to ddof=0
# (population) while DataFrame.std defaults to ddof=1 (sample), so the values
# differ slightly; the NumPy value for Age is nan because NaN propagates.
print(np_num.std(axis=0))
print(df_num.std())

[        nan  1.10212444  0.80560476 49.66553444]
Age      14.526497
SibSp     1.102743
Parch     0.806057
Fare     49.693429
dtype: float64


### 2.3. 데이터의 관계를 나타내는 지표

#### 2.3.1. 공분산(Covariance)

- 공분산(Covariance)은 2개의 확률변수의 선형관계를 나타내는 값임.
- 모공분산
$$Cov(X,Y)=E[(X-E(X))(Y-E(Y))]=E(X\cdot Y)-E(X)E(Y)$$
- 표본공분산
$$Cov(X,Y)=\sum_{i=1}^n\frac{(x_i-\bar{x})(y_i-\bar{y})}{n-1}$$

In [25]:
# Covariance matrix, computed with NumPy and then with pandas.
# np.cov treats each row as one variable by default, so the
# (observations x variables) array is transposed first.
# NOTE(review): np.cov normalizes by n-1 by default, matching pandas for the
# NaN-free columns. Every entry involving Age is nan in the NumPy output,
# while DataFrame.cov excludes missing values.
print(np.cov(np_num.T))
print(df_num.cov())

[[           nan            nan            nan            nan]
 [           nan 1.21604308e+00 3.68738572e-01 8.74873381e+00]
 [           nan 3.68738572e-01 6.49728244e-01 8.66105167e+00]
 [           nan 8.74873381e+00 8.66105167e+00 2.46943685e+03]]
              Age     SibSp     Parch         Fare
Age    211.019125 -4.163334 -2.344191    73.849030
SibSp   -4.163334  1.216043  0.368739     8.748734
Parch   -2.344191  0.368739  0.649728     8.661052
Fare    73.849030  8.748734  8.661052  2469.436846


#### 2.3.2. 상관계수(Correlation Coefficient)

- 상관계수(Correlation Coefficient)는 2개의 확률변수의 선형관계를 나타내는 값임.
- 1에 가까울수록 양의 상관관계이며, -1에 가까울수록 음의 상관관계, 0에 가까우면 선형 상관관계가 없음.
- 모상관계수
$$\rho_{X,Y}=\frac{Cov(X,Y)}{\sigma_X\sigma_Y}$$
- 표본상관계수
$$r_{X,Y}=\frac{\sum_{i=1}^n(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^n(x_i-\bar{x})^2}\sqrt{\sum_{i=1}^n(y_i-\bar{y})^2}}$$

In [26]:
# Correlation-coefficient matrix, computed with NumPy and then with pandas.
# np.corrcoef treats each row as one variable by default, so the
# (observations x variables) array is transposed first.
# NOTE(review): every entry involving Age is nan in the NumPy output because
# of its missing values; DataFrame.corr excludes missing values.
print(np.corrcoef(np_num.T))
print(df_num.corr())

[[       nan        nan        nan        nan]
 [       nan 1.         0.4148377  0.15965104]
 [       nan 0.4148377  1.         0.21622494]
 [       nan 0.15965104 0.21622494 1.        ]]
            Age     SibSp     Parch      Fare
Age    1.000000 -0.308247 -0.189119  0.096067
SibSp -0.308247  1.000000  0.414838  0.159651
Parch -0.189119  0.414838  1.000000  0.216225
Fare   0.096067  0.159651  0.216225  1.000000
