# 2章 1次元データの整理

In [1]:
import numpy as np
import pandas as pd

# Show floats with three decimal places in pandas output.
# The fully-qualified key is required: the bare "precision" shorthand was
# deprecated in pandas 1.3 and removed in 1.4, where it raises OptionError.
pd.set_option("display.precision", 3)

In [2]:
# Load the English/math score table, using the student-number column as the index.
csv_path = "../python_stat_sample/data/ch2_scores_em.csv"
df = pd.read_csv(csv_path, index_col="生徒番号")
df.head()

Unnamed: 0_level_0,英語,数学
生徒番号,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76


In [3]:
# Take the first ten English scores as a NumPy array.
scores = np.array(df["英語"].iloc[:10])

scores

array([42, 69, 56, 41, 57, 48, 65, 49, 65, 58])

In [4]:
# Label the ten students "A" through "J" and wrap the scores in a DataFrame.
index = list(map(chr, range(ord("A"), ord("A") + 10)))
scores_df = pd.DataFrame(
    data={"点数": scores},
    index=pd.Index(index, name="生徒"),
)

scores_df

Unnamed: 0_level_0,点数
生徒,Unnamed: 1_level_1
A,42
B,69
C,56
D,41
E,57
F,48
G,65
H,49
I,65
J,58


## 平均値

\begin{align*}
    \bar{x} = \frac{1}{N} \sum_{i=1}^{N} x_i
\end{align*}
    
- $\bar{x}: \text{average}$
- $N: \text{length of data}$
- $x_i: \text{the } i\text{-th value in }x$

In [5]:
# Arithmetic mean by hand: the total of the scores divided by how many there are.
total = sum(scores)
total / len(scores)

55.0

In [6]:
# The same mean, computed with NumPy.
scores.mean()

55.0

In [7]:
# The same mean, computed with pandas; DataFrame.mean() yields one value per column.
scores_df.mean()

点数    55.0
dtype: float64

## 中央値

In [8]:
# The median is read off ordered data, so work on an ascending-sorted copy.
scores_sorted = scores.copy()
scores_sorted.sort()
scores_sorted

array([41, 42, 48, 49, 56, 57, 58, 65, 65, 69])

In [9]:
# Median of the sorted scores, computed by hand.
# - Even count: average the two middle values (0-based indices n//2 - 1 and n//2).
# - Odd count: take the single middle value, which sits at 0-based index n//2.
#   (n//2 + 1 would be one position to the right of the middle, and out of
#   bounds when n == 1.)
n = len(scores_sorted)
middle = n // 2
if n % 2 == 0:
    median = (scores_sorted[middle - 1] + scores_sorted[middle]) / 2
else:
    median = scores_sorted[middle]

median

56.5

In [10]:
# The same median, computed with NumPy's built-in.
np.median(scores)

56.5

In [11]:
# The same median, computed with pandas; DataFrame.median() yields one value per column.
scores_df.median()

点数    56.5
dtype: float64

## 最頻値

In [12]:
# The mode is the most frequent value; here 1 appears three times.
tmp_list = [1] * 3 + [2] * 2 + [3]
pd.Series(tmp_list).mode()

0    1
dtype: int64

In [13]:
# When every value is equally frequent, mode() returns all of them.
tmp_list = list(range(1, 6))
pd.Series(tmp_list).mode()

0    1
1    2
2    3
3    4
4    5
dtype: int64

## 偏差

In [14]:
# Deviation: how far each score lies from the mean of all scores.
mean = scores.mean()
deviation = scores - mean
deviation

array([-13.,  14.,   1., -14.,   2.,  -7.,  10.,  -6.,  10.,   3.])

In [15]:
# Build the summary table as a new frame (scores_df itself is left untouched)
# with the deviations attached as an extra column.
summary_df = scores_df.assign(**{"偏差": deviation})
summary_df

Unnamed: 0_level_0,点数,偏差
生徒,Unnamed: 1_level_1,Unnamed: 2_level_1
A,42,-13.0
B,69,14.0
C,56,1.0
D,41,-14.0
E,57,2.0
F,48,-7.0
G,65,10.0
H,49,-6.0
I,65,10.0
J,58,3.0


In [16]:
# Another set of ten scores; compute each score's deviation from their mean.
scores_ = [50, 60, 58, 54, 51, 56, 57, 53, 52, 59]
mean_ = np.mean(scores_)
# Convert the list explicitly so the subtraction is element-wise.
deviation_ = np.asarray(scores_) - mean_
deviation_

array([-5.,  5.,  3., -1., -4.,  1.,  2., -2., -3.,  4.])

In [17]:
# Average of the deviations in deviation_.
mean_deviation = deviation_.mean()

mean_deviation

0.0

### 偏差の平均が0になる理由

\begin{align*}
\frac{1}{n} \sum_{i=1}^{n}(x_i - \bar{x}) &= \frac{1}{n} \sum_{i=1}^{n}x_i - \frac{1}{n} \sum_{i=1}^{n} \bar{x}\\
&= \bar{x} - \frac{1}{n} \cdot n\bar{x}\\
&= \bar{x} - \bar{x}\\
&= 0
\end{align*}

## 分散

In [18]:
# Variance computed three ways.
# ddof=0 divides the sum of squared deviations by n; ddof=1 divides by n - 1
# (the unbiased estimator). NumPy and pandas have different defaults, so the
# defaults are spelled out here.
var = np.mean(np.square(deviation))
var_np = np.var(scores, ddof=0)  # NumPy's default is ddof=0 (divide by n)
var_pd = scores_df.var(ddof=1)  # pandas' default is ddof=1 (divide by n - 1)
print(f"variance: {var}")
print(f"variance thru numpy: {var_np}")
print(f"variance thru pandas: {var_pd}")

variance: 86.0
variance thru numpy: 86.0
variance thru pandas: 点数    95.556
dtype: float64


In [19]:
# Add the squared deviations as a column; their mean is the variance.
summary_df["偏差二乗"] = deviation ** 2
summary_df

Unnamed: 0_level_0,点数,偏差,偏差二乗
生徒,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,42,-13.0,169.0
B,69,14.0,196.0
C,56,1.0,1.0
D,41,-14.0,196.0
E,57,2.0,4.0
F,48,-7.0,49.0
G,65,10.0,100.0
H,49,-6.0,36.0
I,65,10.0,100.0
J,58,3.0,9.0


### 標本分散
\begin{align*}
S^2 = \frac{1}{n} \sum_{i=1}^{n}(x_i - \bar{x})^2 \\
(n > 0)
\end{align*}

### 不偏分散
\begin{align*}
\hat{\sigma}^2 = \frac{1}{n-1} \sum_{i=1}^{n}(x_i - \bar{x})^2 \\
(n > 1)
\end{align*}

よって標準偏差は以下のようになる
\begin{align*}
S = \sqrt{S^2} = \sqrt{\frac{1}{n} \sum_{i=1}^{n}(x_i - \bar{x})^2} \\
(n > 0)
\end{align*}

In [22]:
# Standard deviation (ddof=0, i.e. dividing by n), computed two ways.
# Only the last expression of a cell is displayed, so both results are bound
# to names and checked against each other rather than silently discarding
# the first one.
std_from_var = np.sqrt(np.var(scores, ddof=0))  # square root of the variance
std_direct = np.std(scores, ddof=0)  # NumPy's built-in
assert np.isclose(std_from_var, std_direct)
std_direct

9.273618495495704

## 範囲
\begin{align*}
\mathit{Rg} = x_{\max} - x_{\min}
\end{align*}

In [23]:
# Range: largest score minus smallest score.
scores.max() - scores.min()

28

ただし, これだと極端に大きい値または小さい値が一つでもあると範囲が極端になってしまう。
そのため, データを小さい順に並べたときに下から25%の位置にある値 ($Q1$) と下から75%の位置にある値 ($Q3$) の差, すなわち中央50%のデータが収まる範囲を用いる場合がある。
これを<b>四分位範囲</b> (interquartile range) という。

\begin{align*}
IQR = Q3 - Q1
\end{align*}

In [24]:
# Interquartile range: 75th percentile minus 25th percentile.
scores_Q1, scores_Q3 = np.percentile(scores, [25, 75])
scores_IQR = scores_Q3 - scores_Q1
scores_IQR

15.0

In [25]:
# pandas: describe() reports count, mean, std, min, the quartiles and max in one call.
pd.Series(scores).describe()

count    10.000
mean     55.000
std       9.775
min      41.000
25%      48.250
50%      56.500
75%      63.250
max      69.000
dtype: float64