# 2章 1次元データの整理

In [1]:
import numpy as np
import pandas as pd

# Show floats with 3 decimal places in pandas output. The fully qualified key
# is used because the bare "precision" pattern is ambiguous on recent pandas
# versions (it also matches "styler.format.precision") and raises OptionError.
pd.set_option("display.precision", 3)

In [2]:
# Load the score table; the student-number column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer="../python_stat_sample/data/ch2_scores_em.csv",
    index_col="生徒番号",
)
df.head()

Unnamed: 0_level_0,英語,数学
生徒番号,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76


In [3]:
# Take the first 10 English scores as a NumPy array.
scores = np.array(df["英語"].iloc[:10])

scores

array([42, 69, 56, 41, 57, 48, 65, 49, 65, 58])

In [4]:
# Label the ten students "A" through "J" and use those labels as the row index.
index = [chr(code) for code in range(ord("A"), ord("A") + 10)]
scores_df = pd.DataFrame(
    data={"点数": scores},
    index=pd.Index(index, name="生徒"),
)

scores_df

Unnamed: 0_level_0,点数
生徒,Unnamed: 1_level_1
A,42
B,69
C,56
D,41
E,57
F,48
G,65
H,49
I,65
J,58


## 平均値

\begin{align*}
    \bar{x} = \frac{1}{N} \sum_{i=1}^{N} x_i
\end{align*}
where

- $\bar{x}$: the average (mean)
- $N$: the number of data points
- $x_i$: the $i$-th data point in $x$

In [6]:
# Mean straight from the definition: total of all scores divided by their count.
sum(scores)/len(scores)

55.0

In [7]:
# Using NumPy
np.mean(scores)

55.0

In [8]:
# Using pandas (DataFrame.mean() returns one mean per column)
scores_df.mean()

点数    55.0
dtype: float64

## 中央値

In [9]:
# Sort the data in ascending order so the median can be derived from it
scores_sorted = np.sort(scores)
scores_sorted

array([41, 42, 48, 49, 56, 57, 58, 65, 65, 69])

In [10]:
# Compute the median by hand from the already-sorted scores.
n = len(scores_sorted)
if n % 2 == 0:
    # Even count: there is no single middle element, so average the two
    # central values (0-based positions n//2 - 1 and n//2).
    median = (scores_sorted[n//2 - 1] + scores_sorted[n//2]) / 2
else:
    # Odd count: the middle element sits at 0-based position n//2.
    # (n//2 + 1 would be one past the middle, and out of range when n == 1.)
    median = scores_sorted[n//2]

median

56.5

In [11]:
# Using NumPy
np.median(scores)

56.5

In [12]:
# Using pandas (DataFrame.median() returns one median per column)
scores_df.median()

点数    56.5
dtype: float64

## 最頻値