In [248]:
import numpy as np
import pandas as pd

# NumPy display settings: three decimals, no scientific notation.
np.set_printoptions(suppress=True, precision=3)

# pandas display settings.
# A custom float_format is installed after display.precision; the Series
# outputs recorded in this notebook show five decimals, i.e. the formatter
# (not the precision of 3) determines how floats are rendered.
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', '{:.5f}'.format)

# Load the score table, then give the index and the two subject columns
# short ASCII names in a single method chain (no in-place mutation).
df = (
    pd.read_csv('../data/ch2_scores_em.csv', index_col='生徒番号')
    .rename(columns={'英語': 'en', '数学': 'ma'})
    .rename_axis('sid')
)
print(df.head())

# Independent NumPy copy of the English column, used by later cells.
english_scores = np.array(df['en'])

     en  ma
sid        
1    42  65
2    69  80
3    56  63
4    41  63
5    57  76


In [249]:
# Render the loaded frame (index 'sid', columns 'en' and 'ma').
df

Unnamed: 0_level_0,en,ma
sid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76
6,48,60
7,65,81
8,49,66
9,65,78
10,58,82


In [213]:
# Extract the 'en' (English) column as a NumPy array for the calculations below.
scores = np.array(df['en'])

In [214]:
# Inspect the raw score array.
scores

array([42, 69, 56, 41, 57, 48, 65, 49, 65, 58, 70, 47, 51, 64, 62, 70, 71,
       68, 73, 37, 65, 65, 61, 52, 57, 57, 75, 61, 47, 54, 66, 54, 54, 42,
       37, 79, 56, 62, 62, 55, 63, 57, 57, 67, 55, 45, 66, 55, 64, 66])

In [215]:
# Total of all scores, using Python's built-in sum.
sum(scores)

2919

In [216]:
# Number of scores (sample size).
len(scores)

50

In [217]:
# Arithmetic mean by definition: total divided by count.
sum(scores) / len(scores)

58.380

In [218]:
# Same mean via NumPy; it should match the manual total / count calculation.
np.mean(scores)

58.380

In [219]:
# Ascending copy of the scores, needed for the manual median below.
# `scores` itself keeps its original order (later cells rely on that).
sorted_scores = np.sort(scores)
sorted_scores

array([37, 37, 41, 42, 42, 45, 47, 47, 48, 49, 51, 52, 54, 54, 54, 55, 55,
       55, 56, 56, 57, 57, 57, 57, 57, 58, 61, 61, 62, 62, 62, 63, 64, 64,
       65, 65, 65, 65, 66, 66, 66, 67, 68, 69, 70, 70, 71, 73, 75, 79])

In [220]:
# Median by hand from the sorted scores.
n = len(sorted_scores)
half, is_odd = divmod(n, 2)
if is_odd:
    # Odd count: the single middle element sits at 0-based index n // 2.
    median = sorted_scores[half]
else:
    # Even count: average the two elements straddling the middle.
    m0, m1 = sorted_scores[half - 1], sorted_scores[half]
    median = (m0 + m1) / 2

median

57.500

In [221]:
# NumPy's median, for comparison with the manual result.
np.median(scores)

57.500

In [222]:
# Mode (most frequent score); the result is a Series, so ties would show as several rows.
pd.Series(scores).mode()

0    57
dtype: int64

In [223]:
# Five most frequent scores with their counts, as a sanity check on the mode.
# dropna=False asks for missing values to be counted as well.
pd.Series(scores).value_counts(dropna=False).head(5)

57    5
65    4
54    3
62    3
55    3
Name: count, dtype: int64

# 偏差

In [224]:
# Deviation: how far each score lies from the overall mean.
mean = scores.mean()
deviation = np.subtract(scores, mean)
deviation

array([-16.38,  10.62,  -2.38, -17.38,  -1.38, -10.38,   6.62,  -9.38,
         6.62,  -0.38,  11.62, -11.38,  -7.38,   5.62,   3.62,  11.62,
        12.62,   9.62,  14.62, -21.38,   6.62,   6.62,   2.62,  -6.38,
        -1.38,  -1.38,  16.62,   2.62, -11.38,  -4.38,   7.62,  -4.38,
        -4.38, -16.38, -21.38,  20.62,  -2.38,   3.62,   3.62,  -3.38,
         4.62,  -1.38,  -1.38,   8.62,  -3.38, -13.38,   7.62,  -3.38,
         5.62,   7.62])

In [225]:
# Deviations from the mean average to zero (up to floating-point error).
np.mean(deviation)

-0.000

In [226]:
# Start a summary table with the raw scores as its first column.
# NOTE(review): this rebinds `df`, replacing the frame read from the CSV
# (columns 'en'/'ma'); any later `df['en']` lookup needs the load cell re-run.
df = pd.DataFrame({'scores': scores})

In [227]:
# Attach each score's deviation from the mean and preview the first ten rows.
df['deviation'] = deviation
df.head(10)

Unnamed: 0,scores,deviation
0,42,-16.38
1,69,10.62
2,56,-2.38
3,41,-17.38
4,57,-1.38
5,48,-10.38
6,65,6.62
7,49,-9.38
8,65,6.62
9,58,-0.38


In [228]:
# Column means: the deviation column averages to zero.
df.mean()

scores      58.38000
deviation   -0.00000
dtype: float64

# Variance　分散

In [229]:
# Population variance by definition: the mean of the squared deviations.
np.mean(deviation ** 2)

94.116

In [230]:
# Sample (unbiased) variance: ddof=1 divides by n - 1 instead of n,
# so the value is slightly larger than the population variance.
np.var(scores, ddof=1)

96.036

In [231]:
# Squared deviation per row, plus the variance repeated on every row so it can
# be compared with the column mean of the squared deviations.
# np.var is called without ddof here, i.e. with NumPy's default.
# NOTE(review): the column name 'squared_diviation' is misspelled
# ('deviation'); it is left unchanged because renaming alters the frame's columns.
df['squared_diviation'] = np.square(deviation)
df['variance'] = np.var(scores)
df.head()

Unnamed: 0,scores,deviation,squared_diviation,variance
0,42,-16.38,268.3044,94.1156
1,69,10.62,112.7844,94.1156
2,56,-2.38,5.6644,94.1156
3,41,-17.38,302.0644,94.1156
4,57,-1.38,1.9044,94.1156


In [232]:
# Column means: the mean of the squared deviations equals the variance column.
df.mean()

scores              58.38000
deviation           -0.00000
squared_diviation   94.11560
variance            94.11560
dtype: float64

In [233]:
# Population standard deviation: square root of the population variance (ddof=0).
standard_deviation = np.sqrt(np.var(scores, ddof=0))
standard_deviation

9.701

In [234]:
# Store the standard deviation (one constant value) as a column and preview.
df['standard_deviation'] = standard_deviation
df.head()

Unnamed: 0,scores,deviation,squared_diviation,variance,standard_deviation
0,42,-16.38,268.3044,94.1156,9.70132
1,69,10.62,112.7844,94.1156,9.70132
2,56,-2.38,5.6644,94.1156,9.70132
3,41,-17.38,302.0644,94.1156,9.70132
4,57,-1.38,1.9044,94.1156,9.70132


In [235]:
# np.std with ddof=0 for comparison with the sqrt-of-variance calculation.
np.std(scores, ddof=0)

9.701

In [236]:
# Range: largest score minus smallest score.
np.max(scores) - np.min(scores)

42

In [237]:
# Interquartile range: distance between the first and third quartiles,
# both obtained from a single percentile call.
scores_Q1, scores_Q3 = np.percentile(scores, [25, 75])
scores_IQR = scores_Q3 - scores_Q1
scores_IQR



11.000

In [238]:
# The 50th percentile is the median.
np.percentile(scores, 50)

57.500

In [239]:
# Summary statistics in one call.
# NOTE: describe() reports the sample standard deviation (ddof=1), so its
# `std` is slightly larger than the np.std(scores, ddof=0) value.
pd.Series(scores).describe()

count   50.00000
mean    58.38000
std      9.79981
min     37.00000
25%     54.00000
50%     57.50000
75%     65.00000
max     79.00000
dtype: float64

In [240]:
# Standardize: subtract the mean, then divide by the population standard
# deviation (ndarray.std defaults to ddof=0, like np.std).
score_mean = scores.mean()
score_std = scores.std()
z = (scores - score_mean) / score_std
z

array([-1.688,  1.095, -0.245, -1.792, -0.142, -1.07 ,  0.682, -0.967,
        0.682, -0.039,  1.198, -1.173, -0.761,  0.579,  0.373,  1.198,
        1.301,  0.992,  1.507, -2.204,  0.682,  0.682,  0.27 , -0.658,
       -0.142, -0.142,  1.713,  0.27 , -1.173, -0.451,  0.785, -0.451,
       -0.451, -1.688, -2.204,  2.125, -0.245,  0.373,  0.373, -0.348,
        0.476, -0.142, -0.142,  0.889, -0.348, -1.379,  0.785, -0.348,
        0.579,  0.785])

In [241]:
# Add the standardized scores to the summary table.
df['z_score'] = z

In [242]:
# Preview the table with the new z_score column.
df.head()

Unnamed: 0,scores,deviation,squared_diviation,variance,standard_deviation,z_score
0,42,-16.38,268.3044,94.1156,9.70132,-1.68843
1,69,10.62,112.7844,94.1156,9.70132,1.0947
2,56,-2.38,5.6644,94.1156,9.70132,-0.24533
3,41,-17.38,302.0644,94.1156,9.70132,-1.79151
4,57,-1.38,1.9044,94.1156,9.70132,-0.14225


In [243]:
# Check: standardized scores have mean 0 and standard deviation 1.
np.mean(z), np.std(z, ddof=0)

(-0.000, 1.000)

In [244]:
# Deviation score: the standardized score rescaled to mean 50 and
# standard deviation 10.
score_mean = scores.mean()
score_std = scores.std()
deviation_scores = 50 + 10 * (scores - score_mean) / score_std
deviation_scores

array([33.116, 60.947, 47.547, 32.085, 48.578, 39.3  , 56.824, 40.331,
       56.824, 49.608, 61.978, 38.27 , 42.393, 55.793, 53.731, 61.978,
       63.009, 59.916, 65.07 , 27.962, 56.824, 56.824, 52.701, 43.424,
       48.578, 48.578, 67.132, 52.701, 38.27 , 45.485, 57.855, 45.485,
       45.485, 33.116, 27.962, 71.255, 47.547, 53.731, 53.731, 46.516,
       54.762, 48.578, 48.578, 58.885, 46.516, 36.208, 57.855, 46.516,
       55.793, 57.855])

In [245]:
# Add the deviation scores (mean 50, standard deviation 10 scale) to the summary table.
df['deviation_score'] = deviation_scores

In [246]:
# Preview the table with the new deviation_score column.
df.head()

Unnamed: 0,scores,deviation,squared_diviation,variance,standard_deviation,z_score,deviation_score
0,42,-16.38,268.3044,94.1156,9.70132,-1.68843,33.1157
1,69,10.62,112.7844,94.1156,9.70132,1.0947,60.94696
2,56,-2.38,5.6644,94.1156,9.70132,-0.24533,47.54673
3,41,-17.38,302.0644,94.1156,9.70132,-1.79151,32.08491
4,57,-1.38,1.9044,94.1156,9.70132,-0.14225,48.57751


# 度数分布表
- 階級 (class)：10点から20点などの区間のこと
- 度数 (frequency)：その階級に含まれる値の数のこと
- 階級幅 (class width)：各階級の区間の幅のこと（ここでは10点）。ヒストグラムの bin 1つ分の幅に相当する
  

In [255]:
# Summary statistics of the English scores.
# `english_scores` is the array built in the data-loading cell. It is
# deliberately not rebuilt from df['en'] here: an earlier cell rebinds `df`
# to the summary table (first column 'scores'), so df['en'] raises KeyError
# when the notebook is run top to bottom.
print(pd.Series(english_scores).describe())

count   50.00000
mean    58.38000
std      9.79981
min     37.00000
25%     54.00000
50%     57.50000
75%     65.00000
max     79.00000
dtype: float64


In [256]:
# Count the scores in ten equal-width classes spanning 0-100.
# np.histogram also returns the bin edges; they are discarded here.
freq, _ = np.histogram(english_scores, bins=10, range=(0, 100))
freq

array([ 0,  0,  0,  2,  8, 16, 18,  6,  0,  0])

In [257]:
# Frequency distribution table: one row per class, labelled '0~10' ... '90~100'.
# The labels use a step of 10 over 0-100 so that they line up with the ten
# histogram bins counted in `freq`.
freq_class = [f'{i}~{i+10}' for i in range(0, 100, 10)]
freq_dist_df = pd.DataFrame(
    {'frequency': freq},
    index=pd.Index(freq_class, name='class'),
)
freq_dist_df

Unnamed: 0_level_0,frequency
class,Unnamed: 1_level_1
0~10,0
10~20,0
20~30,0
30~40,2
40~50,8
50~60,16
60~70,18
70~80,6
80~90,0
90~100,0
