
# Measures of Central Tendency
Measures of central tendency menunjukkan nilai tengah atau pusat dari kumpulan data. Ada beberapa definisi tentang apa yang dianggap sebagai pusat kumpulan data. Dalam sesi ini, kita akan mempelajari cara mengidentifikasi dan menghitung measures of central tendency berupa:

* Mean
* Weighted mean
* Geometric mean
* Harmonic mean
* Median
* Mode

# Mean


In [None]:
# Imports for the notebook: this first code cell already needs math.nan, and
# the cells below call statistics, NumPy, pandas and scipy.stats.
import math
import statistics

import numpy as np
import pandas as pd
import scipy.stats

# Sample data, without and with a missing value (nan).
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]

# NumPy and pandas versions of the same data. They are created here because
# the cells that follow read y, y_with_nan and z before any later cell
# assigns them.
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)


In [None]:
mean_ = sum(x) / len(x)
mean_

8.7

In [None]:
mean_ = statistics.mean(x)
print(mean_)


8.7


In [None]:
mean_ = statistics.mean(x_with_nan)
print(mean_)

nan


In [None]:
mean_ = np.mean(y)
mean_

8.7

In [None]:
mean_ = y.mean()
mean_

8.7

In [None]:
print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [None]:
np.nanmean(y_with_nan)


8.7

In [None]:
mean_ = z.mean()
mean_

8.7

Seperti yang kalian lihat, .mean() digunakan dengan cara yang sama seperti dalam kasus NumPy. Namun, .mean() dari Pandas mengabaikan nilai nan secara default:



In [None]:
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

In [None]:
z_with_nan.mean()


8.7



# Weighted Mean


In [None]:
0.2 * 2 + 0.5 * 4 + 0.3 * 8


4.8

In [None]:
# Data points and their weights.
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

# Weighted mean: sum of weight * value, divided by the total weight.
wmean = sum([w[pos] * value for pos, value in enumerate(x)])
wmean /= sum(w)
print(wmean)



6.95


In [None]:
wmean = sum(x_ * w_ for (x_, w_) in zip(x, w)) / sum(w)
print(wmean)

6.95


In [None]:
y, z, w = np.array(x), pd.Series(x), np.array(w)

wmean = np.average(y, weights=w)
print(wmean)

wmean = np.average(z, weights=w)
print(wmean)

6.95
6.95


In [None]:
(w * y).sum() / w.sum()


6.95

# Harmonic Mean



In [None]:
hmean = len(x) / sum(1 / item for item in x)
hmean

2.7613412228796843

In [None]:
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [None]:
scipy.stats.hmean(y)


2.7613412228796843

In [None]:
scipy.stats.hmean(z)


2.7613412228796843

# Geometric Mean


In [None]:
x = [8.0, 1, 2.5, 4, 28.0]

In [None]:
# Geometric mean in pure Python: multiply all items, then take the n-th root.
gmean = 1

for item in x:
    gmean *= item  # running product of the data

gmean **= 1 / len(x)  # n-th root, with n = len(x)
gmean

4.677885674856041

In [None]:
scipy.stats.gmean(y)


4.67788567485604

In [None]:
scipy.stats.gmean(z)


4.67788567485604

# Median




In [None]:
# Median in pure Python: the middle item of the sorted data for an odd count,
# the average of the two middle items for an even count.
n = len(x)
if n % 2 == 0:
    x_ord, index = sorted(x), n // 2
    median_ = 0.5 * (x_ord[index - 1] + x_ord[index])
else:
    median_ = sorted(x)[(n - 1) // 2]

median_

4

In [None]:
x

[8.0, 1, 2.5, 4, 28.0]

In [None]:
statistics.median_low(x[:-1])


2.5

In [None]:
statistics.median_high(x[:-1])


4

In [None]:
# NOTE(review): x_with_nan contains math.nan. The statistics documentation
# warns that NaN has unusual comparison semantics and causes surprising or
# undefined results in the functions that sort data, so the values printed
# here should not be read as meaningful medians -- strip NaN first.
print(statistics.median(x_with_nan))
print(statistics.median_low(x_with_nan))
print(statistics.median_high(x_with_nan))

6.0
4
8.0


In [None]:
median_ = np.median(y)
print(median_)


4.0


In [None]:
median_ = np.median(y[:-1])
print(median_)

3.25


# Mode


In [None]:
# Two samples: u has a single most frequent value, while in v the values
# 12 and 15 are tied for most frequent.
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

# Pure-Python mode: among the distinct values of u, pick the one with the
# highest count; a tie on the count goes to the larger value.
mode_ = max(set(u), key=lambda candidate: (u.count(candidate), candidate))
mode_
# We can also get the mode with statistics.mode()



2

In [None]:
mode_ = statistics.mode(u)
mode_
# We can also get the mode with scipy.stats.mode():



2

In [None]:
u, v = np.array(u), np.array(v)

mode_ = scipy.stats.mode(u)
mode_


ModeResult(mode=array([2]), count=array([2]))

In [None]:
mode_ = scipy.stats.mode(v)
mode_


ModeResult(mode=array([12]), count=array([3]))

In [None]:
print(mode_.mode)
print(mode_.count)


[12]
[3]


In [None]:
# Wrap the samples in pandas Series; w is rebound to a Series holding 2, 2
# and nan. As the printed output shows, Series.mode() returns a Series that
# lists every tied value (12 and 15 for v) and leaves nan out for w.
u, v, w = pd.Series(u), pd.Series(v), pd.Series([2, 2, math.nan])

print(u.mode())

print(v.mode())

print(w.mode())

0    2
dtype: int64
0    12
1    15
dtype: int64
0    2.0
dtype: float64




# Variance


In [None]:
# Sample variance in pure Python: squared deviations from the mean, summed
# and divided by n - 1.
n = len(x)
mean_ = sum(x) / n

var_ = sum([(value - mean_) ** 2 for value in x])
var_ /= n - 1
var_


123.19999999999999

In [None]:
var_ = statistics.variance(x)
var_

123.2

In [None]:
var_ = np.var(y, ddof=1)
var_


123.19999999999999

In [None]:
var_ = y.var(ddof=1)
var_


123.19999999999999

In [None]:
z.var(ddof=1)


123.19999999999999


# Standard Deviation


In [None]:
std_ = var_ ** 0.5
std_

11.099549540409285

In [None]:
# Although this solution works, we can also use statistics.stdev():
std_ = statistics.stdev(x)
std_


11.099549540409287

In [None]:
np.std(y, ddof=1)


11.099549540409285

In [None]:
y.std(ddof=1)


11.099549540409285

In [None]:
# pd.Series objects also have a .std() method that ignores nan by default:

z.std(ddof=1)


11.099549540409285

# Skewness



In [None]:
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)

# Sample mean, sample variance (n - 1 in the denominator) and the matching
# standard deviation.
mean_ = sum(x) / n
var_ = sum([(item - mean_) ** 2 for item in x]) / (n - 1)
std_ = var_ ** 0.5

# Sample skewness: n times the sum of cubed deviations, divided by
# (n - 1) * (n - 2) * std_**3.
skew_ = n * sum([(item - mean_) ** 3 for item in x])
skew_ /= (n - 1) * (n - 2) * std_ ** 3
skew_


1.9470432273905929

In [None]:
y, y_with_nan = np.array(x), np.array(x_with_nan)

scipy.stats.skew(y, bias=False)

1.9470432273905927

In [None]:
scipy.stats.skew(y_with_nan, bias=False)


nan

In [None]:
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

z.skew()

1.9470432273905924

In [None]:
z_with_nan.skew()


1.9470432273905924

# Percentiles


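Percentile ke-p adalah nilai yang membatasi p% data terkecil: sekitar p% elemen dalam kumpulan data bernilai kurang dari atau sama dengan nilai tersebut. Quartile pertama, kedua (median), dan ketiga masing-masing adalah percentile ke-25, ke-50, dan ke-75.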


In [None]:
import statistics
import numpy as np 

In [None]:
# Nine-point sample for the percentile examples.
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

# 50th percentile (the median) via NumPy; the standard-library counterpart
# would be statistics.quantiles(x, n=2).
np.quantile(x, q=0.5)

8.0

In [None]:
# statistics.quantiles(x, n=4, method='inclusive')

In [None]:
y = np.array(x)
np.percentile(y, 5)


-3.44

In [None]:
np.percentile(y, 95)


34.919999999999995

In [None]:
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [None]:
np.median(y)

8.0

In [None]:
y_with_nan = np.insert(y, 2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [None]:
np.nanpercentile(y_with_nan, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [None]:
np.quantile(y, 0.05)

-3.44

In [None]:
np.quantile(y, 0.95)

34.919999999999995

In [None]:
np.quantile(y, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [None]:
np.nanquantile(y_with_nan, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [None]:
z, z_with_nan = pd.Series(y), pd.Series(y_with_nan)

In [None]:
z.quantile(0.05)

-3.44

In [None]:
z.quantile(0.95)

34.919999999999995

In [None]:
z.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [None]:
z_with_nan.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

# Ranges



In [None]:
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [None]:
np.median(y)

8.0

In [None]:
y_with_nan = np.insert(y, 2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [None]:
np.nanpercentile(y_with_nan, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [None]:
np.quantile(y, 0.05)

-3.44

In [None]:
np.quantile(y, 0.95)

34.919999999999995

In [None]:
np.quantile(y, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [None]:
np.nanquantile(y_with_nan, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [None]:
z, z_with_nan = pd.Series(y), pd.Series(y_with_nan)

In [None]:
z.quantile(0.05)

-3.44

In [None]:
z.quantile(0.95)

34.919999999999995

In [None]:
z.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [None]:
z_with_nan.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [None]:
np.ptp(y)

46.0

In [None]:
np.ptp(z)

46.0

In [None]:
np.ptp(y_with_nan)

nan

In [None]:
np.ptp(z_with_nan)

nan

In [None]:
print(y)

[-5.  -1.1  0.1  2.   8.  12.8 21.  25.8 41. ]


In [None]:
np.amax(y) - np.amin(y)

46.0

In [None]:
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

46.0

In [None]:
y.max() - y.min()

46.0

In [None]:
z.max() - z.min()

46.0

In [None]:
z_with_nan.max() - z_with_nan.min()

46.0

In [None]:
quartiles = np.quantile(y, [0.25, 0.75])

In [None]:
quartiles[1] - quartiles[0]

20.9

In [None]:
quartiles = z.quantile([0.25, 0.75])

In [None]:
quartiles[0.75] - quartiles[0.25]

20.9

In [None]:
result = scipy.stats.describe(y, ddof=1, bias=False)
result

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.622222222222222, variance=228.75194444444446, skewness=0.9249043136685094, kurtosis=0.14770623629658886)

In [None]:
 result.nobs

9

In [None]:
result.minmax[0]

-5.0

In [None]:
result.minmax[1]

41.0

In [None]:
result.mean

11.622222222222222

In [None]:
result.variance

228.75194444444446

In [None]:
# Skewness field of the describe() result; the DescribeResult repr printed
# for this data shows skewness=0.9249043136685094.
result.skewness

In [None]:
result.kurtosis

0.14770623629658886

In [None]:
result = z.describe()
result

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

In [None]:
# Look up individual statistics by label. Every line is a bare expression,
# so the notebook echoes only the last one (result['75%']); wrap the lines
# in print() to see every value.
result['mean']
result['std']
result['min']
result['max']
result['25%']
result['50%']
result['75%']


21.0



# Covariance and Correlation


In [None]:
# Paired data for the covariance and correlation examples: x holds the 21
# integers from -10 to 10, y the matching observations.
x = [*range(-10, 11)]
y = [
    0, 2, 2, 2, 2, 3, 3,
    6, 7, 4, 7, 6, 6, 9,
    4, 5, 5, 10, 11, 12, 14,
]

In [None]:
x_, y_ = np.array(x), np.array(y)

In [None]:
x__, y__ = pd.Series(x_), pd.Series(y_)

In [None]:
print(x_)
print(y_)
print(x__)
print(y__)

[-10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
   8   9  10]
[ 0  2  2  2  2  3  3  6  7  4  7  6  6  9  4  5  5 10 11 12 14]
0    -10
1     -9
2     -8
3     -7
4     -6
5     -5
6     -4
7     -3
8     -2
9     -1
10     0
11     1
12     2
13     3
14     4
15     5
16     6
17     7
18     8
19     9
20    10
dtype: int64
0      0
1      2
2      2
3      2
4      2
5      3
6      3
7      6
8      7
9      4
10     7
11     6
12     6
13     9
14     4
15     5
16     5
17    10
18    11
19    12
20    14
dtype: int64


In [None]:
n = len(x)
mean_x, mean_y = sum(x) / n, sum(y) / n 
mean_x

0.0

In [None]:
# Sample covariance: sum of the products of paired deviations from the
# means, divided by n - 1. zip() pairs the observations directly instead of
# indexing both lists with range(n).
cov_xy = sum(
    (x_k - mean_x) * (y_k - mean_y) for x_k, y_k in zip(x, y)
) / (n - 1)
cov_xy

19.95

In [None]:
cov_matrix = np.cov(x_, y_)
cov_matrix

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [None]:
x_.var(ddof=1)

38.5

In [None]:
y_.var(ddof=1)

13.914285714285711

In [None]:
cov_xy = cov_matrix[0, 1]
cov_xy

19.95

In [None]:
cov_xy = cov_matrix[1, 0]
cov_xy

19.95

In [None]:
cov_xy = x__.cov(y__)
cov_xy 

19.95

In [None]:
cov_xy = y__.cov(x__)
cov_xy

19.95

In [None]:
var_x = sum((item - mean_x)**2 for item in x) / (n - 1)

In [None]:
var_y = sum((item - mean_y)**2 for item in y) / (n - 1)

In [None]:
 std_x, std_y = var_x ** 0.5, var_y ** 0.5

In [None]:
r = cov_xy / (std_x * std_y)
r

0.861950005631606

In [None]:
r, p = scipy.stats.pearsonr(x_, y_)
print(r)
print(p)

0.861950005631606
5.122760847201171e-07


In [None]:
corr_matrix = np.corrcoef(x_, y_)
corr_matrix

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [None]:
r = corr_matrix[0, 1]
r

0.8619500056316061

In [None]:
r = corr_matrix[1, 0]
r

0.861950005631606

In [None]:
scipy.stats.linregress(x_, y_)


LinregressResult(slope=0.5181818181818181, intercept=5.714285714285714, rvalue=0.861950005631606, pvalue=5.122760847201164e-07, stderr=0.06992387660074979)

In [None]:
result = scipy.stats.linregress(x_, y_)
r = result.rvalue
r

0.861950005631606

Series Pandas memiliki method .corr() untuk menghitung correlation coefficient:



In [None]:
r = x__.corr(y__)
r

0.8619500056316061

In [None]:
r = y__.corr(x__)
r

0.861950005631606