In [105]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [106]:
# Base sample used throughout the notebook (mixed ints/floats, one large value).
x = [8.0, 1, 2.5, 4, 28.0]
# Same observations with a single missing value (NaN) inserted at index 3.
x_with_nan = x[:3] + [math.nan] + x[3:]

In [107]:
x

[8.0, 1, 2.5, 4, 28.0]

In [108]:
x_with_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [109]:
# NumPy arrays built from the two Python lists.
y = np.array(x)
y_with_nan = np.array(x_with_nan)
# pandas Series built from the same two lists.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

In [110]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [111]:
y_with_nan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

In [112]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [113]:
z_with_nan

0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64

## Mean

In [114]:
# Pure Python: arithmetic mean = sum of the observations / number of observations
mean_ = sum(x) / len(x)
mean_

8.7

In [115]:
mean_ = statistics.mean(x)
mean_

8.7

In [116]:
# Statistics lib with missing value: the NaN propagates, so the result is nan

statistics.mean(x_with_nan)

nan

In [117]:
# Numpy - 1

mean_ = np.mean(y)
mean_

8.7

In [118]:
# Numpy - 2

mean_ = y.mean()
mean_

8.7

In [119]:
# Numpy with missing value: both the function form and the method form
# propagate the NaN, so each call prints nan

print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [120]:
# Numpy ignore missing value

np.nanmean(y_with_nan)

8.7

In [121]:
# Pandas

mean_ = z.mean()
mean_

8.7

In [122]:
# Pandas with missing values: Series.mean() skips NaN by default
# (skipna=True), so the result equals the mean of the complete data

z_with_nan.mean()

8.7

## Median 

In [123]:
# Statistics lib

median_ = statistics.median(x)
median_

4

In [124]:
sorted(x)

[1, 2.5, 4, 8.0, 28.0]

In [125]:
sorted(x[:-1])

[1, 2.5, 4, 8.0]

In [126]:
statistics.median_low(x[:-1])

2.5

In [127]:
statistics.median_high(x[:-1])

4

In [128]:
# NaN is unordered (every comparison against it is False), so sorted() cannot
# place it correctly: the list shown below is NOT truly sorted around the nan.
sorted(x_with_nan)

[1, 2.5, 4, 8.0, nan, 28.0]

In [129]:
# Statistics lib with missing values: median() sorts the data first, and NaN
# cannot be ordered, so the value returned here is not a meaningful median.
# Remove NaN values before calling statistics.median().

statistics.median(x_with_nan)

6.0

In [130]:
# Numpy 

median_ = np.median(y)
median_

4.0

In [131]:
np.median(y[:-1])

3.25

In [132]:
# Numpy with missing values

np.median(y_with_nan)

nan

In [133]:
np.nanmedian(y_with_nan)

4.0

In [134]:
# Pandas with missing values

z_with_nan.median()

4.0

## Mode

In [135]:
# Integer samples for the mode examples: u has a single most-frequent value,
# v has two values tied for most frequent.
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

In [136]:
# Statistics libs

mode_ = statistics.mode(u)
mode_

2

In [137]:
# v is bimodal (12 and 15 both occur three times). On Python < 3.8
# statistics.mode() raises StatisticsError for such data, while Python 3.8+
# returns the first mode encountered. Catch the error and show its message so
# the notebook still completes under Restart & Run All on either version.
try:
    mode_ = statistics.mode(v)
except statistics.StatisticsError as exc:
    mode_ = None  # no unique mode available from the statistics module
    print(f"StatisticsError: {exc}")
mode_

StatisticsError: no unique mode; found 2 equally common values

In [138]:
# Rebind the mode samples as NumPy arrays for the scipy examples.
# Note: u and v no longer refer to the original Python lists after this cell.
u = np.array(u)
v = np.array(v)

In [139]:
mode_ = scipy.stats.mode(u)
mode_

ModeResult(mode=array([2]), count=array([2]))

In [140]:
# v has two modes (12 and 15, three occurrences each); scipy.stats.mode
# reports only one of them, the smaller value 12, together with its count.
mode_ = scipy.stats.mode(v)
mode_

ModeResult(mode=array([12]), count=array([3]))

In [141]:
mode_.mode

array([12])

In [142]:
mode_.count

array([3])

In [143]:
# Rebind the mode samples as pandas Series; w is an extra series containing NaN.
u = pd.Series(u)
v = pd.Series(v)
w = pd.Series([2, 2, math.nan])

In [144]:
# Pandas

u.mode()

0    2
dtype: int32

In [145]:
# Series.mode() returns every tied mode, so both 12 and 15 are listed.
v.mode()

0    12
1    15
dtype: int32

In [146]:
w.mode()

0    2.0
dtype: float64

### Measures of Variability

In [147]:
# Statistics lib

var_ = statistics.variance(x)
var_

123.2

In [148]:
# Statistics lib with missing values

statistics.variance(x_with_nan)

nan

In [149]:
# Numpy

# ddof=1 divides by n - 1 (sample variance), which matches
# statistics.variance(); NumPy's own default is ddof=0 (population variance).
var_ = np.var(y, ddof=1)
var_

123.19999999999999

In [150]:
var_ = y.var(ddof=1)
var_

123.19999999999999

In [151]:
# Numpy with missing values

np.var(y_with_nan, ddof=1)

nan

In [152]:
np.nanvar(y_with_nan, ddof=1)

123.19999999999999

In [153]:
# Pandas

z.var(ddof=1)

123.19999999999999

In [154]:
z_with_nan.var(ddof=1)

123.19999999999999

### Standard Deviation

In [155]:
# Statistics lib

std_ = statistics.stdev(x)
std_

11.099549540409287

In [156]:
# Numpy

np.std(y, ddof=1)

11.099549540409285

In [157]:
np.std(y_with_nan, ddof=1)

nan

In [158]:
# Pandas

z.std(ddof=1)

11.099549540409285

In [159]:
z_with_nan.std(ddof=1)

11.099549540409285

#### Skewness

In [160]:
# Scipy: skew() defaults to bias=True (no bias correction), so this value
# differs from pandas' bias-corrected Series.skew(); pass bias=False to get
# the same figure as pandas.

scipy.stats.skew(y)

1.3061163034727836

In [161]:
scipy.stats.skew(y_with_nan)

nan

In [162]:
# Pandas

z.skew()

1.9470432273905924

In [163]:
z_with_nan.skew()

1.9470432273905924

### Percentile

In [164]:
# Numpy 

np.percentile(y, 5) # 5%

1.3

In [165]:
np.percentile(y, 95) # 95%

23.999999999999996

In [166]:
np.percentile(y, [25,50,75])

array([2.5, 4. , 8. ])

In [167]:
np.median(y)

4.0

In [168]:
# Numpy with missing values

np.percentile(y_with_nan, 5)

nan

In [169]:
np.nanpercentile(y_with_nan, 5)

1.3

### Quantile

In [170]:
# Numpy 

np.quantile(y, 0.05)

1.3

In [171]:
np.quantile(y, 0.95)

23.999999999999996

In [172]:
np.nanquantile(y_with_nan, [0.25, 0.5, 0.75])

array([2.5, 4. , 8. ])

In [173]:
# Pandas

z.quantile(0.05)

1.3

In [174]:
z.quantile(0.95)

23.999999999999996

### Range

In [175]:
# Numpy

np.ptp(y)

27.0

In [176]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [177]:
np.ptp(z)

27.0

In [178]:
np.ptp(y_with_nan)

nan

In [179]:
np.ptp(z_with_nan)

nan

In [180]:
y.max() - y.min()

27.0

In [181]:
z.max() - z.min()

27.0

In [182]:
y_with_nan.max() - y_with_nan.min()

nan

### IQR

In [186]:
# First and third quartiles (the 0.25 and 0.75 quantiles) of y.
quartile_ = np.quantile(y, q=[0.25, 0.75])
quartile_

array([2.5, 8. ])

In [187]:
# Interquartile range: third quartile (last entry) minus first quartile.
quartile_[-1] - quartile_[0]

5.5

### Summary of Descriptive Stats

In [189]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [191]:
# Scipy: ddof=1 selects the sample variance (n - 1 divisor) and bias=False
# applies the bias correction to the skewness and kurtosis estimates.

result = scipy.stats.describe(y, ddof=1, bias=False)

In [192]:
result

DescribeResult(nobs=5, minmax=(1.0, 28.0), mean=8.7, variance=123.19999999999999, skewness=1.9470432273905927, kurtosis=3.878019618875446)

In [193]:
result.nobs

5

In [195]:
result.minmax

(1.0, 28.0)

In [196]:
result.minmax[0]

1.0

In [197]:
result.minmax[1]

28.0

In [198]:
result.mean

8.7

In [200]:
result.variance

123.19999999999999

In [201]:
result.skewness

1.9470432273905927

In [202]:
result.kurtosis

3.878019618875446

In [203]:
# Pandas

result = z.describe()

In [204]:
result

count     5.00000
mean      8.70000
std      11.09955
min       1.00000
25%       2.50000
50%       4.00000
75%       8.00000
max      28.00000
dtype: float64

In [205]:
result['count']

5.0

In [206]:
result['mean']

8.7

In [207]:
result['std']

11.099549540409285

In [208]:
result['min']

1.0

In [209]:
result['max']

28.0

In [210]:
result['25%']

2.5

In [211]:
result['50%']

4.0

In [213]:
result['75%']

8.0