In [1]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
# Base sample used throughout the notebook (deliberately mixes ints and floats).
x = [8.0, 1, 2.5, 4, 28.0]
# Same sample with a NaN inserted at index 3, to show how each library treats missing values.
x_with_nan = x[:3] + [math.nan] + x[3:]
print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [3]:
# NumPy views of the two samples.
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# pandas views of the same two samples.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

print(y, y_with_nan, z_with_nan, sep="\n")

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## Measure of Central Tendency

### Mean

In [11]:
# Plain Python: total divided by the number of observations.
mean_ = sum(x) / len(x)
print(mean_)

# statistics.mean on the clean sample.
mean_ = statistics.mean(x)
print(mean_)

# statistics.mean on the sample containing NaN: the NaN propagates into the result.
mean_ = statistics.mean(x_with_nan)
print(mean_)

# NumPy mean of the clean array.
mean_ = y.mean()
print(mean_)

# np.nanmean skips NaN entries before averaging.
print(np.nanmean(y_with_nan))

# pandas Series.mean() ignores NaN by default.
print(z_with_nan.mean())

8.7
8.7
nan
8.7
8.7
8.7


### Weighted Mean

In [19]:
# Plain Python: sum of weight * value, divided by the sum of the weights.
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

# Index-based pairing of values and weights.
wmean = sum(weight * x[i] for i, weight in enumerate(w)) / sum(w)
print(wmean)

# The same computation, pairing values and weights with zip.
wmean = sum(value * weight for value, weight in zip(x, w)) / sum(w)
print(wmean)


# NumPy array and pandas Series: np.average accepts the weights directly.
# Note that w is rebound from a list to an ndarray here.
y = np.array(x)
z = pd.Series(x)
w = np.array(w)

wmean = np.average(y, weights=w)
print(wmean)

wmean = np.average(z, weights=w)
print(wmean)

# Element-wise product, shown as the cell's displayed result.
(y * w).sum() / w.sum()

6.95
6.95
6.95
6.95


6.95

### Harmonic Mean

In [21]:
# Plain Python: number of observations divided by the sum of reciprocals.
reciprocal_sum = sum(1 / value for value in x)
hmean = len(x) / reciprocal_sum
hmean  # not rendered: with default settings only the cell's last expression is displayed

# statistics.harmonic_mean; this is the value the cell displays.
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

### Geometric Mean

In [22]:
# Plain Python: multiply all observations together, then take the n-th root.
product = 1
for value in x:
    product = product * value

gmean = product ** (1 / len(x))
gmean

4.677885674856041

### Median

In [23]:
# Plain Python median: middle element for an odd count,
# average of the two middle elements for an even count.
ordered = sorted(x)
n = len(ordered)
middle, is_odd = divmod(n, 2)

if is_odd:
    median_ = ordered[middle]
else:
    median_ = 0.5 * (ordered[middle - 1] + ordered[middle])

median_

4

### Mode

In [26]:
u = [2, 3, 2, 8, 12]

v = [12, 15, 12, 15, 21, 15, 12]

# Pair every distinct value of u with its frequency. set(u) makes sure each
# value is counted only once instead of once per occurrence.
# Tuples compare by count first and by value second, so a tie in frequency
# is resolved in favour of the larger value.
frequency_pairs = [(u.count(value), value) for value in set(u)]
mode_ = max(frequency_pairs)[1]
mode_

2

## Measure of Variability

### Variance

In [34]:
# Plain Python: sample variance (sum of squared deviations over n - 1).
n = len(x)
mean_ = sum(x) / n
squared_deviations = [(value - mean_) ** 2 for value in x]
var_ = sum(squared_deviations) / (n - 1)
var_  # not rendered: only the cell's last expression is displayed by default

# statistics.variance computes the sample variance.
var_ = statistics.variance(x)
var_

# NumPy: ddof=1 makes the divisor n - 1; this is the value the cell displays.
var_ = y.var(ddof=1)
var_

123.19999999999999

### Standard Deviation

In [38]:
# Plain Python: square root of var_, which is carried over from the variance cell.
std_ = pow(var_, 0.5)
std_  # not rendered: only the cell's last expression is displayed by default

# statistics.stdev computes the sample standard deviation.
std_ = statistics.stdev(x)
std_

# NumPy: ddof=1 makes the divisor n - 1.
y.std(ddof=1)

# pandas: this is the value the cell displays.
z.std(ddof=1)

11.099549540409285

### Skewness

In [44]:
# Plain Python: sample skewness built from the sample standard deviation.
x = [8.0, 1, 2.5, 4, 28.0]

n = len(x)
mean_ = sum(x) / n

deviations = [value - mean_ for value in x]
var_ = sum(d ** 2 for d in deviations) / (n - 1)
std_ = var_ ** 0.5

cubed_deviation_sum = sum(d ** 3 for d in deviations)
skew_ = cubed_deviation_sum * n / ((n - 1) * (n - 2) * std_ ** 3)
skew_  # not rendered: only the cell's last expression is displayed by default

# scipy: bias=False requests the bias-corrected estimate.
y = np.array(x)
y_with_nan = np.array(x_with_nan)

scipy.stats.skew(y, bias=False)

# pandas: Series.skew(); this is the value the cell displays.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

z.skew()


1.9470432273905924

## Summary of Descriptive Statistics

In [48]:
# scipy summary of the NumPy array (ddof=1, bias-corrected).
# It is assigned but not rendered, because only the last expression is displayed.
result = scipy.stats.describe(y, bias=False, ddof=1)
result

# pandas summary of the Series; this is the table the cell displays.
result = z.describe()
result

count     5.00000
mean      8.70000
std      11.09955
min       1.00000
25%       2.50000
50%       4.00000
75%       8.00000
max      28.00000
dtype: float64

## Measure of Correlation Between Pairs of Data

### Covariance

In [49]:
# Paired data for the covariance / correlation examples.
# NOTE: x and y are rebound here and no longer hold the earlier sample / array.

x = [*range(-10, 11)]
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

# NumPy versions of both lists.
x_ = np.array(x)
y_ = np.array(y)

# pandas versions, built from the NumPy arrays.
x__ = pd.Series(x_)
y__ = pd.Series(y_)

In [51]:
# Plain Python: sample covariance (sum of products of deviations over n - 1).
n = len(x)
mean_x = sum(x) / n
mean_y = sum(y) / n
cov_xy = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y)) / (n - 1)
cov_xy  # not rendered: only the cell's last expression is displayed by default

# NumPy: np.cov returns the 2x2 covariance matrix; this is what the cell displays.
cov_matrix = np.cov(x_, y_)
cov_matrix

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

## Correlation Coefficient

In [54]:
# Pearson correlation: covariance divided by the product of the sample standard deviations.
# Relies on n, mean_x, mean_y and cov_xy from the covariance cell.
var_x = sum((xi - mean_x) ** 2 for xi in x) / (n - 1)
var_y = sum((yi - mean_y) ** 2 for yi in y) / (n - 1)

std_x = var_x ** 0.5
std_y = var_y ** 0.5

r = cov_xy / (std_x * std_y)
r

0.861950005631606