### Topics covered:
- Statistical operations: mean, median, mode, min/max (with argmin/argmax), variance, standard deviation, percentiles, covariance (`np.cov`), and correlation (`np.corrcoef`)


In [17]:
import numpy as np

# STATS 

In [37]:
# Descriptive statistics for one year of monthly sales figures.

arr = np.array([12, 15, 14, 13, 10, 22, 25, 35, 19, 18, 17, 23])  # monthly sales, in 1000s
print("arr:", arr)

# --- Central tendency ---
mean_value = arr.mean()
print("Mean:", mean_value)

median_value = np.median(arr)  # half of the months lie on either side of this value
print("Median:", median_value)

# --- Extremes, together with the 0-based position where each one occurs ---
min_value, arg_min = arr.min(), arr.argmin()
print("Minimum:", min_value)
print("arg_min:", arg_min)

max_value, arg_max = arr.max(), arr.argmax()
print("Maximum:", max_value)
print("arg_max:", arg_max)

# --- Quartiles: a single call evaluates both percentile ranks ---
percentile_25, percentile_75 = np.percentile(arr, [25, 75])
print("25th Percentile:", percentile_25)
print("75th Percentile:", percentile_75)

arr: [12 15 14 13 10 22 25 35 19 18 17 23]
Mean: 18.583333333333332
Median: 17.5
Minimum: 10
arg_min: 4
Maximum: 35
arg_max: 7
25th Percentile: 13.75
75th Percentile: 22.25


In [20]:
# Mode (most frequent value).
# NumPy ships no dedicated mode function, so it is derived here from a
# frequency table built with np.unique.

data = np.array([1, 2, 2, 3, 4, 4, 4, 5, 2, 3, 3, 5, 2, 3, 2])

# `values` holds the sorted distinct entries, `counts` how often each occurs.
values, counts = np.unique(data, return_counts=True)

# argmax returns the first maximum, so on a tie the smallest value wins.
most_frequent_idx = counts.argmax()
mode_value = values[most_frequent_idx]

print("Mode:", mode_value)

Mode: 2


In [31]:
# Variance and standard deviation, illustrated on three samples whose
# spread grows from none to large.
# Note: with default arguments NumPy computes the population form (ddof=0),
# i.e. it divides by N rather than N - 1.
data1 = np.array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5])    # identical values -> zero spread
data2 = np.array([4, 3, 5, 6, 7, 3, 5, 6, 4, 5])    # values close to the mean
data3 = np.array([1, 1, 0, 0, 5, 9, 9, 10, 10, 5])  # values far from the mean

# Compute everything first ...
var1, sd1 = data1.var(), data1.std()
var2, sd2 = data2.var(), data2.std()
var3, sd3 = data3.var(), data3.std()

# ... then report each sample followed by its spread measures.
print(data1)
print("Variance1:", var1, "; Standard Deviation1:", sd1, "\n")

print(data2)
print("Variance2:", var2, "; Standard Deviation2:", sd2, "\n")

print(data3)
print("Variance3:", var3, "; Standard Deviation3:", sd3)

[5 5 5 5 5 5 5 5 5 5]
Variance1: 0.0 ; Standard Deviation1: 0.0 

[4 3 5 6 7 3 5 6 4 5]
Variance2: 1.56 ; Standard Deviation2: 1.2489995996796797 

[ 1  1  0  0  5  9  9 10 10  5]
Variance3: 16.4 ; Standard Deviation3: 4.049691346263317


In [34]:
# Application 1: measuring the relationship between two variables.
# Covariance gives the direction of the relationship, expressed in the product
# of the two variables' units; the correlation coefficient rescales it to the
# unit-free range [-1, +1].

x1 = np.array([1,  2,  3,  4,  5,  3,  4])   # game_hours
x2 = np.array([90, 90, 85, 80, 60, 80, 70])  # exam_scores

# Each row passed to np.cov is one variable. The diagonal holds each variable's
# (sample, N - 1) variance and the off-diagonal entries hold their covariance.
cov_matrix = np.cov([x1, x2])

print("Covariance matrix:\n", cov_matrix)

# The label must match the data: x1 holds game hours.
cov_summary = f"Covariance between GameHours and ExamScore: {cov_matrix[0,1]}"
print(cov_summary)

corr_matrix = np.corrcoef([x1, x2])  # entries always lie between -1 and +1
print("\ncorr_matrix:\n", corr_matrix)

Covariance matrix:
 [[  1.80952381 -13.21428571]
 [-13.21428571 120.23809524]]
Covariance between StudyHours and ExamScore: -13.214285714285715

corr_matrix:
 [[ 1.         -0.89586073]
 [-0.89586073  1.        ]]


In [23]:
# Application: pairwise relationships among four variables at once.

x1 = np.array([10, 20, 30, 40, 50])    # resistance, with current I = 2
x2 = np.array([20, 40, 60, 80, 100])   # voltage read from a high-quality voltmeter
x3 = np.array([18, 40, 50, 20, 30])    # voltage read from a low-quality voltmeter
x4 = np.array([90, 82, 55, 40, 20])    # another variable that falls as x1 rises

# One row per variable, one column per observation.
X = np.vstack((x1, x2, x3, x4))

cov_matrix = np.cov(X)
print("Covariance matrix:\n", cov_matrix)

# Correlation coefficients always lie between -1 and +1.
corr_matrix = np.corrcoef(X)
print("\ncorr_matrix:\n", corr_matrix)

Covariance matrix:
 [[ 250.   500.    10.  -455. ]
 [ 500.  1000.    20.  -910. ]
 [  10.    20.   182.8   -4.8]
 [-455.  -910.    -4.8  843.8]]

corr_matrix:
 [[ 1.          1.          0.04677803 -0.99065317]
 [ 1.          1.          0.04677803 -0.99065317]
 [ 0.04677803  0.04677803  1.         -0.01222175]
 [-0.99065317 -0.99065317 -0.01222175  1.        ]]


# STOP

## The `scipy.stats` package and its methods

In [24]:
# SKIP
from scipy.stats import mode  # NumPy itself has no built-in mode()

# Mode = the most frequently occurring item.
# Typical use cases:
#   - Product reviews: find the most common rating
#   - Voting systems:  determine the most popular choice
#   - Health surveys:  identify the most frequently reported symptom
#   - Transport:       find the most common route or travel time

arr = np.array([1, 2, 2, 3, 4, 4, 4, 5, 5, 4, 4])  # ratings given to one product

# keepdims=True keeps the result fields as 1-element arrays, hence the [0] below.
mode_result = mode(arr, keepdims=True)
most_common = mode_result.mode[0]
occurrences = mode_result.count[0]

print("Mode:", most_common)  # most common rating
print("Count:", occurrences)

Mode: 4
Count: 5
