# Challenges from "10 Days of Statistics" on HackerRank.com

In [None]:
from math import erf

## Mean, Median and Mode

Simple but important values that provide an overview of a given data set.

In [None]:
# input data: the sample values as a single whitespace-separated string
input_data = '64630 11735 14216 99233 14470 4978 73429 38120 51135 67060 67060'

In [None]:
# parse every whitespace-separated token as a float and keep the values in ascending order
input_list = sorted(float(token) for token in input_data.split())

In [None]:
# defining a mean function
def mean(given_data):
  """Return the arithmetic mean of the values in `given_data`.

  The values are added one by one, in order, and the total is divided by
  the number of values. An empty input raises ZeroDivisionError.
  """
  total = 0
  for value in given_data:
    total += value
  return total / len(given_data)

In [None]:
# defining a median function
def median(given_data):
  """Return the median of the values in `given_data`.

  The input does not need to be sorted: a sorted copy is made internally,
  so the caller's sequence is left untouched.

  - odd number of values: the middle value is returned as-is
  - even number of values: the average of the two middle values is returned

  An empty input raises IndexError.
  """
  ordered = sorted(given_data)
  size = len(ordered)
  # integer division keeps the index exact, with no float round-trip
  middle = size // 2
  if size % 2 != 0:
    return ordered[middle]
  return (ordered[middle - 1] + ordered[middle]) / 2

In [None]:
# defining a mode function
# returns a list of 3 values:
#   - mode: the mode if there is exactly one, '--multiple--' if several values tie, '--null--' if there is no mode
#   - mode_counter: how many times the mode repeats on the set, '--null--' if there is no mode
#   - mode_multi: how many distinct modes there are (1 for a single mode), '--null--' if there is no mode
def mode(given_data):
  """Return `[mode, mode_counter, mode_multi]` for the values in `given_data`.

  A data set has no mode when it is empty or when every value occurs
  exactly once; all three entries are then the string '--null--'.
  When several distinct values share the highest number of occurrences,
  `mode` is the string '--multiple--' and `mode_multi` is how many such
  values there are.

  The values must be hashable, because occurrences are tallied in a dict.
  """
  null = '--null--'
  if len(given_data) == 0:
    return [null, null, null]

  # one pass over the data instead of calling list.count for every element
  occurrences = {}
  for value in given_data:
    occurrences[value] = occurrences.get(value, 0) + 1

  mode_counter = max(occurrences.values())
  if mode_counter == 1:
    return [null, null, null]

  # count the distinct values that reach the highest frequency directly
  modes = [value for value, count in occurrences.items() if count == mode_counter]
  mode_multi = len(modes)
  mode = modes[0] if mode_multi == 1 else '--multiple--'
  return [mode, mode_counter, mode_multi]

In [None]:
# report the three summary statistics of the sample, rounded for display
rounded_mean = round(mean(input_list), 1)
print('Mean is equal to {}'.format(rounded_mean))

rounded_median = round(median(input_list), 1)
print('Median is equal to {}'.format(rounded_median))

# mode() returns [mode, mode_counter, mode_multi]
mode_info = mode(input_list)
mode_value, mode_count, mode_total = mode_info
print('There is(are) {} mode(s). Mode is equal to {} and this value repeats {} times'.format(mode_total, mode_value, mode_count))

Mean is equal to 46006.0
Median is equal to 51135.0
There is(are) 1 mode(s). Mode is equal to 67060.0 and this value repeats 2 times


## Weighted Mean

Used when some values contribute more than others to the calculation of the average.

In [None]:
def weighted_mean(X, W):
  """Return the weighted mean of the values `X` using the weights `W`.

  `W[i]` is the weight applied to `X[i]`. The result is the sum of
  value * weight divided by the sum of the weights that were used.
  """
  weighted_total = 0
  weight_total = 0
  for position, value in enumerate(X):
    weight = W[position]
    weighted_total += value * weight
    weight_total += weight
  return weighted_total / weight_total

In [None]:
# sample values and the weight attached to each of them
X = [10, 40, 30, 50, 20]
W = [1, 2, 3, 4, 5]

rounded_weighted_mean = round(weighted_mean(X, W), 1)
print('The weighted mean for this data is equal to {}'.format(rounded_weighted_mean))

The weighted mean for this data is equal to 32.0


## Standard Deviation

A measure of the amount of dispersion in the values.

In [None]:
def stdDev(arr):
  """Return the population standard deviation of the values in `arr`.

  This is the square root of the average squared distance from the mean,
  dividing by the number of values (not by that number minus one).
  """
  count = len(arr)
  center = sum(arr) / count
  squared_total = 0
  for value in arr:
    squared_total += (value - center) ** 2
  return (squared_total / count) ** 0.5

In [None]:
# standard deviation of the sample parsed earlier, rounded only for display
devi = stdDev(input_list)
rounded_devi = round(devi, 1)
print('Standard deviation is equal to {}'.format(rounded_devi))

Standard deviation is equal to 29802.3


## Assuming a normal distribution, finding the % of the data that is:

*   Higher than 30,000
*   Higher than 40,000
*   Higher than 50,000

In [None]:
# parameters of the assumed normal distribution, computed once from the sample
sample_mean = mean(input_list)
sample_devi = stdDev(input_list)

def fraction_above(threshold):
  """Return 1 minus the normal CDF at `threshold`, using the sample mean and deviation."""
  return 1 - (1 + erf((threshold - sample_mean) / (sample_devi * (2**0.5)))) * 0.5

higher_30 = fraction_above(30000)
higher_40 = fraction_above(40000)
higher_50 = fraction_above(50000)

# fractions are turned into percentages with two decimals for display
percent_30 = round(higher_30 * 100, 2)
percent_40 = round(higher_40 * 100, 2)
percent_50 = round(higher_50 * 100, 2)
print('{}% of the data is higher than 30,000'.format(percent_30))
print('{}% of the data is higher than 40,000'.format(percent_40))
print('{}% of the data is higher than 50,000'.format(percent_50))

70.44% of the data is higher than 30,000
57.99% of the data is higher than 40,000
44.67% of the data is higher than 50,000


## Covariance

A measure of the joint variability of two variables.

In [None]:
def covariance(data_a, data_y):
  """Return the population covariance of the paired samples `data_a` and `data_y`.

  The result is the average, over all pairs, of the product of each value's
  distance from its own sample mean (dividing by n, not n - 1).

  Raises ValueError when the two samples differ in length and
  ZeroDivisionError when they are empty.
  """
  n = len(data_a)
  if len(data_y) != n:
    raise ValueError('data_a and data_y must have the same length')
  mean_a = sum(data_a) / n
  mean_y = sum(data_y) / n
  # only the arguments are read here; no module-level data is consulted
  total = 0
  for value_a, value_y in zip(data_a, data_y):
    total += (value_a - mean_a) * (value_y - mean_y)
  return total / n

In [None]:
# paired observations used for the covariance example
data_a = [10, 9.8, 8, 7.8, 7.7, 7, 6, 5, 4, 2]
data_b = [200, 44, 32, 24, 22, 17, 15, 12, 8, 4]

rounded_covariance = round(covariance(data_a, data_b), 1)
print('Covariance is equal to {}'.format(rounded_covariance))

Covariance is equal to 80.9


## Pearson correlation coefficient

A measure of the linear correlation between two sets of data.
Its value ranges from -1 to +1, where +1 represents a perfect positive correlation, -1 a perfect negative correlation, and 0 no linear correlation at all.

In [None]:
def pearson(data_a, data_y):
  """Return the Pearson correlation coefficient of `data_a` and `data_y`.

  The coefficient is the population covariance of the two samples divided
  by the product of their population standard deviations.

  Raises ValueError when the two samples differ in length and
  ZeroDivisionError when they are empty or when either sample has no
  spread (all of its values are equal).
  """
  n = len(data_a)
  if len(data_y) != n:
    raise ValueError('data_a and data_y must have the same length')
  mean_a = sum(data_a) / n
  mean_y = sum(data_y) / n

  # accumulate the cross products and both sums of squares in a single pass;
  # only the arguments are read here, no module-level data is consulted
  cross_total = 0
  squares_a = 0
  squares_y = 0
  for value_a, value_y in zip(data_a, data_y):
    offset_a = value_a - mean_a
    offset_y = value_y - mean_y
    cross_total += offset_a * offset_y
    squares_a += offset_a ** 2
    squares_y += offset_y ** 2

  cov = cross_total / n
  devi_a = (squares_a / n) ** 0.5
  devi_y = (squares_y / n) ** 0.5
  return cov / (devi_a * devi_y)

In [None]:
# paired observations used for the Pearson example
data_a = [10, 9.8, 8, 7.8, 7.7, 1.7, 6, 5, 1.4, 2]
data_b = [200, 44, 32, 24, 22, 17, 15, 12, 8, 4]

rounded_pearson = round(pearson(data_a, data_b), 3)
print('The Pearson Correlation Coeficient is equal to {}'.format(rounded_pearson))

The Pearson Correlation Coeficient is equal to 0.579


## Spearman's Rank Correlation

A measure of a rank correlation between two variables. Defines how well the relationship of the two variables can be described with a monotonic function, whether it is linear or not. 

In [None]:
def _average_ranks(values):
  """Rank `values` in ascending order starting at 1; tied values share the mean of their positions."""
  order = sorted(range(len(values)), key=lambda index: values[index])
  ranks = [0] * len(values)
  start = 0
  while start < len(order):
    # extend `end` over the run of sorted positions holding the same value
    end = start
    while end + 1 < len(order) and values[order[end + 1]] == values[order[start]]:
      end += 1
    # an untied value keeps a plain integer rank; a tie group gets the average
    shared_rank = start + 1 if start == end else (start + end) / 2 + 1
    for position in range(start, end + 1):
      ranks[order[position]] = shared_rank
    start = end + 1
  return ranks


def spearman(data_a, data_b):
  """Return Spearman's rank correlation coefficient of `data_a` and `data_b`.

  Without repeated values in either sample, the closed form
  1 - 6 * sum(d^2) / (n * (n^2 - 1)) is used, where d is the difference
  between the two ranks of each pair. With repeated values, tied entries
  share the average of their ranks and the coefficient is the Pearson
  correlation of the two rank lists.

  Raises ValueError when the samples are empty or differ in length, and
  ZeroDivisionError for a single pair or when a sample is constant.
  """
  n = len(data_a)
  if n == 0:
    raise ValueError('spearman requires at least one pair of values')
  if len(data_b) != n:
    raise ValueError('data_a and data_b must have the same length')

  rank_a = _average_ranks(data_a)
  rank_b = _average_ranks(data_b)

  if len(set(data_a)) == n and len(set(data_b)) == n:
    # no ties: the closed form on rank differences is exact
    s = 0
    for c in range(n):
      s += (rank_a[c] - rank_b[c])**2
    return 1 - ((6*s) / (n*(n**2 - 1)))

  # ties present: Pearson correlation computed on the (average) ranks
  mean_a = sum(rank_a) / n
  mean_b = sum(rank_b) / n
  cross_total = 0
  squares_a = 0
  squares_b = 0
  for c in range(n):
    offset_a = rank_a[c] - mean_a
    offset_b = rank_b[c] - mean_b
    cross_total += offset_a * offset_b
    squares_a += offset_a**2
    squares_b += offset_b**2
  cov = cross_total / n
  devi_a = (squares_a / n)**0.5
  devi_b = (squares_b / n)**0.5
  return cov / (devi_a * devi_b)

In [None]:
# paired observations used for the Spearman example
data_a = [10, 9.8, 8, 7.8, 7.7, 1.7, 6, 5, 1.4, 2]
data_b = [200, 44, 32, 24, 22, 17, 15, 12, 8, 4]

rounded_spearman = round(spearman(data_a, data_b), 3)
print("The Spearman's Rank correlation coeficient is equal to {}".format(rounded_spearman))

The Spearman's Rank correlation coeficient is equal to 0.903
