In [None]:
import statistics as stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Statistics describes an individual variable or the relationships among two or more variables
# A variable represents information about a particular measurable concept (temperature, price, size, etc)
# Each measurement within a variable is called a datapoint

In [None]:
# Build a DataFrame that holds a single variable, age

# create the DataFrame and fill its 'age' column in one step
df = pd.DataFrame({'age': [28, 42, 27, 24, 35, 54, 35, 37]})

In [None]:
# When describing variables, two characteristics of most interest are the central tendency and the variance
# central tendency describes a point around which datapoints in a variable cluster
# Central tendency can be measured in a number of ways. The most common measures are the mean, the median, and the mode

In [None]:
# Mean, median and mode calculated from a sample are treated as unbiased estimates of the corresponding population values
# An estimate is "unbiased" if, across multiple representative samples, the sample estimates converge on the population value
# A "biased" estimate would converge on a value that was either higher or lower than the population value

# Unbiased estimates let us use a small group of observations to make generalizations about a much larger group

In [None]:
# Mean represents the average value within a variable
# it's computed as the sum of the individual datapoints in a variable x divided by the total number of values in a variable n
# It is sometimes also referred to as the "expected value" of a variable

# calculate the mean
def mean_func(x):
    """Return the arithmetic mean of x: the sum of its values over their count."""
    count = len(x)
    return sum(x) / count

# exercise our function on a handful of example values
example_values = [10, 30, 45, 30, 50]
print(mean_func(example_values))

# NumPy provides the same calculation out of the box
np_mean = np.mean(example_values)
print(np_mean)

In [None]:
# Here are two ways you can compute the mean of our age data, first with built-in Python functionality and then with NumPy.

# A notebook cell only displays the value of its last expression, so both
# results are printed to make the comparison visible.

# Using built-in Python functionality.
print(sum(df['age']) / len(df['age']))

# Using NumPy
print(np.mean(df['age']))

In [None]:
# Median is the middle value in a variable when the values are ordered from least to greatest
# Median, like the mean, is easy to understand, and has the added benefit that it isn't sensitive to extreme values and outliers

#Here's how you can compute the median of our age data using the statistics module of the Python standard library or NumPy.

# Vanilla Python, using the statistics module from the standard library.
import statistics

# Both results are printed: a notebook cell would otherwise display only the
# value of its last expression and silently drop the first median.
print(statistics.median(df['age']))

# Using NumPy.
print(np.median(df['age']))

In [None]:
# Mode represents the value in a variable that occurs the most frequently.

# The statistics module can report the most frequent value directly.
import statistics

statistics.mode(df.age)

In [None]:
# Note: before Python 3.8 the code above raised a StatisticsError when run on data containing multiple modes
# From Python 3.8 on it returns the first mode encountered instead (statistics.multimode() returns all of them)
# Inspecting a list of counts beforehand will show whether there is more than one mode to look for

# Tally the data: np.unique with return_counts=True gives the distinct values
# (in sorted order) alongside the number of times each one occurs.
values, counts = np.unique(df['age'], return_counts=True)

# Position of the largest count, i.e. of the highest frequency.
# argmax reports the first position when several counts tie for the maximum.
ind = counts.argmax()

# Look up the value sitting at that position: the most frequent element.
values[ind]

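# The code above will handle data with multiple modes without raising an exception, but you'll get back just one mode:
# the smallest one, because np.unique returns the values sorted and np.argmax picks the first maximum count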

In [None]:
# To push your understanding of Python, challenge yourself to revise the code above so that it gives you all of the modes



In [None]:
# variance of a variable describes how much values differ from the central tendency and how much they differ from each other
# If all the values in a variable are close to the central tendency, then variance is said to be low
# If values in a variable vary widely, with some far away from the central tendency, variance is said to be high

# Another way to think of variance is that it gives a clue to how valuable each individual datapoint is within a variable
# If variance is low and most datapoints are similar to the central tendency,
# then each datapoint provides little new information about the concept being measured.
# If variance is high, then each datapoint is more likely to provide unique information about the concept being measured.

In [None]:
# Variance is measured as the sum of the squared difference of each individual datapoint from the mean,
# divided by the number of datapoints minus 1: v = sum((x - mean) ** 2) / (n - 1)

# Sample variance is divided by n - 1 because dividing by n would underestimate the population variance, creating bias

# calculate the variance
def var_func(x):
    """Return the sample variance of x.

    The variance is the sum of the squared differences between each value
    and the mean, divided by n - 1 (Bessel's correction).

    Args:
        x: A sized iterable of numbers, e.g. a list, tuple, NumPy array or
            pandas Series. It needs at least two values for the result to
            be defined.

    Returns:
        The sample variance.

    Raises:
        ZeroDivisionError: If x is empty.
    """
    n = len(x)
    mean_x = sum(x) / n
    # Subtract value by value so plain Python sequences work too; the
    # vectorised form `x - mean_x` is only defined for array-like inputs.
    squared_diffs = sum((value - mean_x) ** 2 for value in x)
    return squared_diffs / (n - 1)

print(var_func(df['age']))

v = sum((df['age'] - np.mean(df['age'])) ** 2) / (len(df['age']) - 1)
print(v)

# we can also use numpy's variance function. np.var defaults to ddof=0 (the
# population variance, dividing by n), so pass ddof=1 to divide by n - 1 and
# match the sample variance computed above.
print(np.var(df['age'], ddof=1))

# or pandas built in syntax; Series.var() already defaults to ddof=1
print(df.age.var())

# Why did np.var() without ddof not equal var_func()? It divides by n instead
# of n - 1, so it returns the (smaller) population variance rather than the
# sample variance.

In [None]:
# The most common estimate of variability used by statisticians is the square root of the variance, called the standard deviation

# Calculate the standard deviation: s = v ** 0.5
def stand_dev(x):
    """Return the sample standard deviation of x.

    This is the square root of the sample variance, i.e. of the sum of
    squared differences from the mean divided by n - 1.

    Args:
        x: A sized iterable of numbers, e.g. a list, tuple, NumPy array or
            pandas Series. It needs at least two values for the result to
            be defined.

    Returns:
        The sample standard deviation.

    Raises:
        ZeroDivisionError: If x is empty.
    """
    n = len(x)
    mean_x = sum(x) / n
    # Subtract value by value so plain Python sequences work too; the
    # vectorised form `x - mean_x` is only defined for array-like inputs.
    var = sum((value - mean_x) ** 2 for value in x) / (n - 1)
    return var ** 0.5

# Print the result: a notebook cell only displays its last expression, so
# this value would otherwise be computed and then dropped.
print(stand_dev(df['age']))

# or use NumPy's built in function np.std()
# A tricky default in numpy is to calculate the population standard deviation, not the sample standard deviation
# To calculate the sample standard deviation, set the "delta degrees of freedom" with the ddof=1 parameter
print(np.std(df['age'], ddof=1))

In [None]:
# Another useful estimate of variance is the standard error, which quantifies uncertainty in the estimate of the sample mean
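# Standard error tells us about the precision of our sample mean estimate. It is related to, but not the same as, the
# "margin of error", which is the standard error multiplied by a critical value (about 1.96 for a 95% confidence interval).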

# standard error is the standard deviation of the sample divided by the square root of the sample size: se = s / (n ** 0.5)

def stand_err(x):
    """Return the standard error of the mean of x.

    This is the sample standard deviation (n - 1 in the denominator)
    divided by the square root of the sample size.

    Args:
        x: A sized iterable of numbers, e.g. a list, tuple, NumPy array or
            pandas Series. It needs at least two values for the result to
            be defined.

    Returns:
        The standard error of the sample mean.

    Raises:
        ZeroDivisionError: If x is empty.
    """
    n = len(x)
    mean_x = sum(x) / n
    # Subtract value by value so plain Python sequences work too; the
    # vectorised form `x - mean_x` is only defined for array-like inputs.
    var = sum((value - mean_x) ** 2 for value in x) / (n - 1)
    st_dv = var ** 0.5
    return st_dv / (n ** 0.5)

# Print the result: a notebook cell only displays its last expression, so
# this value would otherwise be computed and then dropped.
print(stand_err(df['age']))

# or use numpy:
print(np.std(df['age'], ddof=1) / np.sqrt(len(df['age'])))

In [None]:
# Let's examine sampling from different distributions of low and high variance
#We'll create two variables, one with low variability and one with high variability, and see how they differ

# Build the population DataFrame in a single step. Both variables have
# mean = 60: 'low_var' uses a low standard deviation (sd=10) and 'high_var'
# a high one (sd=100). 'low_var' is drawn first, then 'high_var'.
pop = pd.DataFrame({
    'low_var': np.random.normal(60, 10, 10000),
    'high_var': np.random.normal(60, 100, 10000),
})

# Draw one histogram per variable, stacked in two rows on a shared x-axis
# so the difference in spread can be compared directly.
pop.hist(sharex=True, layout=(2, 1))
plt.show()

# Report the largest and smallest value observed for each variable.
print(pop.max())
print(pop.min())

# The variable with high variance has a much wider range of possible values than the variable with low variance
# If these variables represented two populations we wanted to study, we would take samples from each,
# then generalize from those samples to get information about the populations

In [None]:
# Draw 100 observations at random from each population variable and collect
# them in a new DataFrame ('low_var' is sampled first, then 'high_var').
sample = pd.DataFrame({
    'low_var': np.random.choice(pop['low_var'], 100),
    'high_var': np.random.choice(pop['high_var'], 100),
})

# plot a histogram for each sampled variable
sample.hist()
plt.show()

# Compare the sample's mean and standard deviation with the population's.
print("mean:\n", sample.mean())
print("\nstandard deviation:\n", sample.std(ddof=1))

In [None]:
# set up a DataFrame: 100 'male' rows followed by 100 'female' rows, with
# height and weight drawn from a separate normal distribution per gender

data = pd.DataFrame()
data['gender'] = ['male'] * 100 + ['female'] * 100
data['height'] = np.append(np.random.normal(69, 8, 100), np.random.normal(64, 5, 100))
data['weight'] = np.append(np.random.normal(195, 25, 100), np.random.normal(166, 15, 100))

# Print both previews: a notebook cell only displays its last expression, so
# the head of the data would otherwise never be shown.
print(data.head())
print(data.tail())

In [None]:
# explore data
# The mean and standard deviation are printed explicitly: as bare expressions
# in the middle of a cell their values are computed and then dropped.
print(data.height.mean())
print(data.height.std())

print(data.describe())
print(data.groupby('gender').describe())

print(data.gender.value_counts())
# There are several reasons to use this method. Firstly, it gives you another way to make sense of your data.
# In this case it shows us that our data is evenly balanced between males and females, with one hundred samples of each.
# There are plenty of other ways this function could be useful. It can show outliers or possible malformed data.
# For example, if we were to see something like 'Mal' with a single entry, we'd have found a typo in the data.
# This method works over numerical and object data, though it is not valuable to run over the numeric columns in this example
