# Statistics II

## Inferential statistics

The goal of descriptive statistics is to gain sufficient insight into our data to enable us to develop hypotheses about what process generated the data. These hypotheses are called **null models**. 

The process can be a physical model, which may or may not predict the values of the parameters needed to describe the statistical properties of the data. The process can also be a purely statistical model.

Using inferential statistics, one determines whether a given null model (also called a null hypothesis) is consistent with the data.

In [None]:
import matplotlib.pyplot as plt
import math
import numpy as np
import scipy.stats as stat

In [None]:
# As you saw previously, this data has lots of mistakes.  For simplicity, here we are only going to consider
# data that has no issues and for which the maximum GPA is 4.0.

from path import Path

# Move into the course folder so that the relative data path below resolves.
folder = Path('/Users/amaral/Desktop/BoxSync/Intro_Data_Science/')
folder.chdir()

with open('Data/gpa_data.csv', "r") as data_file:
    all_lines = data_file.readlines()

# Drop the first line of the file and split every remaining line into its comma-separated fields.
gpa_list = [line.strip().split(",") for line in all_lines[1:]]

# Keep the first field (read as the GPA) of every row whose second field equals 4.0.
# Rows that are too short or whose fields are not numbers are skipped on purpose:
# only those two failure modes are caught, so unrelated errors are no longer hidden.
# NOTE(review): `all_lines[1:]` already drops one line and `gpa_list[1:]` drops one more,
# so the first row after the header is never read -- confirm the file really has two
# non-data lines at the top.
gpa = []
for fields in gpa_list[1:]:
    try:
        max_gpa = float(fields[1])
        value = float(fields[0])
    except (ValueError, IndexError):
        continue
    if max_gpa == 4.:
        gpa.append(value)
    
def half_frame(sub, xaxis_label, yaxis_label, font_size = 15, padding = -0.02):
    """Give a matplotlib axes a "half frame": left and bottom spines only.

    sub          -- axes object to format (modified in place, nothing is returned)
    xaxis_label  -- text for the x-axis label
    yaxis_label  -- text for the y-axis label
    font_size    -- label size of the major ticks; axis labels use 1.6 times this value
    padding      -- position, in axes coordinates, given to the left and bottom spines
    """

    # Ticks are drawn only on the two sides that keep their spine
    sub.yaxis.set_ticks_position('left')
    sub.xaxis.set_ticks_position('bottom')

    # Major and minor ticks share everything except length, padding and label size
    common_ticks = dict(axis = 'both', width = 2, direction = 'out')
    sub.tick_params(which = 'major', length = 7, pad = 10, labelsize = font_size, **common_ticks)
    sub.tick_params(which = 'minor', length = 5, labelsize = 10, **common_ticks)

    # Thicken and offset the visible spines, then hide the other two
    for side in ('bottom', 'left'):
        spine = sub.spines[side]
        spine.set_linewidth(2)
        spine.set_position(("axes", padding))
    for side in ('top', 'right'):
        sub.spines[side].set_visible(False)

    # Axis labels
    label_size = 1.6 * font_size
    sub.set_xlabel(xaxis_label, fontsize = label_size)
    sub.set_ylabel(yaxis_label, fontsize = label_size)

In [None]:
# Compare the GPA histogram with two Gaussian null models.
fig = plt.figure( figsize = (6, 4.5) )
sub1 = fig.add_subplot(1,1,1)
my_font_size = 15
half_frame(sub1, "GPA", "Probability density", font_size = my_font_size)

# Calculate and plot histogram, normalised to a probability density.
# `density = True` replaces `normed = 1`, a keyword that current matplotlib no longer accepts.
sub1.hist(gpa, 25, density = True, rwidth = 0.75, color = "g", alpha = 0.5, histtype = "bar",
          label = "data", cumulative = False)

## Gaussian null models
mu = np.mean(gpa)
sigma = np.std(gpa)
# Evaluate the curves between the 0.01% and 99.99% quantiles of the first model,
# never going beyond 4.
x3 = np.linspace(stat.norm.ppf(0.0001, loc = mu, scale = sigma),
                 min(4, stat.norm.ppf(0.9999, loc = mu, scale = sigma)),
                 100)

# Model 1: Gaussian with the sample mean and standard deviation.
rv3 = stat.norm(loc = mu, scale = sigma)

# Model 2: same width, but centred 0.1 below the most frequent GPA value.
# Depending on the SciPy version, `stat.mode` returns the mode as a scalar or as a
# one-element array; `np.ravel` makes the extraction work in both cases.
gpa_mode = float(np.ravel(stat.mode(gpa)[0])[0])
rv4 = stat.norm(loc = gpa_mode - 0.1,
                scale = sigma)

# Distinct labels so that the two curves can be told apart in the legend.
sub1.plot(x3, rv3.pdf(x3), color = "b", lw = 2, label= "Gaussian (mean)")
sub1.plot(x3, rv4.pdf(x3), color = "r", lw = 2, label= "Gaussian (mode - 0.1)")

# Format legend
sub1.legend(loc = "best", frameon = False, markerscale = 1.8, fontsize = my_font_size)

plt.show()

In [None]:
# Fit a Gaussian to the GPA sample and show the two fitted parameters
# (used below as the mean and the standard deviation of the model).
gaussian_fit = stat.norm.fit(gpa)
mu_f, std_f = gaussian_fit
print(mu_f, std_f)

In [None]:
# Compare the GPA histogram with the Gaussian obtained from the fit.
fig = plt.figure( figsize = (6, 4.5) )
sub1 = fig.add_subplot(1,1,1)
my_font_size = 15
half_frame(sub1, "GPA", "Probability density", font_size = my_font_size)

# Calculate and plot histogram, normalised to a probability density.
# `density = True` replaces `normed = 1`, a keyword that current matplotlib no longer accepts.
sub1.hist(gpa, 25, density = True, rwidth = 0.75, color = "g", alpha = 0.5, histtype = "bar",
          label = "data", cumulative = False)

## Gaussian null models
# Use the parameters returned by the fit instead of the sample moments.
mu = mu_f
sigma = std_f
# Evaluate the curve between the 0.01% and 99.99% quantiles of the model,
# never going beyond 4.
x3 = np.linspace(stat.norm.ppf(0.0001, loc = mu, scale = sigma),
                 min(4, stat.norm.ppf(0.9999, loc = mu, scale = sigma)),
                 100)
rv3 = stat.norm(loc = mu, scale = sigma)

sub1.plot(x3, rv3.pdf(x3), color = "b", lw = 2, label= "Gaussian")

# Format legend
sub1.legend(loc = "best", frameon = False, markerscale = 1.8, fontsize = my_font_size)

plt.show()