# Univariate and Multivariate Statistics

Based on Chapters 5 and 6 of Larose and Larose

2/2/2019 - Jeff Smith

In [None]:
# Standard Setup
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.__version__, pd.__version__

In [None]:
# Sample of 3,000 normally distributed random variates
# (mean 225, standard deviation 47); unseeded, so values differ per run.
nrvgs = np.random.normal(loc=225, scale=47, size=3000)

In [None]:
# Descriptive statistics of the generated sample (displayed as the cell output)
pd.DataFrame(nrvgs).describe()

In [None]:
# Churn Dataset
# Load the data and report its size; DataFrame.shape is (records, variables).
churn = pd.read_csv("../data/churn.txt")
print("The churn dataset has {:,d} records with {:,d} variables each.".format(*churn.shape))

In [None]:
# Descriptive statistics of the churn dataset (displayed as the cell output)
churn.describe()

In [None]:
# Proportion of churners
# Count the records in each 'Churn' group ...
test = (
    churn
    .groupby('Churn')[['Churn']]
    .agg(['count'])
)
# ... then express each count as a percentage of the column total.
test.apply(lambda counts: 100 * counts / float(counts.sum()))

In [None]:
# Arrests Dataset
# The first CSV column is used as the index; DataFrame.shape is
# (records, variables).
arrests = pd.read_csv("../data/arrests.csv", index_col=0)
print("The arrests dataset has {:,d} records with {:,d} variables each.".format(*arrests.shape))

In [None]:
arrests.describe()

In [None]:
# Baseball Dataset
# The first CSV column is used as the index.
baseball = pd.read_csv("../data/baseball.csv", index_col=0)
# Report rows and columns separately: the variable count is the number of
# columns, not the number of records.
print ("The baseball dataset has {:,d} records with {:,d} variables each.".format(
    len(baseball), len(baseball.columns)))

In [None]:
# Descriptive statistics of the baseball dataset (displayed as the cell output)
baseball.describe()

In [None]:
import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and its two-sided t-based confidence interval.

    Args:
        data: Sequence of numeric observations (anything ``np.array`` accepts).
        confidence: Confidence level of the interval, e.g. 0.95 for 95%.

    Returns:
        A tuple ``(mean, lower_limit, upper_limit)``.
    """
    # Multiplying by 1.0 promotes integer input to floating point.
    values = 1.0 * np.array(data)
    center = np.mean(values)
    std_err = scipy.stats.sem(values)
    # Two-sided critical value of Student's t with n - 1 degrees of freedom.
    dof = len(values) - 1
    t_crit = scipy.stats.t.ppf((1 + confidence) / 2., dof)
    half_width = std_err * t_crit
    return center, center - half_width, center + half_width

In [None]:
# 1-sample t-test on the random variate sample
# H_0: the population mean is 225 (the mean the sample was generated with)
t, p = scipy.stats.ttest_1samp(nrvgs, popmean=225)
(t, p)

In [None]:
# Sample mean with lower/upper 95% confidence limits for the random variates
x, lcl, ucl = mean_confidence_interval(nrvgs, confidence=0.95)
(x, lcl, ucl)

In [None]:
# Example - the baseball dataset
# H_0: mean number of hits = 65
t, p = scipy.stats.ttest_1samp(baseball["hits"], popmean=65)
(t, p)

In [None]:
# Sample mean with lower/upper 95% confidence limits for the number of hits
x, lcl, ucl = mean_confidence_interval(baseball["hits"], confidence=0.95)
(x, lcl, ucl)

In [None]:
# Testing H_0 at a limit of the 95% confidence interval should give a
# p-value close to .05 (alpha from the CI).
# NOTE(review): 68.293321 is presumably one of the CI limits shown by the
# preceding cell -- confirm against that cell's output.
t, p = scipy.stats.ttest_1samp(baseball["hits"], popmean=68.293321)
(t, p)