# Univariate and Multivariate Statistics

Based on Chapters 5 and 6 of Larose and Larose

2/2/2019 - Jeff Smith

In [None]:
# Standard Setup
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
np.__version__, pd.__version__

In [None]:
# Draw 3,000 normal random variates (mean 225, standard deviation 47)
nrvgs = np.random.normal(loc=225, scale=47, size=3000)

In [None]:
# Summary statistics of the generated sample; the mean and std shown should
# land near the 225 and 47 used to draw it
pd.DataFrame(nrvgs).describe()

In [None]:
# Churn Dataset: load the file and report its (records, variables) dimensions
churn = pd.read_csv("../data/churn.txt")
print("The churn dataset has {:,d} records with {:,d} variables each.".format(
    *churn.shape))

In [None]:
# Summary statistics for the churn dataset
churn.describe()

In [None]:
# Proportion of churners: count the records in each Churn group, then
# express every count as a percentage of the column total
test = churn.groupby('Churn')[['Churn']].agg(['count'])
100 * test / test.sum()

In [None]:
# Arrests Dataset: the first CSV column is used as the row index
arrests = pd.read_csv("../data/arrests.csv", index_col=0)
print("The arrests dataset has {:,d} records with {:,d} variables each.".format(
    *arrests.shape))

In [None]:
# Summary statistics for the arrests dataset
arrests.describe()

In [None]:
# Baseball Dataset: the first CSV column is used as the row index
baseball = pd.read_csv("../data/baseball.csv", index_col=0)
# shape is (records, variables); the variable count must come from the
# columns, not from a second len(baseball), which would repeat the row count
print("The baseball dataset has {:,d} records with {:,d} variables each.".format(
    *baseball.shape))

In [None]:
# Summary statistics for the baseball dataset
baseball.describe()

In [None]:
def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and a t-based confidence interval for it.

    Parameters
    ----------
    data : array-like of numbers
        The sample.
    confidence : float, optional
        Confidence level of the interval (default 0.95).

    Returns
    -------
    tuple
        ``(mean, lower_limit, upper_limit)``.
    """
    sample = np.array(data) * 1.0  # promote to a float array
    dof = len(sample) - 1
    center = np.mean(sample)
    std_err = scipy.stats.sem(sample)
    # Half-width = standard error times the two-sided t critical value
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., dof)
    return center, center - half_width, center + half_width

In [None]:
# One-sample t-test on the random variate sample; H_0: mean = 225
# (the mean the sample was generated with)
t, p = scipy.stats.ttest_1samp(nrvgs, popmean=225)
t, p

In [None]:
# Sample mean with lower/upper confidence limits for the random variates,
# at the function's default confidence level of 0.95
x, lcl, ucl = mean_confidence_interval(nrvgs)
x, lcl, ucl

In [None]:
# Example - the baseball dataset
# One-sample t-test of H_0: mean number of hits = 65
t, p = scipy.stats.ttest_1samp(baseball['hits'], popmean=65)
t, p

In [None]:
# Sample mean with lower/upper confidence limits for baseball hits,
# at the function's default confidence level of 0.95
x, lcl, ucl = mean_confidence_interval(baseball.hits)
x, lcl, ucl

In [None]:
# Testing H_0 at the upper confidence limit for mean hits should give
# p = .05 (alpha = 1 - confidence). The limit is recomputed here instead of
# being pasted in as a literal, so the check stays valid if the data changes.
lcl, ucl = mean_confidence_interval(baseball.hits)[1:]
t, p = scipy.stats.ttest_1samp(baseball.hits, ucl)
t, p

In [None]:
# Testing H_0 at the lower confidence limit for mean hits should give
# p = .05 (alpha = 1 - confidence). The limit is recomputed here instead of
# being pasted in as a literal, so the check stays valid if the data changes.
lcl, ucl = mean_confidence_interval(baseball.hits)[1:]
t, p = scipy.stats.ttest_1samp(baseball.hits, lcl)
t, p

In [None]:
#
# Two-sided z-test for a proportion; H0: churn proportion = .15
# NOTE(review): the earlier header quoted .1449, which looks like the observed
# sample proportion rather than the hypothesized value -- confirm.
#
p0 = .15
# Sample proportion of churners; float() keeps this a true division under
# Python 2 as well
p = float(len(churn[churn.Churn == 'True.'])) / len(churn)
zdata = (p - p0) / np.sqrt(p0 * (1 - p0) / len(churn))
# Two-sided p-value: twice the upper-tail area beyond |z|. Using cdf(z)
# directly is only correct for negative z and exceeds 1 for positive z.
pval = 2 * scipy.stats.norm.sf(abs(zdata))
zdata, pval

In [None]:
#
# Randomly partition the churn dataset into a "training" and a "test" set:
# each record lands in training with probability .75, otherwise in test
#
mask = np.random.random_sample(len(churn)) < 0.75
churn_train, churn_test = churn[mask], churn[~mask]
len(churn_train), len(churn_test)

In [None]:
# Two-sample t-test (equal variances not assumed) comparing CustServCalls
# between the training and test partitions. Re-run the split and this test
# several times and note how the t, p values vary.
t, p = scipy.stats.ttest_ind(
    churn_train['CustServCalls'],
    churn_test['CustServCalls'],
    equal_var=False,
)
t, p

In [None]:
#
# One-way ANOVA on three groups (example from Sect. 6.5)
#
A, B, C = (
    [30, 40, 50, 60],
    [25, 30, 50, 55],
    [25, 30, 40, 45],
)
scipy.stats.f_oneway(A, B, C)

In [None]:
# One-way ANOVA on a second set of three groups
D, E, F = (
    [43, 45, 45, 47],
    [37, 40, 40, 43],
    [34, 35, 35, 36],
)
scipy.stats.f_oneway(D, E, F)

In [None]:
#
# One-way ANOVA on five groups (Example 11-1 from Hines & Montgomery)
#
A, B, C, D, E = (
    [7, 7, 15, 11, 9],
    [12, 17, 12, 18, 18],
    [14, 18, 18, 19, 19],
    [19, 25, 22, 19, 23],
    [7, 10, 11, 15, 11],
)
scipy.stats.f_oneway(A, B, C, D, E)

In [None]:
# Element-wise comparison: True for each record whose Churn value is the
# string 'True.' (the same condition used to count churners above)
churn.Churn == 'True.'