In [77]:
# Import libraries
import numpy as np
import pandas as pd
from scipy.stats import binomtest, f_oneway, chi2_contingency
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Read the dog dataset from the CSV file in the working directory
DATA_PATH = 'dog_data.csv'
dogs = pd.read_csv(DATA_PATH)

In [78]:
# Preview the first five rows to see the available columns
dogs.head(5)

Unnamed: 0,is_rescue,weight,tail_length,age,color,likes_children,is_hypoallergenic,name,breed
0,0,6,2.25,2,black,1,0,Huey,chihuahua
1,0,4,5.36,4,black,0,0,Cherish,chihuahua
2,0,7,3.63,3,black,0,1,Becka,chihuahua
3,0,5,0.19,2,black,0,0,Addie,chihuahua
4,0,5,0.37,1,black,1,1,Beverlee,chihuahua


In [79]:
# Hypothesized share of whippets that are rescues, stored as a proportion (8 out of 100)
estimated_rescued_pct = 8 / 100

In [80]:
# Build one boolean mask per condition, then count the rows satisfying both
is_whippet = dogs['breed'] == 'whippet'
is_rescued = dogs['is_rescue'] == 1
num_whipped_rescues = int((is_whippet & is_rescued).sum())
print(num_whipped_rescues)

6


In [81]:
# Select the is_rescue column for the rows whose breed is whippet
whippet_rescue = dogs.loc[dogs['breed'] == 'whippet', 'is_rescue']
whippet_rescue.head()

700    0
701    0
702    0
703    0
704    0
Name: is_rescue, dtype: int64

How many whippets are rescues (remember that the value of is_rescue is 1 for rescues and 0 otherwise)? Save this number as num_whippet_rescues and print it out.

In [82]:
# Count the rescues among the whippets (is_rescue is 1 for rescues, 0 otherwise)
num_whippet_rescues = np.sum(whippet_rescue == 1)
# Keep the original misspelled name as an alias so cells that refer to it still run
num_whipped_rescues = num_whippet_rescues
print(num_whippet_rescues)

# Total number of whippets in the sample data
num_whippets = len(whippet_rescue)
print(num_whippets)

6
100


Use a hypothesis test to test the following null and alternative hypotheses:

Null: 8% of whippets are rescues
Alternative: more or less than 8% of whippets are rescues
Save the p-value from this test as pval and print it out. Using a significance threshold of 0.05, is the proportion of whippets who are rescues significantly different from 8%?

In [83]:
# Two-sided exact binomial test of the null hypothesis that the whippet rescue
# rate equals the estimated proportion defined earlier (estimated_rescued_pct = 0.08).
binom_result = binomtest(num_whipped_rescues, num_whippets, estimated_rescued_pct)

# binomtest returns a BinomTestResult object; the p-value is its .pvalue
# attribute, so store that float in pval rather than the whole result object.
pval = binom_result.pvalue
print(pval)

BinomTestResult(k=6, n=100, alternative='two-sided', statistic=0.06, pvalue=0.5811780106238105)


## Mid-Sized Dog Weights

In [84]:
# Keep only the rows for the three mid-sized breeds being compared
mid_sized_breeds = ['whippet', 'terrier', 'pitbull']
dogs_wtp = dogs.loc[dogs['breed'].isin(mid_sized_breeds)]

# Pull out the weight column for each individual breed
wt_whippets = dogs_wtp.loc[dogs_wtp['breed'] == 'whippet', 'weight']
wt_terriers = dogs_wtp.loc[dogs_wtp['breed'] == 'terrier', 'weight']
wt_pitbulls = dogs_wtp.loc[dogs_wtp['breed'] == 'pitbull', 'weight']

dogs_wtp.head()


Unnamed: 0,is_rescue,weight,tail_length,age,color,likes_children,is_hypoallergenic,name,breed
200,0,71,5.74,4,black,0,0,Charlot,pitbull
201,0,26,11.56,3,black,0,0,Jud,pitbull
202,0,56,10.76,4,black,0,0,Rosamund,pitbull
203,0,33,6.32,4,black,1,0,Ruthann,pitbull
204,0,54,17.18,4,black,1,1,Bryon,pitbull



Run a single hypothesis test to address the following null and alternative hypotheses:

Null: whippets, terriers, and pitbulls all weigh the same amount on average  
Alternative: whippets, terriers, and pitbulls do not all weigh the same amount on average (at least one pair of breeds has differing average weights)  

Save the resulting p-value as pval and print it out. Using a significance threshold of 0.05, is there at least one pair of dog breeds that have significantly different average weights?

In [85]:
# One-way ANOVA across the three breeds' weights; keep the F statistic and the p-value
anova_result = f_oneway(wt_pitbulls, wt_terriers, wt_whippets)
fstat = anova_result.statistic
pval = anova_result.pvalue
print(pval)

3.276415588274815e-17


In [86]:
# Tukey's range test on weight grouped by breed, at a 0.05 family-wise error rate
tukey_results = pairwise_tukeyhsd(
    endog=dogs_wtp['weight'],
    groups=dogs_wtp['breed'],
    alpha=0.05,
)
print(tukey_results)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower    upper  reject
-------------------------------------------------------
pitbull terrier   -13.24    0.0 -16.7278 -9.7522   True
pitbull whippet    -3.34 0.0638  -6.8278  0.1478  False
terrier whippet      9.9    0.0   6.4122 13.3878   True
-------------------------------------------------------


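> Pitbulls vs. terriers and terriers vs. whippets differ significantly in average weight (p-adj < 0.05, reject = True). Pitbulls and whippets do not differ significantly (p-adj = 0.0638, reject = False).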

## Poodle and Shihtzu Colors

In [87]:
# Restrict the data to the two breeds whose colors are being compared
color_breeds = ['poodle', 'shihtzu']
dogs_ps = dogs.loc[dogs['breed'].isin(color_breeds)]
dogs_ps.head()

Unnamed: 0,is_rescue,weight,tail_length,age,color,likes_children,is_hypoallergenic,name,breed
300,0,58,8.05,1,black,1,0,Moise,poodle
301,0,56,9.44,4,black,1,0,Boote,poodle
302,1,59,4.04,4,black,1,0,Beatrix,poodle
303,0,70,12.37,1,black,1,0,Rabbi,poodle
304,0,52,11.42,2,black,0,0,Tallou,poodle


In [88]:
# Contingency table of counts: one row per breed, one column per color
xtab = pd.crosstab(index=dogs_ps['breed'], columns=dogs_ps['color'])
xtab

color,black,brown,gold,grey,white
breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
poodle,17,13,8,52,10
shihtzu,10,36,6,41,7


Run a hypothesis test for the following null and alternative hypotheses:

Null: There is not an association between breed (poodle vs. shihtzu) and color.  
Alternative: There is an association between breed (poodle vs. shihtzu) and color.  

Save the p-value as pval and print it out. Do poodles and shihtzus come in significantly different color combinations? Use a significance threshold of 0.05.

In [92]:
# Chi-square test on the breed-by-color contingency table;
# unpack the statistic, p-value, degrees of freedom and expected frequencies
chi2_output = chi2_contingency(xtab)
chi2, pval, dof, exp = chi2_output
print(pval)

0.005302408293244597


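> Since p ≈ 0.005 is below the 0.05 threshold, reject the null hypothesis of no association: poodles and shihtzus come in significantly different color distributions.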