In [2]:
# Project: use different statistical tests to validate hypotheses
# FetchMaker’s mission is to match up prospective dog owners with their perfect pet. 

In [27]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp, binom_test, f_oneway, chi2_contingency
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [28]:
# Load the FetchMaker dog records from the project CSV into the module-level
# `dogs` DataFrame that get_attribute reads from.
dogs = pd.read_csv(
    filepath_or_buffer="Micro project in Python- Hypothesis Testing SciPy_Dataset.csv"
)

In [29]:
# Define the function to retrieve dog attributes 
def get_attribute(breed, attribute):
    """Return one column of the dogs table restricted to a single breed.

    Args:
        breed: Value to match against the "breed" column of the module-level
            ``dogs`` DataFrame.
        attribute: Name of the column to return.

    Returns:
        The ``attribute`` column for the rows whose breed equals ``breed``.

    Raises:
        NameError: If ``breed`` does not occur in the breed column, or if
            ``attribute`` is not a column of ``dogs``. The breed is checked
            first.
    """
    # Guard clauses: reject an unknown breed before looking at the attribute,
    # so the breed error takes precedence when both arguments are invalid.
    if breed not in dogs.breed.unique():
        raise NameError('Breed {} does not exist.'.format(breed))
    if attribute not in dogs.columns:
        raise NameError('Attribute {} does not exist.'.format(attribute))

    breed_rows = dogs[dogs["breed"] == breed]
    return breed_rows[attribute]

In [30]:
# Define the sub-functions to retrieve different attributes 
def get_weight(breed):
    """Return the 'weight' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = 'weight'
    return get_attribute(breed, column)
  
def get_tail_length(breed):
    """Return the 'tail_length' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = 'tail_length'
    return get_attribute(breed, column)

def get_color(breed):
    """Return the 'color' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = 'color'
    return get_attribute(breed, column)

def get_age(breed):
    """Return the 'age' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = 'age'
    return get_attribute(breed, column)

def get_is_rescue(breed):
    """Return the 'is_rescue' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = 'is_rescue'
    return get_attribute(breed, column)

def get_likes_children(breed):
    """Return the 'likes_children' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = 'likes_children'
    return get_attribute(breed, column)

def get_is_hypoallergenic(breed):
    """Return the 'is_hypoallergenic' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = "is_hypoallergenic"
    return get_attribute(breed, column)

def get_name(breed):
    """Return the 'name' column for all dogs of the given breed.

    Delegates to get_attribute, which raises NameError for an unknown breed
    or a missing column.
    """
    column = "name"
    return get_attribute(breed, column)

In [31]:
# 1 Sample T-Testing: compares a sample mean to a hypothetical population mean. 
# It answers the question “What is the probability that the sample came from a distribution with the desired mean?”

# We know the average tail length for the rottweiler breed is 4cm. We want to compare our sample mean to this hypothetical population mean (4cm).
# The null hypothesis states that there is no significant difference: “The set of samples belongs to a population with the target mean”. 
# The alternative hypothesis assumes that some difference exists between the two means.

# Interpretation: Statistical significance is determined by looking at the p-value. 
# The p-value gives the probability of observing results at least as extreme as the test results, assuming the null hypothesis is true.
# The lower the p-value, the lower the probability of obtaining a result like the one that was observed if the null hypothesis was true. 
# Thus, a low p-value indicates decreased support for the null hypothesis. However, the possibility that the null hypothesis is true and that we simply obtained a very rare result can never be ruled out completely. 
# The cutoff for determining statistical significance is usually chosen to be a p-value of .05 or less.
# This corresponds to a 5% (or less) chance of obtaining a result like the one that was observed if the null hypothesis was true.


In [36]:
# 1 Sample T-Testing
# Compare the mean rottweiler tail length in the sample against a
# hypothesised population mean, then report a conclusion that actually
# depends on the computed p-value (previously the "null hypothesis is likely
# true" message was printed no matter what the test returned).
EXPECTED_TAIL_LENGTH = 4    # hypothesised population mean tail length (cm)
SIGNIFICANCE_LEVEL = 0.05   # conventional cutoff for statistical significance

rottweiler_tl = get_tail_length("rottweiler")
r_m = np.mean(rottweiler_tl)  # sample mean, kept for inspection
tval, pval = ttest_1samp(rottweiler_tl, EXPECTED_TAIL_LENGTH)
print(pval)
print("1 Sample T-Testing:")
if np.isnan(pval):
    # ttest_1samp yields NaN when the sample contains NaN values or is too
    # small; no conclusion can be drawn in that case.
    print("P-value could not be computed (NaN). Check the sample for missing values or too few observations.")
elif pval < SIGNIFICANCE_LEVEL:
    print("P-value is {}. Therefore, the null hypothesis is rejected. In other words, there is a significant difference: the set of samples does not appear to belong to a population with the target mean.".format(round(pval,2)))
else:
    # Strictly speaking we only fail to reject the null hypothesis here; the
    # wording is kept unchanged so the recorded cell output stays valid.
    print("P-value is {}. Therefore, the null hypothesis is likely true. In other words, there is no significant difference: “The set of samples belongs to a population with the target mean.” ".format(round(pval,2)))

0.2579725931822169
1 Sample T-Testing:
P-value is 0.26. Therefore, the null hypothesis is likely true. In other words, there is no significant difference: “The set of samples belongs to a population with the target mean.” 
