# Conduct a hypothesis test

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np

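### avocado_info2 is a pandas DataFrame that contains avocado sales records --- for each row, the average price (`AveragePrice`) and whether the avocados are organic or conventional (`type`). This notebook adapts an exercise originally framed around a sample of avocado trees: the number of days a tree took to grow avocado fruit corresponds to `AveragePrice` here, and whether it received fertilizer (True or False) corresponds to `type` (organic or conventional).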

In [3]:
# Load the avocado data from a CSV file; the relative path is resolved against
# the directory the notebook kernel is running in.
avocado_info2 = pd.read_csv('avocado_info2.csv')

In [4]:
avocado_info2

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,12/27/2015,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,12/20/2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,12/13/2015,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,12/06/2015,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,4,11/29/2015,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,02/04/2018,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico
18245,8,01/28/2018,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
18246,9,01/21/2018,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
18247,10,01/14/2018,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


In [5]:
list(avocado_info2.columns.values)

# Fertilizer -> type
# True -> organic
# False -> conventional
# Growth Duration -> AveragePrice

# Code for later:

"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np

avocado_info = pd.read_csv('avocado.csv')

organic = avocado_info.loc[avocado_info['type'] == 'organic']
not_organic = avocado_info.loc[avocado_info['type'] == 'conventional']

sn.distplot(organic['AveragePrice'], kde = False, label = 'Organic')
sn.distplot(not_organic['AveragePrice'], kde = False, label = 'Not Organic')
plt.legend()
plt.show()

observed_test_stat = np.mean(organic['AveragePrice']) - np.mean(not_organic['AveragePrice'])

def perm(data):
     return data.sample(frac = 1).reset_index(drop = True)
     
sim_test_stat = np.array([])
     
reps = 1000

for i in range(reps):
     perm_info = perm(avocado_info['AveragePrice'])
     df = pd.DataFrame({'Permuted Price': perm_info, 'type': avocado_info['type']})
     organic = df.loc[df['type'] == 'organic', 'Permuted Price']
     not_organic = df.loc[df['type'] == 'conventional', 'Permuted Price']
     stat = np.mean(organic) - np.mean(not_organic)
     sim_test_stat = np.append(sim_test_stat, stat)
     p_value = np.count_nonzero(sim_test_stat >= observed_test_stat)/reps
     p_value
"""

['Unnamed: 0',
 'Date',
 'AveragePrice',
 'Total Volume',
 '4046',
 '4225',
 '4770',
 'Total Bags',
 'Small Bags',
 'Large Bags',
 'XLarge Bags',
 'type',
 'year',
 'region']

In [6]:
# Split the rows into the two groups being compared. Each subset keeps every
# column and the original row labels of avocado_info2.
organic = avocado_info2[avocado_info2['type'].eq('organic')]
not_organic = avocado_info2[avocado_info2['type'].eq('conventional')]

In [7]:
# Observed test statistic: mean organic price minus mean conventional price.
mean_organic_price = organic['AveragePrice'].mean()
mean_conventional_price = not_organic['AveragePrice'].mean()
observed_test_stat = mean_organic_price - mean_conventional_price

In [8]:
observed_test_stat

0.4959590177573887

In [9]:
avocado_info2['AveragePrice'].sample(frac = 1)

9620     1.49
12944    1.19
13096    1.20
2160     0.87
6762     0.98
         ... 
11699    1.84
18240    1.54
10035    2.40
7180     1.54
1441     0.97
Name: AveragePrice, Length: 18249, dtype: float64

In [10]:
avocado_info2['AveragePrice'].sample(frac = 1).reset_index()

Unnamed: 0,index,AveragePrice
0,17817,1.31
1,13763,1.67
2,9595,1.47
3,120,1.18
4,11754,1.73
...,...,...
18244,15047,2.05
18245,1338,1.19
18246,3682,1.17
18247,12967,1.37


In [11]:
avocado_info2['AveragePrice'].sample(frac = 1).reset_index(drop = True)

0        1.33
1        2.12
2        1.75
3        0.78
4        1.16
         ... 
18244    1.32
18245    1.99
18246    1.15
18247    1.17
18248    1.47
Name: AveragePrice, Length: 18249, dtype: float64

In [12]:
def perm(data, random_state=None):
    """Return a randomly shuffled copy of ``data`` with a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values (or rows) to shuffle. The input object is not modified.
    random_state : optional
        Forwarded unchanged to ``data.sample``. Leave as ``None`` (the
        default) to get a different shuffle on every call; pass an integer
        seed to make the shuffle reproducible.

    Returns
    -------
    Same type as ``data``
        Every original value exactly once, in random order, relabelled
        0..n-1.
    """
    # frac=1 draws all rows without replacement, i.e. a full permutation.
    shuffled = data.sample(frac=1, random_state=random_state)
    # Drop the old labels: if they were kept, combining the result with another
    # column would re-align on them and silently undo the shuffle.
    return shuffled.reset_index(drop=True)

In [13]:
reps = 1000

# Group membership is the same in every repetition, so build the boolean masks
# once instead of rebuilding a DataFrame inside the loop. They are plain NumPy
# arrays so the selection below is positional, matching the fresh 0..n-1 index
# returned by perm(). This assumes avocado_info2 has the default 0..n-1 index,
# as it does straight after read_csv.
organic_mask = (avocado_info2['type'] == 'organic').to_numpy()
conventional_mask = (avocado_info2['type'] == 'conventional').to_numpy()

# Preallocate the result array; growing it with np.append would copy the whole
# array on every iteration.
sim_test_stat = np.empty(reps)

for i in range(reps):
    # Shuffle the prices so any link between price and type is broken.
    permuted_price = perm(avocado_info2['AveragePrice'])

    # Use loop-specific names so the `organic` / `not_organic` DataFrames
    # created earlier in the notebook are not overwritten.
    permuted_organic = permuted_price[organic_mask]
    permuted_conventional = permuted_price[conventional_mask]

    # Same statistic as the observed one: difference of group means.
    sim_test_stat[i] = permuted_organic.mean() - permuted_conventional.mean()

In [14]:
sim_test_stat

array([ 1.97906030e-03,  5.43788006e-03,  1.36313612e-03,  5.24060897e-03,
       -1.21453943e-03,  9.47536167e-03,  9.07643569e-03, -1.08302538e-03,
       -8.47630740e-03,  9.32631240e-03,  6.29710524e-03, -2.77736484e-03,
       -1.06329827e-03, -1.63100062e-03,  2.35168347e-03,  1.70741988e-04,
       -4.73676901e-04, -5.35050129e-04, -1.72744427e-03, -4.82240846e-03,
        1.54287201e-03,  3.62298604e-03, -3.17409892e-03, -2.66119408e-03,
        1.23533277e-02, -1.80416080e-03,  1.56937848e-02, -6.73374612e-03,
        1.05187065e-02,  9.81949013e-03, -2.70941591e-03, -1.04336750e-02,
        7.27250118e-03,  6.34313516e-03,  3.47612867e-03, -8.43027748e-03,
        7.80090462e-04,  1.36971183e-03,  1.28203579e-03,  3.09254600e-03,
       -2.81024335e-03, -2.53625573e-03, -4.40813917e-03,  2.71569434e-04,
        1.30395480e-03, -6.51220881e-04,  5.08294740e-04, -1.81073650e-03,
        1.48822978e-04, -6.35877574e-04,  4.78469356e-03,  6.06037994e-03,
       -1.03613422e-02,  

In [19]:
# One-sided p-value: the share of shuffled statistics that are at least as
# large as the observed difference. A result of 0.0 means none of the `reps`
# shuffles reached the observed value, i.e. the p-value is below 1 / reps.
at_least_observed = sim_test_stat >= observed_test_stat
p_value = np.count_nonzero(at_least_observed) / reps

In [20]:
p_value

0.0