In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ttest_1samp
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind
from scipy.stats import bartlett

In [2]:
# Read the housing data from a CSV file given by a relative path, then
# preview the first rows to check that it parsed as expected.
hs = pd.read_csv("Housing.csv")
hs.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [3]:
# (number of rows, number of columns) of the loaded frame.
hs.shape

(546, 12)

In [4]:
# Column labels of the loaded frame.
hs.columns

Index(['price', 'lotsize', 'bedrooms', 'bathrms', 'stories', 'driveway',
       'recroom', 'fullbase', 'gashw', 'airco', 'garagepl', 'prefarea'],
      dtype='object')

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame's columns.
hs.describe()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl
count,546.0,546.0,546.0,546.0,546.0,546.0
mean,68121.59707,5150.265568,2.965201,1.285714,1.807692,0.692308
std,26702.670926,2168.158725,0.737388,0.502158,0.868203,0.861307
min,25000.0,1650.0,1.0,1.0,1.0,0.0
25%,49125.0,3600.0,2.0,1.0,1.0,0.0
50%,62000.0,4600.0,3.0,1.0,2.0,0.0
75%,82000.0,6360.0,3.0,2.0,2.0,1.0
max,190000.0,16200.0,6.0,4.0,4.0,3.0


In [6]:
# Distinct labels in 'prefarea'; the group split that follows filters on these values.
hs['prefarea'].unique()

array(['no', 'yes'], dtype=object)

In [7]:
# Partition the rows by the 'prefarea' flag:
#   Y -> rows where prefarea == 'yes'
#   N -> rows where prefarea == 'no'
Y = hs.loc[hs['prefarea'].eq('yes')]
N = hs.loc[hs['prefarea'].eq('no')]

In [8]:
# Bartlett's test of the null hypothesis that the prices in the two
# prefarea groups (Y, N) have equal variance.
bartlett(
    Y['price'],
    N['price'],
)

BartlettResult(statistic=5.077175886434614, pvalue=0.0242428265503372)

In [9]:
# Welch's two-sample t-test (equal_var=False), one-sided:
# alternative hypothesis is that the mean price of Y exceeds that of N.
ttest_ind(
    Y['price'],
    N['price'],
    equal_var=False,
    alternative='greater',
)

Ttest_indResult(statistic=7.478383024308103, pvalue=1.4106605533348066e-12)

In [10]:
# Mean price within each 'prefarea' group, to read alongside the t-test.
(
    hs
    .groupby('prefarea')['price']
    .mean()
)

prefarea
no     63263.485646
yes    83986.367188
Name: price, dtype: float64

In [11]:
# Partition the rows by the 'airco' flag.
# NOTE: this rebinds Y and N, which previously held the 'prefarea' groups;
# every later use of Y / N refers to the 'airco' groups.
Y = hs.loc[hs['airco'].eq('yes')]
N = hs.loc[hs['airco'].eq('no')]

In [12]:
# Bartlett's test of equal price variance between the two 'airco' groups
# (Y and N were rebound to the airco split in the preceding cell).
bartlett(
    Y['price'],
    N['price'],
)

BartlettResult(statistic=20.661740703745085, pvalue=5.480011842564586e-06)

In [13]:
# Welch's t-test (equal_var=False), one-sided:
# alternative hypothesis is that the mean price of Y (airco == 'yes')
# exceeds that of N (airco == 'no').
ttest_ind(
    Y['price'],
    N['price'],
    equal_var=False,
    alternative='greater',
)

Ttest_indResult(statistic=10.698827321037546, pvalue=9.657344761913888e-23)

In [14]:
# Same Welch setup with the opposite one-sided alternative:
# mean price of Y (airco == 'yes') is less than that of N (airco == 'no').
ttest_ind(
    Y['price'],
    N['price'],
    equal_var=False,
    alternative='less',
)

Ttest_indResult(statistic=10.698827321037546, pvalue=1.0)

In [15]:
# Mean price within each 'airco' group, to read alongside the t-tests.
(
    hs
    .groupby('airco')['price']
    .mean()
)

airco
no     59884.852547
yes    85880.589595
Name: price, dtype: float64

In [16]:
# Distinct labels in 'gashw'; the group split that follows filters on these values.
hs['gashw'].unique()

array(['no', 'yes'], dtype=object)

In [17]:
# Partition the rows by the 'gashw' flag:
#   Y1 -> rows where gashw == 'yes'
#   N1 -> rows where gashw == 'no'
Y1 = hs.loc[hs['gashw'].eq('yes')]
N1 = hs.loc[hs['gashw'].eq('no')]

In [18]:
# Bartlett's test of the null hypothesis that the prices in the two
# 'gashw' groups (Y1, N1) have equal variance.
bartlett(
    Y1['price'],
    N1['price'],
)

BartlettResult(statistic=0.9637521508951223, pvalue=0.32624333311820414)

In [19]:
# Welch's t-test (equal_var=False), one-sided:
# alternative hypothesis is that the mean price of Y1 exceeds that of N1.
# NOTE(review): in the recorded run the Bartlett check on these two groups
# did not reject equal variances, yet the unequal-variance form is still
# used here. Welch's test remains valid; confirm this choice is intended.
ttest_ind(
    Y1['price'],
    N1['price'],
    equal_var=False,
    alternative='greater',
)

Ttest_indResult(statistic=1.9151312437265837, pvalue=0.0333181910913488)

In [20]:
# Same Welch setup with the opposite one-sided alternative:
# mean price of Y1 (gashw == 'yes') is less than that of N1 (gashw == 'no').
ttest_ind(
    Y1['price'],
    N1['price'],
    equal_var=False,
    alternative='less',
)

Ttest_indResult(statistic=1.9151312437265837, pvalue=0.9666818089086512)

In [21]:
# Welch's t-test (equal_var=False) with no `alternative` argument, so
# scipy's default alternative applies to the Y1 vs N1 price comparison.
ttest_ind(
    Y1['price'],
    N1['price'],
    equal_var=False,
)

Ttest_indResult(statistic=1.9151312437265837, pvalue=0.0666363821826976)

In [22]:
# Mean price within each 'gashw' group, to read alongside the t-tests.
(
    hs
    .groupby('gashw')['price']
    .mean()
)

gashw
no     67579.06334
yes    79428.00000
Name: price, dtype: float64

# Chi-Square Test of Independence

In [23]:
from scipy.stats import chi2_contingency

In [24]:
# Preview the first rows again before building the contingency table.
hs.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [25]:
# Contingency table of counts: rows are 'prefarea' labels, columns are
# 'gashw' labels. Displayed as the cell's result.
ctab = pd.crosstab(
    index=hs['prefarea'],
    columns=hs['gashw'],
)
ctab

gashw,no,yes
prefarea,Unnamed: 1_level_1,Unnamed: 2_level_1
no,396,22
yes,125,3


In [26]:
# Chi-square test on the prefarea x gashw contingency table.
# The result unpacks into: the test statistic, its p-value, the degrees of
# freedom, and the table of expected frequencies.
# Note: `df` here is the degrees of freedom, not a DataFrame.
test_statistic, p_value, df, expected_frequencies = chi2_contingency(ctab)

# Emit the four values one per line, in the order they were unpacked.
print(test_statistic, p_value, df, expected_frequencies, sep="\n")

1.3017707051446878
0.25389003711306934
1
[[398.86080586  19.13919414]
 [122.13919414   5.86080586]]
