In [1]:
import math
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from statsmodels.stats.weightstats import DescrStatsW

In [2]:
# One-sample z statistic: distance of the observed mean from the reference
# mean, measured in standard errors of the mean.
standard_m, observed_m = 9.5, 9.57
sig, N = 0.4, 160

std_error = sig / math.sqrt(N)
Z = (observed_m - standard_m) / std_error
Z

2.213594362117875

In [10]:
# Two-sided p-value for the z statistic, rounded to 4 decimals.
# norm.sf evaluates the upper tail (1 - cdf) directly, which avoids the
# floating-point cancellation of `1 - cdf(...)` when |Z| is large.
round(2 * stats.norm.sf(abs(Z)), 4)

0.0269

In [11]:
# Show only the first line of the file to inspect the column layout.
with open('diamonds.txt') as header_file:
    print(header_file.readline())

carat	depth	table	price	x	y	z



In [12]:
# Load the tab-separated diamonds table into a DataFrame of floats.
# Field order in the file, as listed by its header row.
DIAMOND_COLUMNS = ('carat', 'depth', 'table', 'price', 'x', 'y', 'z')

diamond_rows = []
with open('diamonds.txt') as fp:
    fp.readline()  # discard the header row
    for line in fp:
        line = line.strip()
        # Test the stripped text itself: ''.split('\t') yields [''], which is
        # truthy, so checking the split result never skips a blank line and
        # float('') would then raise ValueError.
        if not line:
            continue
        data = line.split('\t')
        # Positional indexing keeps the IndexError on rows with too few fields.
        diamond_rows.append(
            {name: float(data[pos]) for pos, name in enumerate(DIAMOND_COLUMNS)}
        )

# Passing the column list keeps the schema even when no data rows were read.
diamond_data = pd.DataFrame(diamond_rows, columns=list(DIAMOND_COLUMNS))

In [13]:
# Preview the first rows of the parsed table as a sanity check on the load.
diamond_data.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326.0,3.95,3.98,2.43
1,0.21,59.8,61.0,326.0,3.89,3.84,2.31
2,0.23,56.9,65.0,327.0,4.05,4.07,2.31
3,0.29,62.4,58.0,334.0,4.2,4.23,2.63
4,0.31,63.3,58.0,335.0,4.34,4.35,2.75


In [14]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
diamonds_train_data, diamonds_test_data = train_test_split(
    diamond_data,
    test_size=0.25,
    random_state=1,
)

In [15]:
# (rows, columns) of the training split.
diamonds_train_data.shape

(40455, 7)

In [16]:
# (rows, columns) of the held-out test split.
diamonds_test_data.shape

(13485, 7)

In [17]:
# Predictor columns: every column of the table except the target, `price`.
feat_names = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]

In [18]:
# Fit ordinary least squares on the training split: feature columns -> price.
train_features = diamonds_train_data[feat_names]
train_target = diamonds_train_data.price

lin_est = LinearRegression()
lin_est.fit(train_features, train_target)

LinearRegression()

In [19]:
# Fitted linear coefficients, one per entry of feat_names (same order).
lin_est.coef_

array([10479.64722682,  -199.87725876,  -102.34157431, -1224.74425649,
          41.70845285,    71.50629605])

In [20]:
# Linear-model price predictions for the held-out rows.
test_features = diamonds_test_data[feat_names]
lin_forecast = lin_est.predict(test_features)
lin_forecast

array([ 778.42235621, 6930.46053685, 2520.12128307, ..., 5456.49180326,
       3630.17181904, 1203.67470294])

In [21]:
# Fit a random forest on the same training data; the seed fixes the
# estimator's randomness so results are repeatable.
rf_est = RandomForestRegressor(random_state=1)
rf_est.fit(
    diamonds_train_data[feat_names],
    diamonds_train_data['price'],
)

RandomForestRegressor(random_state=1)

In [22]:
# Random-forest price predictions for the held-out rows.
rf_test_features = diamonds_test_data[feat_names]
rf_forecast = rf_est.predict(rf_test_features)
rf_forecast

array([ 846.78871429, 6712.69      , 2256.71      , ..., 4209.68      ,
       3464.54      , 1072.57166667])

In [35]:
# Per-row absolute error of the linear model on the test split
# (a Series that keeps the test rows' index).
lin_residuals = lin_forecast - diamonds_test_data.price
lin_abs_deltas = lin_residuals.abs()
lin_abs_deltas

2714      214.422356
14653    1016.460537
52760      41.878717
48658       7.165645
14812    1113.152701
            ...     
50310     898.690921
39735      49.903573
2915     2171.491803
4960      103.828181
42912     159.325297
Name: price, Length: 13485, dtype: float64

In [27]:
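# NOTE: superseded list-comprehension variant of the vectorised `np.abs`
# computation of lin_abs_deltas; kept for reference only. The array printed
# under this cell is presumably left over from when this version was executed.
#lin_abs_deltas = np.array([abs(x) for x in lin_forecast - diamonds_test_data.price.values])
#lin_abs_deltas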

array([ 214.42235621, 1016.46053685,   41.87871693, ..., 2171.49180326,
        103.82818096,  159.32529706])

In [28]:
# Per-row absolute error of the random forest on the test split
# (a Series that keeps the test rows' index).
rf_residuals = rf_forecast - diamonds_test_data.price
rf_abs_deltas = rf_residuals.abs()
rf_abs_deltas

2714      282.788714
14653     798.690000
52760     305.290000
48658      59.655000
14812    1905.700000
            ...     
50310     224.860000
39735     182.770000
2915      924.680000
4960      269.460000
42912     290.428333
Name: price, Length: 13485, dtype: float64

In [29]:
# Mean absolute error of the linear model on the test split.
lin_diff = lin_abs_deltas.mean()
lin_diff

890.3764004285603

In [30]:
# Mean absolute error of the random forest on the test split.
rf_diff = rf_abs_deltas.mean()
rf_diff

779.711758355821

In [38]:
# Size of the gap between the two models' mean absolute errors.
abs(lin_diff - rf_diff)

110.6646420727393

In [31]:
# Paired t-test on the per-row absolute errors: both samples come from the
# same test rows, so the observations are matched.
res = stats.ttest_rel(a=rf_abs_deltas, b=lin_abs_deltas)
res

Ttest_relResult(statistic=-18.03725974451115, pvalue=6.936823477561721e-72)

In [32]:
# p-value of the paired t-test, rounded to 4 decimals.
p_value = res.pvalue
round(p_value, 4)

0.0

In [36]:
# Summary statistics of the linear model's absolute errors.
lin_abs_deltas.describe()

count    13485.000000
mean       890.376400
std       1161.068461
min          0.035915
25%        208.160385
50%        485.297677
75%       1098.149270
max      18239.846360
Name: price, dtype: float64

In [37]:
# Summary statistics of the random forest's absolute errors.
rf_abs_deltas.describe()

count    13485.000000
mean       779.711758
std       1113.574813
min          0.096000
25%        119.340000
50%        316.490000
75%        962.280000
max      13484.020000
Name: price, dtype: float64

In [33]:
# t-based confidence interval for the mean per-row difference in absolute
# error (linear minus random forest), using the sample std (ddof=1).
paired_error_gap = lin_abs_deltas - rf_abs_deltas
gap_stats = DescrStatsW(paired_error_gap, ddof=1)
interval = gap_stats.tconfint_mean()
interval

(98.63852573905375, 122.69075840642985)

In [84]:
# Lower bound of the confidence interval, rounded to 1 decimal.
interval_low = interval[0]
round(interval_low, 1)

98.6