In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sb
import matplotlib.pyplot as plt
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sqlalchemy import create_engine, insert
import sqlalchemy as sal
%matplotlib inline

In [2]:
# establish connection to the database
engine = sal.create_engine('sqlite:///winewebscraping.db')
# storing tables from database into dataframe
master = pd.read_sql('SELECT * FROM master', engine)
ratings = pd.read_sql('SELECT * FROM ratings', engine)
varietals = pd.read_sql('SELECT * FROM varietals', engine)

In [3]:
ratings.head(2)

Unnamed: 0,product_name,price,varietal,rating,rating_count,appellation,region,year
0,Dom Perignon Vintage with Gift Box,199,Vintage Sparkling Wine,4.5,42.0,['Champagne'],France,2010
1,Duckhorn Napa Valley Cabernet Sauvignon,78,Cabernet Sauvignon,4.3,62.0,['Napa Valley'],California,2017


## Univariate Regression

In [8]:
ratings['price'] = ratings['price'].astype(int)

In [9]:
# null hypothesis in thise case would be that more ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating_count']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,42.46
Date:,"Tue, 15 Dec 2020",Prob (F-statistic):,7.38e-11
Time:,21:50:16,Log-Likelihood:,-170600.0
No. Observations:,22004,AIC:,341200.0
Df Residuals:,22002,BIC:,341200.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,235.7938,3.873,60.885,0.000,228.203,243.385
rating_count,-1.0412,0.160,-6.516,0.000,-1.354,-0.728

0,1,2,3
Omnibus:,38528.502,Durbin-Watson:,1.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42883382.014
Skew:,12.523,Prob(JB):,0.0
Kurtosis:,217.816,Cond. No.,24.7


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the number of ratings do not affect the price of the wine. 

In [11]:
# null hypothesis in thise case would be that higher ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,99.21
Date:,"Tue, 15 Dec 2020",Prob (F-statistic):,2.5499999999999998e-23
Time:,21:55:51,Log-Likelihood:,-170570.0
No. Observations:,22004,AIC:,341200.0
Df Residuals:,22002,BIC:,341200.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,248.9227,4.204,59.217,0.000,240.683,257.162
rating,-22.5083,2.260,-9.960,0.000,-26.938,-18.079

0,1,2,3
Omnibus:,38625.965,Durbin-Watson:,1.858
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43559704.949
Skew:,12.587,Prob(JB):,0.0
Kurtosis:,219.512,Cond. No.,2.2


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the higher ratings do not affect the price of the wine.

In [12]:
# null hypothesis in thise case would be that higher ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'year']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.7366
Date:,"Tue, 15 Dec 2020",Prob (F-statistic):,0.391
Time:,21:57:42,Log-Likelihood:,-170620.0
No. Observations:,22004,AIC:,341300.0
Df Residuals:,22002,BIC:,341300.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,326.0563,110.926,2.939,0.003,108.634,543.478
year,-0.0473,0.055,-0.858,0.391,-0.155,0.061

0,1,2,3
Omnibus:,38478.58,Durbin-Watson:,1.853
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42538258.645
Skew:,12.49,Prob(JB):,0.0
Kurtosis:,216.946,Cond. No.,58700.0


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the year of the wine does not affect the price of the wine.

##  Bivariate Regression

In [10]:
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating', 'rating_count']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,53.42
Date:,"Tue, 15 Dec 2020",Prob (F-statistic):,7.2e-24
Time:,21:51:24,Log-Likelihood:,-170570.0
No. Observations:,22004,AIC:,341100.0
Df Residuals:,22001,BIC:,341200.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,248.9902,4.203,59.241,0.000,240.752,257.228
rating,-19.7769,2.467,-8.016,0.000,-24.613,-14.941
rating_count,-0.4803,0.174,-2.757,0.006,-0.822,-0.139

0,1,2,3
Omnibus:,38630.568,Durbin-Watson:,1.858
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43595250.02
Skew:,12.59,Prob(JB):,0.0
Kurtosis:,219.6,Cond. No.,27.8


In [15]:
unique_varietals = varietals['varietal'].sort_values().unique()

In [17]:
unique_varietals = pd.get_dummies(varietals['varietal'])

In [19]:
unique_varietals

Unnamed: 0,Agiorgitiko,Aglianico,Albarino,Alicante Bouschet,Arneis,Assyrtiko,Baga,Barbera,Blaufrankisch,Bobal,...,Tuscan Blends,Valdiguie,Verdejo,Verdicchio,Vermentino,Vintage Sparkling Wine,Viognier,Viura,Xinomavro,Zinfandel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
master['intercept'] = 1
lm = sm.OLS(master['price'], master['intercept', unique_varietals])
results = lm.fit()
results.summary()

TypeError: '('intercept',        Agiorgitiko  Aglianico  Albarino  Alicante Bouschet  Arneis  Assyrtiko  \
0                0          0         0                  0       0          0   
1                0          0         0                  0       0          0   
2                0          0         0                  0       0          0   
3                0          0         0                  0       0          0   
4                0          0         0                  0       0          0   
...            ...        ...       ...                ...     ...        ...   
21999            0          0         0                  0       0          0   
22000            0          0         0                  0       0          0   
22001            0          0         0                  0       0          0   
22002            0          0         0                  0       0          0   
22003            0          0         0                  0       0          0   

       Baga  Barbera  Blaufrankisch  Bobal  ...  Tuscan Blends  Valdiguie  \
0         0        0              0      0  ...              0          0   
1         0        0              0      0  ...              0          0   
2         0        0              0      0  ...              0          0   
3         0        0              0      0  ...              0          0   
4         0        0              0      0  ...              0          0   
...     ...      ...            ...    ...  ...            ...        ...   
21999     0        0              0      0  ...              0          0   
22000     0        0              0      0  ...              0          0   
22001     0        0              0      0  ...              0          0   
22002     0        0              0      0  ...              0          0   
22003     0        0              0      0  ...              0          0   

       Verdejo  Verdicchio  Vermentino  Vintage Sparkling Wine  Viognier  \
0            0           0           0                       1         0   
1            0           0           0                       0         0   
2            0           0           0                       0         0   
3            0           0           0                       0         0   
4            0           0           0                       0         0   
...        ...         ...         ...                     ...       ...   
21999        0           0           0                       0         0   
22000        0           0           0                       0         0   
22001        0           0           0                       0         0   
22002        0           0           0                       0         0   
22003        0           0           0                       0         0   

       Viura  Xinomavro  Zinfandel  
0          0          0          0  
1          0          0          0  
2          0          0          0  
3          0          0          0  
4          0          0          0  
...      ...        ...        ...  
21999      0          0          0  
22000      0          0          0  
22001      0          0          0  
22002      0          0          0  
22003      0          0          0  

[22004 rows x 84 columns])' is an invalid key