In [21]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sb
import matplotlib.pyplot as plt
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sqlalchemy import create_engine, insert
import sqlalchemy as sal
%matplotlib inline

In [22]:
# establish connection to the database
engine = sal.create_engine('sqlite:///winewebscraping.db')
# storing tables from database into dataframe
master = pd.read_sql('SELECT * FROM master', engine)
ratings = pd.read_sql('SELECT * FROM ratings', engine)
varietals = pd.read_sql('SELECT * FROM varietals', engine)

In [23]:
ratings.head(2)

Unnamed: 0,product_name,price,varietal,rating,rating_count,appellation,region,year
0,Dom Perignon Vintage with Gift Box,199,Vintage Sparkling Wine,4.5,42.0,['Champagne'],France,2010
1,Duckhorn Napa Valley Cabernet Sauvignon,78,Cabernet Sauvignon,4.3,62.0,['Napa Valley'],California,2017


## Univariate Regression

In [24]:
ratings['price'] = ratings['price'].astype(int)

In [25]:
# null hypothesis in thise case would be that more ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating_count']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,4.999
Date:,"Thu, 17 Dec 2020",Prob (F-statistic):,0.0254
Time:,23:36:05,Log-Likelihood:,-32009.0
No. Observations:,4112,AIC:,64020.0
Df Residuals:,4110,BIC:,64040.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,155.9318,10.144,15.371,0.000,136.044,175.820
rating_count,-0.4040,0.181,-2.236,0.025,-0.758,-0.050

0,1,2,3
Omnibus:,7885.492,Durbin-Watson:,1.802
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10363562.623
Skew:,15.027,Prob(JB):,0.0
Kurtosis:,247.099,Cond. No.,62.8


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the number of ratings do not affect the price of the wine. 

In [26]:
# null hypothesis in thise case would be that higher ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,44.88
Date:,"Thu, 17 Dec 2020",Prob (F-statistic):,2.37e-11
Time:,23:36:07,Log-Likelihood:,-31990.0
No. Observations:,4112,AIC:,63980.0
Df Residuals:,4110,BIC:,64000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-483.4844,94.358,-5.124,0.000,-668.476,-298.492
rating,146.8898,21.925,6.700,0.000,103.904,189.875

0,1,2,3
Omnibus:,7903.462,Durbin-Watson:,1.809
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10499378.682
Skew:,15.101,Prob(JB):,0.0
Kurtosis:,248.7,Cond. No.,47.4


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the higher ratings do not affect the price of the wine.

In [33]:
# null hypothesis in thise case would be that higher ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['rating'], ratings[['intercept', 'year']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,rating,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,4.015
Date:,"Thu, 17 Dec 2020",Prob (F-statistic):,0.0452
Time:,23:37:28,Log-Likelihood:,-2182.9
No. Observations:,4112,AIC:,4370.0
Df Residuals:,4110,BIC:,4382.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,10.1312,2.918,3.472,0.001,4.410,15.853
year,-0.0029,0.001,-2.004,0.045,-0.006,-6.25e-05

0,1,2,3
Omnibus:,271.199,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,389.537
Skew:,-0.561,Prob(JB):,2.59e-85
Kurtosis:,4.007,Cond. No.,915000.0


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the year of the wine does not affect the price of the wine.

##  Bivariate Regression

In [28]:
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating', 'rating_count']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,23.93
Date:,"Thu, 17 Dec 2020",Prob (F-statistic):,4.65e-11
Time:,23:36:10,Log-Likelihood:,-31988.0
No. Observations:,4112,AIC:,63980.0
Df Residuals:,4109,BIC:,64000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-462.7811,95.102,-4.866,0.000,-649.232,-276.331
rating,143.8766,21.990,6.543,0.000,100.764,186.989
rating_count,-0.3099,0.180,-1.718,0.086,-0.663,0.044

0,1,2,3
Omnibus:,7905.984,Durbin-Watson:,1.807
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10522374.708
Skew:,15.111,Prob(JB):,0.0
Kurtosis:,248.97,Cond. No.,608.0


In [29]:
unique_varietals = varietals['varietal'].sort_values().unique()

In [30]:
unique_varietals = pd.get_dummies(varietals['varietal'])

In [31]:
unique_varietals

Unnamed: 0,Agiorgitiko,Aglianico,Albarino,Alicante Bouschet,Arneis,Assyrtiko,Baga,Barbera,Blaufrankisch,Bobal,...,Touriga Nacional,Tuscan Blends,Valdiguie,Verdejo,Verdicchio,Vermentino,Viognier,Viura,Xinomavro,Zinfandel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21383,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
master['intercept'] = 1
lm = sm.OLS(master['price'], master['intercept', unique_varietals])
results = lm.fit()
results.summary()

TypeError: '('intercept',        Agiorgitiko  Aglianico  Albarino  Alicante Bouschet  Arneis  Assyrtiko  \
0                0          0         0                  0       0          0   
1                0          0         0                  0       0          0   
2                0          0         0                  0       0          0   
3                0          0         0                  0       0          0   
4                0          0         0                  0       0          0   
...            ...        ...       ...                ...     ...        ...   
21380            0          0         0                  0       0          0   
21381            0          0         0                  0       0          0   
21382            0          0         0                  0       0          0   
21383            0          0         0                  0       0          0   
21384            0          0         0                  0       0          0   

       Baga  Barbera  Blaufrankisch  Bobal  ...  Touriga Nacional  \
0         0        0              0      0  ...                 0   
1         0        0              0      0  ...                 0   
2         0        0              0      0  ...                 0   
3         0        0              0      0  ...                 0   
4         0        0              0      0  ...                 0   
...     ...      ...            ...    ...  ...               ...   
21380     0        0              0      0  ...                 0   
21381     0        0              0      0  ...                 0   
21382     0        0              0      0  ...                 0   
21383     0        0              0      0  ...                 0   
21384     0        0              0      0  ...                 0   

       Tuscan Blends  Valdiguie  Verdejo  Verdicchio  Vermentino  Viognier  \
0                  0          0        0           0           0         0   
1                  0          0        0           0           0         0   
2                  0          0        0           0           0         0   
3                  0          0        0           0           0         0   
4                  0          0        0           0           0         0   
...              ...        ...      ...         ...         ...       ...   
21380              0          0        0           0           0         0   
21381              0          0        0           0           0         0   
21382              0          0        0           0           0         0   
21383              0          0        0           0           0         0   
21384              0          0        0           0           0         0   

       Viura  Xinomavro  Zinfandel  
0          0          0          0  
1          0          0          0  
2          0          0          0  
3          0          0          0  
4          0          0          0  
...      ...        ...        ...  
21380      0          0          0  
21381      0          0          0  
21382      0          0          0  
21383      0          0          0  
21384      0          0          0  

[21385 rows x 82 columns])' is an invalid key