In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sb
import matplotlib.pyplot as plt
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sqlalchemy import create_engine, insert
import sqlalchemy as sal
%matplotlib inline

In [2]:
# establish connection to the database
engine = sal.create_engine('sqlite:///winewebscraping.db')
# storing tables from database into dataframe
master = pd.read_sql('SELECT * FROM master', engine)
ratings = pd.read_sql('SELECT * FROM ratings', engine)
varietals = pd.read_sql('SELECT * FROM varietals', engine)
california = pd.read_sql('SELECT * FROM california', engine)

In [3]:
ratings.head(2)

Unnamed: 0,product_name,price,varietal,rating,rating_count,appellation,region,year
0,Dom Perignon Vintage with Gift Box,199,Vintage Sparkling Wine,4.5,42,['Champagne'],France,2010
1,Duckhorn Napa Valley Cabernet Sauvignon,78,Cabernet Sauvignon,4.3,62,['Napa Valley'],California,2017


## Univariate Regression

In [4]:
ratings['price'] = ratings['price'].astype(int)

In [5]:
# null hypothesis in thise case would be that more ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating_count']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,4.965
Date:,"Tue, 29 Dec 2020",Prob (F-statistic):,0.0259
Time:,10:25:22,Log-Likelihood:,-32002.0
No. Observations:,4111,AIC:,64010.0
Df Residuals:,4109,BIC:,64020.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,155.9239,10.146,15.368,0.000,136.032,175.815
rating_count,-0.4032,0.181,-2.228,0.026,-0.758,-0.048

0,1,2,3
Omnibus:,7883.124,Durbin-Watson:,1.802
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10355980.765
Skew:,15.025,Prob(JB):,0.0
Kurtosis:,247.039,Cond. No.,62.7


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the number of ratings do not affect the price of the wine. 

In [6]:
# null hypothesis in thise case would be that higher ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,44.9
Date:,"Tue, 29 Dec 2020",Prob (F-statistic):,2.36e-11
Time:,10:25:22,Log-Likelihood:,-31982.0
No. Observations:,4111,AIC:,63970.0
Df Residuals:,4109,BIC:,63980.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-483.6446,94.370,-5.125,0.000,-668.662,-298.627
rating,146.9356,21.929,6.701,0.000,103.944,189.927

0,1,2,3
Omnibus:,7901.121,Durbin-Watson:,1.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10491957.605
Skew:,15.099,Prob(JB):,0.0
Kurtosis:,248.642,Cond. No.,47.4


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the higher ratings do not affect the price of the wine.

In [7]:
# null hypothesis in thise case would be that higher ratings don't affect the price of the wine
ratings['intercept'] = 1
lm = sm.OLS(ratings['rating'], ratings[['intercept', 'year']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,rating,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,3.745
Date:,"Tue, 29 Dec 2020",Prob (F-statistic):,0.053
Time:,10:25:23,Log-Likelihood:,-2182.9
No. Observations:,4111,AIC:,4370.0
Df Residuals:,4109,BIC:,4382.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,10.2115,3.063,3.334,0.001,4.206,16.217
year,-0.0029,0.002,-1.935,0.053,-0.006,3.84e-05

0,1,2,3
Omnibus:,271.116,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,389.304
Skew:,-0.561,Prob(JB):,2.91e-85
Kurtosis:,4.006,Cond. No.,961000.0


Based on the close to 0 goodness-of-fit (R-squared) this means the null hypothesis cannot be rejeted. Therefore the year of the wine does not affect the price of the wine.

##  Bivariate Regression

In [8]:
ratings['intercept'] = 1
lm = sm.OLS(ratings['price'], ratings[['intercept', 'rating', 'rating_count']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,23.92
Date:,"Tue, 29 Dec 2020",Prob (F-statistic):,4.71e-11
Time:,10:25:23,Log-Likelihood:,-31981.0
No. Observations:,4111,AIC:,63970.0
Df Residuals:,4108,BIC:,63990.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-462.9894,95.121,-4.867,0.000,-649.478,-276.500
rating,143.9214,21.994,6.544,0.000,100.800,187.042
rating_count,-0.3083,0.181,-1.707,0.088,-0.662,0.046

0,1,2,3
Omnibus:,7903.618,Durbin-Watson:,1.807
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10514728.956
Skew:,15.109,Prob(JB):,0.0
Kurtosis:,248.91,Cond. No.,607.0


In [9]:
unique_varietals = varietals['varietal'].sort_values().unique()

In [10]:
unique_varietals = pd.get_dummies(varietals['varietal'])

In [11]:
unique_varietals

Unnamed: 0,Agiorgitiko,Aglianico,Albarino,Alicante Bouschet,Arneis,Assyrtiko,Baga,Barbera,Blaufrankisch,Bobal,...,Touriga Nacional,Tuscan Blends,Valdiguie,Verdejo,Verdicchio,Vermentino,Viognier,Viura,Xinomavro,Zinfandel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
master['intercept'] = 1
lm = sm.OLS(master['price'], master['intercept', unique_varietals])
results = lm.fit()
results.summary()

TypeError: '('intercept',        Agiorgitiko  Aglianico  Albarino  Alicante Bouschet  Arneis  Assyrtiko  \
0                0          0         0                  0       0          0   
1                0          0         0                  0       0          0   
2                0          0         0                  0       0          0   
3                0          0         0                  0       0          0   
4                0          0         0                  0       0          0   
...            ...        ...       ...                ...     ...        ...   
21365            0          0         0                  0       0          0   
21366            0          0         0                  0       0          0   
21367            0          0         0                  0       0          0   
21368            0          0         0                  0       0          0   
21369            0          0         0                  0       0          0   

       Baga  Barbera  Blaufrankisch  Bobal  ...  Touriga Nacional  \
0         0        0              0      0  ...                 0   
1         0        0              0      0  ...                 0   
2         0        0              0      0  ...                 0   
3         0        0              0      0  ...                 0   
4         0        0              0      0  ...                 0   
...     ...      ...            ...    ...  ...               ...   
21365     0        0              0      0  ...                 0   
21366     0        0              0      0  ...                 0   
21367     0        0              0      0  ...                 0   
21368     0        0              0      0  ...                 0   
21369     0        0              0      0  ...                 0   

       Tuscan Blends  Valdiguie  Verdejo  Verdicchio  Vermentino  Viognier  \
0                  0          0        0           0           0         0   
1                  0          0        0           0           0         0   
2                  0          0        0           0           0         0   
3                  0          0        0           0           0         0   
4                  0          0        0           0           0         0   
...              ...        ...      ...         ...         ...       ...   
21365              0          0        0           0           0         0   
21366              0          0        0           0           0         0   
21367              0          0        0           0           0         0   
21368              0          0        0           0           0         0   
21369              0          0        0           0           0         0   

       Viura  Xinomavro  Zinfandel  
0          0          0          0  
1          0          0          0  
2          0          0          0  
3          0          0          0  
4          0          0          0  
...      ...        ...        ...  
21365      0          0          0  
21366      0          0          0  
21367      0          0          0  
21368      0          0          0  
21369      0          0          0  

[21370 rows x 82 columns])' is an invalid key