In [22]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, cross_validate

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression

In [23]:
path = '../data/winemag-clean.csv'
wine_clean = pd.read_csv(path, index_col = 0)
wine_clean.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,vintage,vintage_str_data,scaled_points
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,not_specified,not_specified,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,2011 Avidagos Red (Douro),44.317767
2,US,"Tart and snappy, the flavors of lime flesh and...",no_designation,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,2013 Pinot Gris (Willamette Valley),41.982247
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,not_specified,Alexander Peartree,no_twitter,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,2013 Reserve Late Harvest Riesling (Lake Mich...,57.575804
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,2012 Vintner's Reserve Wild Child Block Pinot...,41.982247
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,not_specified,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,2011,2011 Ars In Vitro Tempranillo-Merlot (Navarra),50.921302


In [24]:
# Try all three values 
wine_x = wine_clean[['designation', 'region_1', 'winery','price']]

In [25]:
# Check number of unique values
print('unique designations:', wine_x['designation'].nunique())
print('unique regions:', wine_x['region_1'].nunique())
print('unique wineries:', wine_x['winery'].nunique())

unique designations: 35754
unique regions: 1205
unique wineries: 15843


In [26]:
# Too many values - previous iterations took
# too long to run - Remove values that only appear once from:
# https://stackoverflow.com/questions/
# 33071182/remove-values-that-appear-only-once-in-a-dataframe-column
wine_x_small = wine_x[wine_x.groupby('designation')['designation'].transform(len) > 20]


In [27]:
wine_x_small = wine_x_small[wine_x_small.groupby('region_1')['region_1'].transform(len) > 50]

In [28]:
wine_x_small = wine_x_small[wine_x_small.groupby('winery')['winery'].transform(len) > 50]

In [29]:
# Check again
print(wine_x_small['designation'].nunique())
print(wine_x_small['region_1'].nunique())
print(wine_x_small['winery'].nunique())

59
53
23


In [33]:
designation_dummy = pd.get_dummies(wine_x_small['designation'], 
                                  prefix = 'designation')
region_1_dummy = pd.get_dummies(wine_x_small['region_1'],
                               prefix = 'region_1')
winery_dummy = pd.get_dummies(wine_x_small['winery'],
                             prefix = 'winery')

In [None]:
X = pd.concat([designation_dummy, region_1_dummy, winery_dummy], axis=1)

In [37]:
y = wine_x_small['price']

In [38]:
X.shape

(111537, 52802)

In [39]:
#Baseline - mean price
# start with a basic Train/Test split (70% training data, 30% testing data
# by default)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

In [40]:
# The 'null' value would be to predict that every price was equal to
# the average of all prices in the dataset
y_null = np.zeros_like(y_test, dtype=float)
y_null.fill(y.mean())
y_null

array([35.62627648, 35.62627648, 35.62627648, ..., 35.62627648,
       35.62627648, 35.62627648])

In [41]:
print('Null case RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_null)))

Null case RMSE: 39.679966809523435


In [15]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
pred = linreg.predict(X_test)

In [17]:
print('Linear Regression RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))
print('Linear Regression R-squared:', metrics.r2_score(y_test, pred))

Linear Regression RMSE: 14020216702308.39
Linear Regression R-squared: -1.0481558720523691e+24


In [18]:
# Try each main metric individually - make a function
def feature_test(features, unique=0, num=1):
    for feature in features:
        wine_x = wine_clean[[feature, 'price']]
        if feature != 'points':
            if unique:
                wine_x = wine_x[wine_x.groupby(feature)[feature].transform(len) > unique]
            X = pd.get_dummies(wine_x[feature], prefix=feature)
        else: 
            X = wine_x[[feature]]
        
        y = wine_x['price']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
        
        y_null = np.zeros_like(y_test, dtype=float)
        y_null.fill(y.mean())
        print('Null case RMSE for ' + feature + ':', 
              np.sqrt(metrics.mean_squared_error(y_test, y_null)))
        
        linreg = LinearRegression()
        linreg.fit(X_train, y_train)
        
        pred = linreg.predict(X_test)
        
        print('Linear Regression RMSE for ' + feature + ':', 
              np.sqrt(metrics.mean_squared_error(y_test, pred)))
        print('Linear Regression R-squared for ' + feature + ':', 
              metrics.r2_score(y_test, pred))
        print()
    return

In [19]:
feature_test(['designation', 'region_1', 'winery'], unique = 20)

Null case RMSE for designation: 41.833526729426204
Linear Regression RMSE for designation: 40.982532139944084
Linear Regression R-squared for designation: 0.040260553472607596

Null case RMSE for region_1: 41.38965150283558
Linear Regression RMSE for region_1: 36.995787895706115
Linear Regression R-squared for region_1: 0.20103859878446195

Null case RMSE for winery: 37.48072763631314
Linear Regression RMSE for winery: 30.32948747305227
Linear Regression R-squared for winery: 0.3451670584226003



In [20]:
# Try with all columns for fun
feature_test(['country', 'designation', 'points', 'province', 
             'region_1', 'region_2', 'variety', 'winery', 'vintage'], unique=20)

Null case RMSE for country: 40.14606487339159
Linear Regression RMSE for country: 39.71477405927935
Linear Regression R-squared for country: 0.021336108078201743

Null case RMSE for designation: 41.833526729426204
Linear Regression RMSE for designation: 40.982532139944084
Linear Regression R-squared for designation: 0.040260553472607596

Null case RMSE for points: 39.679966809523435
Linear Regression RMSE for points: 35.63286663427134
Linear Regression R-squared for points: 0.19358427991904248

Null case RMSE for province: 37.86696459968853
Linear Regression RMSE for province: 36.09110923162204
Linear Regression R-squared for province: 0.09153390752807988

Null case RMSE for region_1: 41.38965150283558
Linear Regression RMSE for region_1: 36.995787895706115
Linear Regression R-squared for region_1: 0.20103859878446195

Null case RMSE for region_2: 39.679966809523435
Linear Regression RMSE for region_2: 38.9033683913871
Linear Regression R-squared for region_2: 0.038759917885619055

Nul

In [21]:
# While this is suprising - pretty clear that these items work unto themselves but it isn't
# A linear relationship.  Need to use a random forest for this 