In [86]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [3]:
df = pd.read_csv("winemag-data-130k-v2.csv")

In [30]:
df = df[df['price'].notna()]
df = df[df['country'] == 'US']

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
12,12,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini


In [155]:
df.dtypes

Unnamed: 0                 int64
country                   object
description               object
designation               object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object

# Task 1.1 Baseline Model
#### For task 1.1, we only pick the non-text features, so we dropped 'description' and 'title', both of which are textual data. We also dropped the first column and 'taster twitter handle' because they are unrelated. 

In [33]:
X_nt, Y = df.drop(labels=["Unnamed: 0", "description", "taster_twitter_handle", "title", "price"], axis=1), df["price"]

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_nt, Y, random_state=1)

In [35]:
categorical = X_nt.dtypes == object

In [36]:
cat_preprocessing = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'))
    
cont_preprocessing = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler())
    
preprocess = make_column_transformer(
    (cat_preprocessing, categorical),
    (cont_preprocessing, ~categorical))

In [37]:
pipe_ridge = make_pipeline(preprocess, Ridge())
scores = cross_val_score(pipe_ridge, X_train, y_train, cv=5)
np.mean(scores)

0.6448609818113782

# Task 1.2 Simple BOW Model

In [156]:
X_t, Y = df["description"], df["price"]

In [157]:
X_t

2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
10        Soft, supple plum envelopes an oaky structure ...
12        Slightly reduced, this wine offers a chalky, t...
                                ...                        
129945    Hailing from one of the more popular vineyards...
129949    There's no bones about the use of oak in this ...
129950    This opens with herbaceous dollops of thyme an...
129952    This Zinfandel from the eastern section of Nap...
129967    Citation is given as much as a decade of bottl...
Name: description, Length: 54265, dtype: object

In [158]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y, random_state=1)

In [159]:
vect = CountVectorizer()
X_train_trans = vect.fit_transform(X_train)
X_test_trans = vect.transform(X_test)

In [160]:
ridge_default = Ridge().fit(X_train_trans, y_train)

In [161]:
print('default BOW: {:.2f}'.format(ridge_default.score(X_test_trans, y_test)))

default BOW: 0.28


# Task 1.3 Tuning BOW Model
#### I tested how n-grams, characters, tf-idf rescaling individually affects performance.

In [71]:
vect_ngram = CountVectorizer(ngram_range=(1,4))
X_train_ngram = vect_ngram.fit_transform(X_train)
X_test_ngram = vect_ngram.transform(X_test)

In [72]:
ridge_ngram = Ridge().fit(X_train_ngram, y_train)

In [73]:
print('ngram BOW: {:.2f}'.format(ridge_ngram.score(X_test_ngram, y_test)))

ngram BOW: 0.40


In [114]:
vect_char = CountVectorizer(analyzer='char')
X_train_char = vect_char.fit_transform(X_train)
X_test_char = vect_char.transform(X_test)

In [115]:
ridge_char = Ridge().fit(X_train_char, y_train)

In [116]:
print('char BOW: {:.2f}'.format(ridge_char.score(X_test_char, y_test)))

char BOW: 0.20


In [87]:
vect_tfidf = make_pipeline(CountVectorizer(), TfidfTransformer())
X_train_tfidf = vect_tfidf.fit_transform(X_train)
X_test_tfidf = vect_tfidf.transform(X_test)

In [88]:
ridge_tfidf = Ridge().fit(X_train_tfidf, y_train)

In [89]:
print('tfidf BOW: {:.2f}'.format(ridge_tfidf.score(X_test_tfidf, y_test)))

tfidf BOW: 0.36


#### Now, combinations

In [127]:
vect_combo = CountVectorizer(ngram_range=(1,4), min_df=2)
X_train_combo = vect_combo.fit_transform(X_train)
X_test_combo = vect_combo.transform(X_test)

In [128]:
ridge_combo = Ridge().fit(X_train_combo, y_train)

In [131]:
print('combo BOW: {:.2f}'.format(ridge_combo.score(X_test_combo, y_test)))

combo BOW: 0.29


# Task 1.4 Text + Non-text Features

In [132]:
X, y = df.drop(labels=["Unnamed: 0", "taster_twitter_handle", "title", "price"], axis=1), df["price"]

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)

In [143]:
X_train

Unnamed: 0,country,description,designation,points,province,region_1,region_2,taster_name,variety,winery
23071,US,"While this bottling is always interesting, it ...",Sarmento Vineyard,91,California,Santa Lucia Highlands,Central Coast,,Pinot Noir,Windstream
32875,US,DeLille's D2 red (51% Merlot/38% Cabernet Sauv...,D2,92,Washington,Columbia Valley (WA),Columbia Valley,Paul Gregutt,Red Blend,DeLille
100373,US,David and Anna deLaski follow her Austrian her...,Delanda Vineyard,92,California,Santa Ynez Valley,Central Coast,Matt Kettmann,Blaufränkisch,Solminer
15014,US,This especially ripe and deep style of Barbera...,Estate,91,California,El Dorado,Sierra Foothills,Jim Gordon,Barbera,Miraflores
93976,US,"A tough, gritty, drily astringent wine with fl...",Scoprire,82,California,California,California Other,,Red Blend,Millésimé
...,...,...,...,...,...,...,...,...,...,...
120150,US,Very appealing aromas of warm strawberry-cherr...,Best Barrel Blend,91,California,Paso Robles,Central Coast,Matt Kettmann,Syrah-Mourvèdre,Anglim
78653,US,"Composed primarily of Lemberger, this wine is ...",Cote de Columbia,87,Washington,Washington,Washington Other,Sean P. Sullivan,Red Blend,Shooting Star
12582,US,"Crisp apple and pear aromas take a richer, rip...",2nd Degree Medium Sweet,87,New York,Seneca Lake,Finger Lakes,Anna Lee C. Iijima,Riesling,Three Brothers
29549,US,"A minor wine, dry, tannic and awkward, with mo...",Jack London Vineyard,84,California,Sonoma Valley,Sonoma,,Merlot,Kenwood


In [153]:
cat_preprocessing = make_pipeline(
    SimpleImputer(strategy='most_frequent'))
    
cont_preprocessing = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler())

text_preprocessing = make_pipeline(
    CountVectorizer(ngram_range=(1,4))
    )
    
preprocess = make_column_transformer(
    (text_preprocessing, 'description'),
    (cat_preprocessing, categorical),
    remainder=cont_preprocessing)

In [154]:
pipe_ridge = make_pipeline(preprocess, Ridge())
scores = cross_val_score(pipe_ridge, X_train, y_train, cv=5)
np.mean(scores)

nan