In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
# Load the winemag CSV; the path is relative, so the file must be in the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="winemag-data-130k-v2.csv")

In [3]:
# Keep only rows that have a price (rows without one are treated as invalid)
# and that come from the US, which is the scope of this assignment.
priced_us_rows = df['price'].notna() & (df['country'] == 'US')
df = df[priced_us_rows]

In [4]:
# Preview the first rows of the filtered frame (priced, US-only reviews).
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
12,12,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini


In [5]:
# Inspect column dtypes to see which columns are numeric and which are object.
df.dtypes

Unnamed: 0                 int64
country                   object
description               object
designation               object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object

# Task 1.1 Baseline Model
#### For Task 1.1 we use only the non-text features, so we drop 'description' and 'title', both of which are free text. We also drop the first column ('Unnamed: 0') and 'taster_twitter_handle' because they are unrelated to price. 

In [6]:
# Non-text feature matrix: every column except the free-text fields, the
# Twitter handle, the "Unnamed: 0" column, and the target itself.
non_text_excluded = ["Unnamed: 0", "description", "taster_twitter_handle", "title", "price"]
X_nt = df.drop(columns=non_text_excluded)
# Regression target.
Y = df["price"]

In [30]:
# Train/test split of the non-text features; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_nt,
    Y,
    random_state=1,
)

In [8]:
# Boolean mask over X_nt's columns: True where the column dtype is object.
categorical = X_nt.dtypes.eq(object)

In [9]:
# Categorical branch: fill missing entries with each column's most frequent
# value, then one-hot encode (unknown categories are ignored, not an error).
cat_steps = (
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)
cat_preprocessing = make_pipeline(*cat_steps)

# Numeric branch: median imputation followed by standard scaling.
cont_steps = (
    SimpleImputer(strategy='median'),
    StandardScaler(),
)
cont_preprocessing = make_pipeline(*cont_steps)

# Route columns with the boolean `categorical` mask: flagged columns take the
# categorical branch, every remaining column takes the numeric branch.
preprocess = make_column_transformer(
    (cat_preprocessing, categorical),
    (cont_preprocessing, ~categorical),
)

In [34]:
# Baseline model: the non-text preprocessing followed by a default Ridge
# regressor. `pipe` is bound to whatever fit() returns.
baseline_steps = (preprocess, Ridge())
ridge = make_pipeline(*baseline_steps)
pipe = ridge.fit(X_train, y_train)

In [35]:
# Score the fitted baseline pipeline on the held-out split.
baseline_score = ridge.score(X_test, y_test)
print('Baseline Model: {:.2f}'.format(baseline_score))

Baseline Model: 0.58


# Task 1.2 Simple BOW Model

In [36]:
# Text-only features: description and title joined into one string per row.
# A space is inserted between the two fields so the last word of the
# description cannot fuse with the first word of the title. Without it the
# boundary has no whitespace, which matters most for analyzers that split on
# whitespace.
X_t = df["description"] + " " + df['title']
# Regression target.
Y = df["price"]

In [37]:
# Display the combined description + title Series as a sanity check.
X_t

2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
10        Soft, supple plum envelopes an oaky structure ...
12        Slightly reduced, this wine offers a chalky, t...
                                ...                        
129945    Hailing from one of the more popular vineyards...
129949    There's no bones about the use of oak in this ...
129950    This opens with herbaceous dollops of thyme an...
129952    This Zinfandel from the eastern section of Nap...
129967    Citation is given as much as a decade of bottl...
Length: 54265, dtype: object

In [51]:
# Train/test split of the combined text; same random_state as the earlier
# split. NOTE: this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_t,
    Y,
    random_state=1,
)

In [52]:
# Default bag-of-words representation.
vect = CountVectorizer()
X_train_vect, X_test_vect = (
    vect.fit_transform(X_train),  # fit on the training text only
    vect.transform(X_test),       # apply the fitted vectorizer to the test text
)

In [53]:
# Default Ridge regressor trained on the default bag-of-words counts.
default_ridge = Ridge().fit(X=X_train_vect, y=y_train)

In [54]:
# Held-out score of the default bag-of-words model.
default_bow_score = default_ridge.score(X_test_vect, y_test)
print('default BOW: {:.2f}'.format(default_bow_score))

default BOW: 0.48


# Task 1.3 Tuning BOW Model
#### I tested how word n-grams, character n-grams, and tf-idf rescaling each affect performance, one at a time.

In [55]:
# Bag of words extended with word n-grams of length 1 through 4.
ngram_vect = CountVectorizer(ngram_range=(1, 4))
X_train_ngram, X_test_ngram = (
    ngram_vect.fit_transform(X_train),  # fit on the training text only
    ngram_vect.transform(X_test),       # apply the fitted vectorizer to the test text
)

In [56]:
# Default Ridge regressor trained on the word n-gram counts.
ngram_ridge = Ridge().fit(X=X_train_ngram, y=y_train)

In [57]:
# Held-out score of the word n-gram model.
ngram_bow_score = ngram_ridge.score(X_test_ngram, y_test)
print('ngram BOW: {:.2f}'.format(ngram_bow_score))

ngram BOW: 0.57


In [58]:
# Character n-grams of length 2 through 5 using the 'char_wb' analyzer.
vect_char = CountVectorizer(analyzer='char_wb', ngram_range=(2, 5))
X_train_char, X_test_char = (
    vect_char.fit_transform(X_train),  # fit on the training text only
    vect_char.transform(X_test),       # apply the fitted vectorizer to the test text
)

In [59]:
# Default Ridge regressor trained on the character n-gram counts.
char_ridge = Ridge().fit(X=X_train_char, y=y_train)

In [60]:
# Held-out score of the character n-gram model.
char_bow_score = char_ridge.score(X_test_char, y_test)
print('char BOW: {:.2f}'.format(char_bow_score))

char BOW: 0.19


In [61]:
# Default word counts followed by tf-idf rescaling, chained as one transformer.
tfidf_steps = (CountVectorizer(), TfidfTransformer())
tfidf_vect = make_pipeline(*tfidf_steps)
X_train_tfidf, X_test_tfidf = (
    tfidf_vect.fit_transform(X_train),  # fit on the training text only
    tfidf_vect.transform(X_test),       # apply the fitted steps to the test text
)

In [62]:
# Default Ridge regressor trained on the tf-idf features.
tfidf_ridge = Ridge().fit(X=X_train_tfidf, y=y_train)

In [63]:
# Held-out score of the tf-idf model.
tfidf_bow_score = tfidf_ridge.score(X_test_tfidf, y_test)
print('tfidf BOW: {:.2f}'.format(tfidf_bow_score))

tfidf BOW: 0.53


#### Word n-grams improve the result the most (0.57 vs. 0.48 for the default BOW), so we decided to proceed with the BOW model tuned with n-grams.
# Task 1.4 Text + Non-text Features

In [90]:
# Full feature matrix for the combined model: text columns are kept here,
# while the "Unnamed: 0" column, the country column (the frame was filtered
# to 'US' only), the Twitter handle, and the target are removed.
combined_excluded = ["Unnamed: 0", "country", "taster_twitter_handle", "price"]
X = df.drop(columns=combined_excluded)
# Regression target.
y = df["price"]

In [91]:
# Combine the two text columns into a single 'text' feature so one vectorizer
# can process them. A space is inserted between description and title so the
# last word of one cannot fuse with the first word of the other.
# The text is read from `df` (X was derived from df by dropping columns only,
# so both share the same row index), and the drop tolerates columns that are
# already gone, so re-running this cell does not raise a KeyError.
X = (
    X.assign(text=df["description"] + " " + df["title"])
     .drop(columns=["description", "title"], errors="ignore")
)

In [92]:
# Split the combined feature matrix. Uses the `y` target created together with
# `X` for this task instead of the `Y` left over from the earlier text-only
# section, so this cell no longer depends on that earlier cell having run.
# Both names are bound to df["price"], so the split itself is unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [93]:
# Check the training columns and dtypes before assigning them to preprocessing branches.
X_train.dtypes

designation    object
points          int64
province       object
region_1       object
region_2       object
taster_name    object
variety        object
winery         object
text           object
dtype: object

In [94]:
# Explicit column lists for the combined model; the 'text' column is handled
# separately by the vectorizer.
# NOTE: `categorical` was a boolean mask in Task 1.1 and is rebound to a list
# of column names here.
categorical = [
    'designation',
    'province',
    'region_1',
    'region_2',
    'taster_name',
    'variety',
    'winery',
]
continuous = ['points']

In [95]:
# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores unknown categories.
cat_steps = (
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)
cat_preprocessing = make_pipeline(*cat_steps)

# Numeric branch: median imputation, then standard scaling.
cont_steps = (
    SimpleImputer(strategy='median'),
    StandardScaler(),
)
cont_preprocessing = make_pipeline(*cont_steps)

# Combined transformer: word 1- to 4-grams on the single 'text' column, plus
# the numeric and categorical branches on their explicit column lists.
text_branch = (CountVectorizer(ngram_range=(1, 4)), 'text')
preprocess = make_column_transformer(
    text_branch,
    (cont_preprocessing, continuous),
    (cat_preprocessing, categorical),
)

In [196]:
# 5-fold cross-validation of the combined pipeline on the training split;
# the cell displays the mean fold score.
pipe_ridge = make_pipeline(preprocess, Ridge())
scores = cross_val_score(estimator=pipe_ridge, X=X_train, y=y_train, cv=5)
scores.mean()

0.5761289448384611

In [97]:
# Final combined model: text + non-text preprocessing followed by a default
# Ridge regressor, trained on the full training split.
# `pipe` is bound to whatever fit() returns.
combined_steps = (preprocess, Ridge())
ridge = make_pipeline(*combined_steps)
pipe = ridge.fit(X_train, y_train)

In [98]:
# Held-out score of the combined text + non-text model.
combined_score = ridge.score(X_test, y_test)
print('Text + Non-text Ridge: {:.2f}'.format(combined_score))

Text + Non-text Ridge: 0.60


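#### The test R² improved from 0.57 (tuned n-gram BOW alone) to 0.60 after adding non-text features to the tuned BOW model; for reference, the default BOW scored 0.48 and the non-text baseline scored 0.58. Adding non-text features helps because they provide additional valuable information about the dataset.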