In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:

# Read the wine review table; the first CSV column is used as the row index.
wine_base = pd.read_csv(
    filepath_or_buffer="data/winemag-data_first150k.csv",
    index_col=0,
)
# Preview the first five rows as the cell output.
wine_base.head(n=5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [4]:
from sklearn.model_selection import train_test_split

# The 'points' column is the prediction target; every other column stays in
# the feature frame.
y = wine_base['points'].copy()
X = wine_base.drop(columns=['points'])

# No explicit split size is passed, so scikit-learn's default proportions
# apply; the fixed random_state keeps the partition the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words token counts over the free-text 'description' column.
# min_df=5 ignores terms that occur in fewer than five training documents.
vect = CountVectorizer(min_df=5)

# fit_transform learns the vocabulary and builds the document-term matrix in
# one pass. Calling fit() and then transform() on the same corpus produces the
# same matrix but tokenizes and counts every training description twice.
X_train_vectored = vect.fit_transform(X_train['description'])
print("vocabulary size: {}".format(len(vect.vocabulary_)))

vocabulary size: 12594


In [6]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when the installed version
# provides it, and fall back to the legacy method on older releases.
if hasattr(vect, "get_feature_names_out"):
    # .tolist() converts the returned ndarray to a plain list of Python
    # strings, so slicing and printing below behave as with the legacy method.
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

# Spot-check the learned vocabulary: its size, its start, a slice from the
# middle, and an evenly spaced sample across the whole list.
print("Number of Features: ", len(feature_names))
print("First 20 features: \n{}".format(feature_names[:20]))
print("features 10010 to 10030:\n{}".format(feature_names[10010:10030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of Features:  12594
First 20 features: 
['000', '01', '02', '02s', '03', '04', '05', '06', '06s', '07', '07s', '08', '09', '10', '100', '1000', '101', '10th', '11', '110']
features 10010 to 10030:
['shroud', 'shrouded', 'shu', 'shucked', 'shut', 'shuts', 'shy', 'shyness', 'sibling', 'siblings', 'sichuan', 'sicilia', 'sicilian', 'sicily', 'side', 'sided', 'sides', 'sideshow', 'sideways', 'sidewood']
Every 2000th feature:
['000', 'cesanese', 'excitement', 'kernel', 'pavone', 'shown', 'ventures']


In [7]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV

# Linear regression of the 'points' target on the vectorized descriptions.
lr = LinearRegression()
# fit() is the last expression in the cell, so the fitted estimator it
# returns is echoed as the cell output.
lr.fit(X=X_train_vectored, y=y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
from sklearn.metrics import mean_squared_error

# Error on the same rows the model was fitted to (training error).
y_pred = lr.predict(X=X_train_vectored)
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
# Root of the mean squared error, in the same units as the target.
rmse = np.sqrt(mse)
rmse

1.606002605601205

In [9]:
# Vectorize the held-out descriptions with the already-fitted vectorizer
# (transform only, no refit), then score the model on them.
X_test_vectored = vect.transform(raw_documents=X_test['description'])
y_test_pred = lr.predict(X=X_test_vectored)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
# Root of the mean squared error on the test split; shown as the cell result.
rmse = np.sqrt(mse)
rmse

1.8183378152748424