In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import pickle

In [2]:
# Read data
df = pd.read_csv("../Resources/winemag-data-130k-v2.csv")

In [186]:
#Create groups for points
conditions = [
    (df.points>=90),
    (df.points>=85),
    (df.points>=80)
    ]

# create a list of the values we want to assign for each condition
values = ['90-95', '85-90', '80-85']

# create a new column and use np.select to assign values to it using our lists as arguments
df['points_group']=np.select(conditions, values)
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_one,region_two,taster_name,taster_twitter_handle,title,variety,winery,points_group
0,0,Italy,Aromas include tropical fruit broom brimstone ...,Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,85-90
1,1,Portugal,This is ripe and fruity a wine that is smooth ...,Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,85-90
2,2,US,Tart and snappy the flavors of lime flesh and ...,,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,85-90
3,3,US,Pineapple rind lemon pith and orange blossom s...,Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,85-90
4,4,US,Much like the regular bottling from 2012 this ...,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,85-90


In [170]:
# Filter needed columns
df_filtered = df[['description', 'variety','country','winery']]
df_filtered.head()

Unnamed: 0,description,variety,country,winery
0,Aromas include tropical fruit broom brimstone ...,White Blend,Italy,Nicosia
1,This is ripe and fruity a wine that is smooth ...,Portuguese Red,Portugal,Quinta dos Avidagos
2,Tart and snappy the flavors of lime flesh and ...,Pinot Gris,US,Rainstorm
3,Pineapple rind lemon pith and orange blossom s...,Riesling,US,St. Julian
4,Much like the regular bottling from 2012 this ...,Pinot Noir,US,Sweet Cheeks


In [171]:
# Drop NAs and duplicates
df_filtered = df_filtered.dropna(how='any')
df_filtered=df_filtered.drop_duplicates()
df_filtered.head()

Unnamed: 0,description,variety,country,winery
0,Aromas include tropical fruit broom brimstone ...,White Blend,Italy,Nicosia
1,This is ripe and fruity a wine that is smooth ...,Portuguese Red,Portugal,Quinta dos Avidagos
2,Tart and snappy the flavors of lime flesh and ...,Pinot Gris,US,Rainstorm
3,Pineapple rind lemon pith and orange blossom s...,Riesling,US,St. Julian
4,Much like the regular bottling from 2012 this ...,Pinot Noir,US,Sweet Cheeks


In [183]:
# Split the data into train and test
X = df_filtered['description']
y = df_filtered['variety']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [173]:
# train_data = df_filtered.loc[df_filtered['description'].isin(X_train)]
# test_data = df_filtered.loc[df_filtered['description'].isin(X_test)]
# train_data = train_data.drop_duplicates()
# test_data = test_data.drop_duplicates()

In [174]:
# Text preprocessing, tokenizing and filtering of stopwords are all included in CountVectorizer, 
# which builds a dictionary of features and transforms documents to feature vectors:
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)

# # Train data:
# description_train_counts = count_vect.fit_transform(train_data.description)
# winery_train_counts = count_vect.fit_transform(train_data.winery)
# country_train_counts = count_vect.fit_transform(train_data.country)

# # Test data:
# description_test_counts = count_vect.fit_transform(test_data.description)
# winery_test_counts = count_vect.fit_transform(test_data.winery)
# country_test_counts = count_vect.fit_transform(test_data.country)

X_train_counts.shape

(89939, 36131)

In [175]:
# A dictionary of feature indices
count_vect.vocabulary_.get(u'algorithm')
count_vect.vocabulary_

{'unfiltered': 33673,
 'this': 32241,
 'is': 16699,
 'an': 2004,
 'invitingly': 16639,
 'earthy': 10684,
 'wine': 35532,
 'marked': 19440,
 'by': 5380,
 'truffle': 33153,
 'and': 2033,
 'forest': 12900,
 'with': 35612,
 'generous': 13866,
 'concentration': 7903,
 'of': 22198,
 'black': 3959,
 'cherry': 6626,
 'that': 32145,
 'impacted': 16127,
 'quite': 25597,
 'bit': 3916,
 'oak': 22015,
 'velvety': 34292,
 'smooth': 29324,
 'it': 16722,
 'substantial': 30838,
 'on': 22366,
 'the': 32148,
 'palate': 23036,
 'blackberry': 3970,
 'flavor': 12582,
 'juicy': 17073,
 'character': 6422,
 'has': 15173,
 'bright': 4874,
 'fresh': 13190,
 'feel': 12129,
 'soft': 29434,
 'tannins': 31725,
 'blanc': 4084,
 'de': 9347,
 'noirs': 21711,
 'uses': 33985,
 'only': 22418,
 'grapes': 14501,
 'can': 5640,
 'be': 3389,
 'powerful': 24758,
 'tannic': 31708,
 'edge': 10788,
 'case': 5991,
 'here': 15485,
 'full': 13482,
 'citrus': 7077,
 'zest': 36027,
 'appleskin': 2308,
 'texture': 32118,
 'tight': 32404

In [176]:
# Use the 'fit' to fit our estimator to the data and 
# secondly the 'transform' to transform our count-matrix to a tf-idf representation

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# # Train data
# tf_transformer_description = TfidfTransformer(use_idf=False).fit(description_train_counts)
# description_train_tf = tf_transformer_description.transform(description_train_counts)

# tf_transformer_winery = TfidfTransformer(use_idf=False).fit(winery_train_counts)
# winery_train_tf = tf_transformer_winery.transform(winery_train_counts)

# tf_transformer_country = TfidfTransformer(use_idf=False).fit(country_train_counts)
# country_train_tf = tf_transformer_country.transform(country_train_counts)

# # Test data:
# tf_transformer_description_test = TfidfTransformer(use_idf=False).fit(description_test_counts)
# description_test_tf = tf_transformer_description_test.transform(description_test_counts)

# tf_transformer_winery_test = TfidfTransformer(use_idf=False).fit(winery_test_counts)
# winery_test_tf = tf_transformer_winery_test.transform(winery_train_counts)

# tf_transformer_country_test = TfidfTransformer(use_idf=False).fit(country_test_counts)
# country_test_tf = tf_transformer_country_test.transform(country_test_counts)

X_train_tf.shape

(89939, 36131)

In [177]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# # Train data:
# description_train_tfidf = tfidf_transformer.fit_transform(description_train_counts)
# winery_train_tfidf = tfidf_transformer.fit_transform(winery_train_counts)
# country_train_tfidf = tfidf_transformer.fit_transform(country_train_counts)

# # Test data:
# description_test_tfidf = tfidf_transformer.fit_transform(description_test_counts)
# winery_test_tfidf = tfidf_transformer.fit_transform(winery_test_counts)
# country_test_tfidf = tfidf_transformer.fit_transform(country_test_counts)

X_train_tfidf.shape

(89939, 36131)

In [140]:
# from scipy.sparse import hstack

# # Train data:
# X_train_dtm = hstack((description_train_tfidf,winery_train_tfidf))
# X_train_dtm = hstack((X_train_dtm,country_train_tfidf))

# # Test data:
# X_test_dtm = hstack((description_test_tfidf,winery_test_tfidf))
# X_test_dtm = hstack((X_test_dtm,country_test_tfidf))

# X_train_dtm

In [178]:
# Train the classifier
clf = MultinomialNB().fit(X_train_tfidf,y_train)

In [179]:
# Test the model with an input
data = ['Fruity','Dry']
X_new_counts = count_vect.transform(data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
predicted


array(['Chardonnay', 'Pinot Noir'], dtype='<U35')

In [180]:
# Accuracy of the model
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
predicted_test = clf.predict(X_test_tfidf)
np.mean(predicted_test == y_test)

0.3332555036691127

In [181]:
# Save model

vectorizer_file = "tokenizer.sklearn"
pickle.dump(count_vect, open(vectorizer_file,'wb'))

tokenizer_file = "vectorizer.sklearn"
pickle.dump(tfidf_transformer, open(tokenizer_file,'wb'))

NBModel = 'sentiment_scoring.sklearn'
pickle.dump(clf, open(NBModel, 'wb'))

In [182]:
# Reusing model
vectorizer = pickle.load(open(vectorizer_file, 'rb'))
tokenizer = pickle.load(open(tokenizer_file, 'rb'))
nbModel = pickle.load(open(NBModel, 'rb'))

user_input=['fruity','apple','dry','chocolate','red','italy']
X_new = vectorizer.transform(user_input)
X_new = tokenizer.transform(X_new)
result = nbModel.predict(X_new)
print(result)

['Chardonnay' 'Chardonnay' 'Pinot Noir' 'Cabernet Sauvignon' 'Pinot Noir'
 'Red Blend']
