In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import urllib, json
import string

## NLTK basics
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer

## sklearn basics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

## Decision trees
from sklearn import tree

## Random Forest
from sklearn.ensemble import RandomForestClassifier

## knn
from sklearn.neighbors import KNeighborsClassifier

## SVC
from sklearn.svm import SVC
from sklearn.metrics import classification_report

## kmeans
from sklearn.cluster import KMeans

## neural network
import tensorflow
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow import random

## grid search
from sklearn.model_selection import GridSearchCV


In [3]:
df = pd.read_csv("Resources/winemag-data-130k-v2.csv")

variety_list = pd.read_csv("Resources/Wine_varieties.csv", index=False)

variety_list = variety_list.variety.tolist()

subset = df.loc[df['variety'].isin(variety_list)]

subset = subset[['country', 'description', 'points', 'province', 'variety']]

subset.head()

Unnamed: 0,country,description,points,province,variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir


In [61]:
#Split reviews into words
tokenizer = nltk.RegexpTokenizer(r'\w+')
tokens = subset["description"].apply(tokenizer.tokenize)

#Remove useless stuff
stop_words = stopwords.words('english')
adj_list = []

for synset in list(wn.all_synsets(wn.ADJ)):
    s = synset.name()
    lemma = s.split('.')[0]
    adj_list.append(lemma)

for item in tokens:
    for r in item:
        if (r.lower() not in adj_list) and (r.lower() in stop_words): 
            item.remove(r)
            
#Add split text lists to df
subset["description_split"] = tokens

subset.to_csv('Resources/Wine_Reviews_ML.csv', index=False)

subset.head()


Unnamed: 0,country,description,points,province,variety,description_split
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend,"[Aromas, include, tropical, fruit, broom, brim..."
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red,"[is, ripe, fruity, wine, is, smooth, still, st..."
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris,"[Tart, snappy, flavors, lime, flesh, rind, dom..."
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling,"[Pineapple, rind, lemon, pith, orange, blossom..."
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir,"[Much, like, regular, bottling, 2012, comes, a..."


In [113]:
descriptors = subset.description_split.tolist()

#Create empty dict
descriptors_dict = {}
temp_list = []

#Iterate!
for sublist in descriptors:
    temp_list.extend(sublist)

d = FreqDist(temp_list)
descriptors_dict = dict((k, v) for k, v in d.items() if (k.lower() in adj_list) and (k.lower() not in stop_words))

descriptors_dict


{'tropical': 3604,
 'dried': 7773,
 'expressive': 597,
 'sage': 1748,
 'brisk': 1887,
 'ripe': 25293,
 'fruity': 9193,
 'smooth': 6566,
 'still': 5302,
 'structured': 5109,
 'Firm': 673,
 'filled': 505,
 'juicy': 9120,
 'red': 20745,
 'drinkable': 827,
 'better': 1552,
 'snappy': 546,
 'green': 9327,
 'crisp': 11821,
 'stainless': 687,
 'orange': 5627,
 'honey': 3982,
 'astringent': 1304,
 'Much': 61,
 'like': 7762,
 'regular': 356,
 'rough': 711,
 'tannic': 6254,
 'earthy': 4420,
 'herbal': 5911,
 'good': 8707,
 'hearty': 714,
 'typical': 879,
 'full': 14825,
 'bodied': 11542,
 'dark': 11113,
 'fresh': 15909,
 'bright': 9830,
 'informal': 330,
 'candied': 2337,
 'white': 12545,
 'savory': 4880,
 'balanced': 8191,
 'soft': 12179,
 'dry': 15823,
 'restrained': 1003,
 'Balanced': 306,
 'firm': 8547,
 'much': 2870,
 'Savory': 337,
 'preserved': 151,
 'elegant': 4753,
 'sprightly': 237,
 'great': 4697,
 'Soft': 1453,
 'supported': 615,
 'complete': 502,
 'strong': 2732,
 'attractive': 3600

In [None]:
subset = subset[['country', 'description', 'points', 'province', 'variety', 'description_split']]

for key in descriptors_dict:
    subset[key] = 0

for key in descriptors_dict:
    for index, row in subset.iterrows():
        subset[key][index] += subset['description_split'][index].count(key)
        
# test['Aromas'][0] += test['description_split'][0].count('Aromas')
#     subset[key] = subset['description_split'].count(key)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [147]:
subset

0     1
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Name: Aromas, Length: 100, dtype: int64

In [75]:
# Set features. This will also be used as your x values.
X = subset.drop(["description", "variety"], axis=1)
y = subset["variety"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,country,points,province,description_split
56921,Italy,87,Northeastern Italy,"[Moscato, Giallo, remarkable, ability, appeari..."
7894,France,84,Southwest France,"[is, typically, light, fruity, wine, Gascony, ..."
24745,Chile,85,Cachapoal Valley,"[Sharp, aromas, prickly, red, fruits, barrel, ..."
9828,Australia,89,South Australia,"[Cassis, chocolate, notes, dominate, there, en..."
124193,Austria,94,Wachau,"[nose, opens, totally, tangy, note, lemon, zes..."


{'remarkable': 130,
 'sweet': 9596,
 'honey': 2970,
 'yellow': 3148,
 'rose': 1890,
 'dry': 11721,
 'polished': 1550,
 'clean': 4068,
 'crisp': 8827,
 'close': 1634,
 'light': 8269,
 'fruity': 6913,
 'fresh': 11912,
 'green': 7039,
 'Sharp': 118,
 'red': 15621,
 'lively': 3063,
 'earthy': 3298,
 'minty': 741,
 'unresolved': 26,
 'like': 5820,
 'framed': 922,
 'firm': 6389,
 'dusty': 1566,
 'long': 4665,
 'Multiple': 5,
 'U': 42,
 'similar': 282,
 'generous': 2454,
 'fleshy': 929,
 'juicy': 6748,
 'orange': 4232,
 'tangerine': 1585,
 'streamlined': 255,
 'countless': 5,
 'concentrated': 4424,
 'brisk': 1410,
 'ripe': 19047,
 'ultimate': 23,
 'Crisp': 712,
 'pink': 1239,
 'Energizing': 12,
 'pristine': 267,
 'Made': 2266,
 'Tuscan': 152,
 'dried': 5777,
 'candied': 1723,
 'toasted': 2142,
 'seasoned': 148,
 'one': 3682,
 'chalky': 745,
 'right': 1678,
 'Smoky': 295,
 'powerful': 2004,
 'soft': 9104,
 'black': 20760,
 'full': 11173,
 'bodied': 8612,
 'becoming': 115,
 'back': 1087,
 'stra

KeyError: 'Requested level (remarkable) does not match index name (None)'

In [111]:
X_train = X_train[['country', 'points', 'province', 'description_split']]

X_train

Unnamed: 0,country,points,province,description_split
56921,Italy,87,Northeastern Italy,"[Moscato, Giallo, remarkable, ability, appeari..."
7894,France,84,Southwest France,"[is, typically, light, fruity, wine, Gascony, ..."
24745,Chile,85,Cachapoal Valley,"[Sharp, aromas, prickly, red, fruits, barrel, ..."
9828,Australia,89,South Australia,"[Cassis, chocolate, notes, dominate, there, en..."
124193,Austria,94,Wachau,"[nose, opens, totally, tangy, note, lemon, zes..."
...,...,...,...,...
128343,Spain,88,Catalonia,"[Schisty, red, fruit, aromas, come, hints, van..."
103876,US,88,New York,"[initially, closed, suggesting, bit, more, bra..."
860,US,90,California,"[Fresh, cracked, peppercorn, dense, black, pur..."
15820,US,85,California,"[little, one, dimensional, in, apricot, citrus..."


In [None]:
# Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)