In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from nltk.corpus import wordnet as wn

## sklearn basics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

## Random Forest
from sklearn.ensemble import RandomForestClassifier

## knn
from sklearn.neighbors import KNeighborsClassifier

## pickle
import pickle
from collections import Counter

In [54]:
wine_class = pd.read_csv('Resources/Wine_varieties_classified.csv')
del wine_class['Unnamed: 0']
wine_class['Red'] = wine_class['is_red'] + 1

wine_class = wine_class[['variety', 'is_white', 'Red']]
wine_class.columns = ['variety', 'White', 'Red']

wine_class = wine_class.fillna(0)

wine_class.head()

Unnamed: 0,variety,White,Red
0,Agiorgitiko,0.0,1.0
1,Aglianico,0.0,1.0
2,Alvarinho,1.0,0.0
3,Arneis,1.0,0.0
4,Assyrtiko,1.0,0.0


In [55]:
wines_to_keep = wine_class.variety.tolist()

In [56]:
full_df = pd.read_csv('Resources/Wine_Reviews_ML.csv')
full_df.head()

Unnamed: 0,country,description,points,province,variety,description_split
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br..."
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red,"['is', 'ripe', 'fruity', 'wine', 'is', 'smooth..."
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris,"['Tart', 'snappy', 'flavors', 'lime', 'flesh',..."
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling,"['Pineapple', 'rind', 'lemon', 'pith', 'orange..."
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir,"['Much', 'like', 'regular', 'bottling', '2012'..."


In [57]:
subset = full_df.loc[full_df.variety.isin(wines_to_keep)]

subset.head()

Unnamed: 0,country,description,points,province,variety,description_split
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br..."
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling,"['Pineapple', 'rind', 'lemon', 'pith', 'orange..."
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir,"['Much', 'like', 'regular', 'bottling', '2012'..."
6,Italy,"Here's a bright, informal red that opens with ...",87,Sicily & Sardinia,Frappato,"['Here', 'a', 'bright', 'informal', 'red', 'op..."
10,US,"Soft, supple plum envelopes an oaky structure ...",87,California,Cabernet Sauvignon,"['Soft', 'supple', 'plum', 'envelopes', 'oaky'..."


In [58]:
merge_df = subset.merge(wine_class, on='variety', suffixes = ("",""))

merge_df.head()

Unnamed: 0,country,description,points,province,variety,description_split,White,Red
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br...",1.0,0.0
1,Italy,Delicate aromas recall white flower and citrus...,87,Sicily & Sardinia,White Blend,"['Delicate', 'aromas', 'recall', 'white', 'flo...",1.0,0.0
2,Italy,Pretty aromas of yellow flower and stone fruit...,87,Sicily & Sardinia,White Blend,"['Pretty', 'aromas', 'yellow', 'flower', 'ston...",1.0,0.0
3,Italy,"Part of the extended Calanìca series, this Gri...",86,Sicily & Sardinia,White Blend,"['Part', 'extended', 'Calanìca', 'series', 'Gr...",1.0,0.0
4,Italy,Made predominantly with Trebbiano and Malvasia...,87,Tuscany,White Blend,"['Made', 'predominantly', 'Trebbiano', 'Malvas...",1.0,0.0


In [59]:
adjective_list = ["ripe", "crisp", "mature", "tropical", "rich", "sweet", "fresh", "honeyed", "fruity", "smooth", 
                  "soft", "bright", "dry", "earthy", "rubbery", "savory", "vanilla", "bitter", "intense", 
                  "traditional", "nutty"]

refined_dict = {}

for item in adjective_list:
    synonym = wn.synsets(item, pos=wn.ADJ)
    temp_list = []
    for lemma in synonym:
        l = lemma.name()
        l = l.split(".")[0]
        if l not in temp_list:
            temp_list.append(l)
            
    refined_dict[item] = temp_list
    
refined_dict

{'ripe': ['ripe', 'good', 'advanced'],
 'crisp': ['crisp'],
 'mature': ['mature', 'ripe', 'fledged'],
 'tropical': ['tropical'],
 'rich': ['rich', 'fat', 'deep', 'full-bodied', 'ample'],
 'sweet': ['sweet',
  'angelic',
  'dulcet',
  'gratifying',
  'odoriferous',
  'fresh',
  'sugared'],
 'fresh': ['fresh', 'bracing', 'clean'],
 'honeyed': ['honeyed', 'dulcet'],
 'fruity': ['fruity', 'balmy'],
 'smooth': ['smooth', 'politic', 'fluent', 'legato', 'placid'],
 'soft': ['soft',
  'delicate',
  'voiced',
  'piano',
  'indulgent',
  'gentle',
  'easy',
  'cushy',
  'balmy'],
 'bright': ['bright', 'undimmed'],
 'dry': ['dry'],
 'earthy': ['crude', 'earthy', 'down-to-earth'],
 'rubbery': ['rubbery', 'cartilaginous'],
 'savory': ['savory', 'piquant', 'mouth-watering'],
 'vanilla': ['vanilla'],
 'bitter': ['acrimonious', 'bitter', 'acerb', 'biting'],
 'intense': ['intense', 'acute'],
 'traditional': ['traditional'],
 'nutty': ['nutty', 'balmy']}

In [60]:
for item in adjective_list:
    merge_df[item] = 0

for item in adjective_list:
    syn_list = refined_dict[item]
    for index, row in merge_df.iterrows():
        for syn in syn_list:
            merge_df[item][index] += merge_df['description_split'][index].count(syn)
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [62]:
merge_df = merge_df.drop(["description", "description_split"], axis=1)

merge_df.to_csv('Resources/Filtered_ML.csv', index=False)

merge_df[['country', 'province']].groupby(['country', 'province']).count().reset_index().to_csv('Resources/Country_Province.csv', index=False)



In [160]:
merge_df = pd.read_csv('Resources/Filtered_ML.csv')
merge_df.head()

Unnamed: 0,country,points,province,variety,White,Red,ripe,crisp,mature,tropical,...,bright,dry,earthy,rubbery,savory,vanilla,bitter,intense,traditional,nutty
0,Italy,87,Sicily & Sardinia,White Blend,1.0,0.0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Italy,87,Sicily & Sardinia,White Blend,1.0,0.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,Italy,87,Sicily & Sardinia,White Blend,1.0,0.0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
3,Italy,86,Sicily & Sardinia,White Blend,1.0,0.0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Italy,87,Tuscany,White Blend,1.0,0.0,2,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
merge_df = pd.get_dummies(merge_df, columns=['country','province'], prefix=None)

def f(row):
    if (row['points'] <= 100) and (row['points'] >= 95):
        val = 4
    elif (row['points'] < 95) and (row['points'] >= 90):
        val = 3
    elif (row['points'] < 90) and (row['points'] >= 85):
        val = 2
    else:
        val = 1
    return val

merge_df['points_grouped'] = merge_df.apply(f, axis=1)

merge_df.head()

Unnamed: 0,points,variety,White,Red,ripe,crisp,mature,tropical,rich,sweet,...,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Württemberg,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa,points_grouped
0,87,White Blend,1.0,0.0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2
1,87,White Blend,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,87,White Blend,1.0,0.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,86,White Blend,1.0,0.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,87,White Blend,1.0,0.0,2,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [86]:
merge_df = merge_df.drop(["points"], axis=1)

merge_df.to_csv("Resources/Filtered_ML_matrix_forVar.csv", index=False)

In [132]:
merge_df = pd.read_csv("Resources/Filtered_ML_matrix_forVar.csv")

merge_df.head()

Unnamed: 0,variety,White,Red,ripe,crisp,mature,tropical,rich,sweet,fresh,...,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Württemberg,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa,points_grouped
0,White Blend,1.0,0.0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,White Blend,1.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,White Blend,1.0,0.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,White Blend,1.0,0.0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,White Blend,1.0,0.0,2,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [133]:
# Set features. This will also be used as your x values.
X = merge_df.drop(["variety"], axis=1)
y = merge_df["variety"]

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,White,Red,ripe,crisp,mature,tropical,rich,sweet,fresh,honeyed,...,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Württemberg,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa,points_grouped
23984,0.0,1.0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,2
12897,0.0,1.0,2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
27795,0.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
27344,0.0,1.0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
64132,0.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [134]:
# Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [135]:
rf = RandomForestClassifier(n_estimators=400)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.57083313598257

In [94]:
rf_predict = rf.predict(X_test_scaled)

In [96]:
rf_actual = y_test

actual = np.array(rf_actual)
actual_df = pd.DataFrame(rf_actual)
actual_df["Predicted"] = rf_predict
comparison = actual_df.rename(columns = {0: "Actual"})
comparison.head(10)

Unnamed: 0,variety,Predicted
39750,Chardonnay,Chardonnay
24369,Cabernet Sauvignon,Cabernet Sauvignon
2901,Riesling,Chardonnay
42881,Malbec,Pinot Noir
71402,Zinfandel,Cabernet Sauvignon
5034,Riesling,Riesling
29238,Cabernet Sauvignon,Pinot Noir
3559,Riesling,Riesling
74252,Nebbiolo,Nebbiolo
49968,Red Blend,Red Blend


In [98]:
# Save model

filename = 'variety_rf.sklearn'
pickle.dump(rf, open(filename, 'wb'))

In [99]:
merge_df['White'] = merge_df['White'].map({1: 'White', 0: 'Red'})
merge_df = merge_df.rename(columns={'White': 'category'})

del merge_df['Red']

merge_df.head()

Unnamed: 0,variety,category,ripe,crisp,mature,tropical,rich,sweet,fresh,honeyed,...,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Württemberg,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa,points_grouped
0,White Blend,White,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,White Blend,White,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,White Blend,White,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,White Blend,White,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,White Blend,White,2,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [100]:
merge_df = pd.get_dummies(merge_df, columns=['variety'], prefix=None)

merge_df.to_csv("Resources/Filtered_ML_matrix_forCat.csv", index=False)


In [101]:
merge_df = pd.read_csv("Resources/Filtered_ML_matrix_forCat.csv")

merge_df.head()

Unnamed: 0,category,ripe,crisp,mature,tropical,rich,sweet,fresh,honeyed,fruity,...,variety_Verdejo,variety_Verdicchio,variety_Vermentino,variety_Vernaccia,variety_Vidal Blanc,variety_Vilana,variety_Viognier,variety_Viura,variety_White Blend,variety_Zinfandel
0,White,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,White,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,White,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,White,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,White,2,0,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [102]:
X = merge_df.drop(["category"], axis=1)
y = merge_df["category"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,ripe,crisp,mature,tropical,rich,sweet,fresh,honeyed,fruity,smooth,...,variety_Verdejo,variety_Verdicchio,variety_Vermentino,variety_Vernaccia,variety_Vidal Blanc,variety_Vilana,variety_Viognier,variety_Viura,variety_White Blend,variety_Zinfandel
23984,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12897,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27344,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [105]:
rf = RandomForestClassifier(n_estimators=400)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9992421730687254

In [106]:
rf_predict = rf.predict(X_test_scaled)

In [107]:
rf_actual = y_test

actual = np.array(rf_actual)
actual_df = pd.DataFrame(rf_actual)
actual_df["Predicted"] = rf_predict
comparison = actual_df.rename(columns = {0: "Actual"})
comparison.head(10)

Unnamed: 0,category,Predicted
39750,White,White
24369,Red,Red
2901,White,White
42881,Red,Red
71402,Red,Red
5034,White,White
29238,Red,Red
3559,White,White
74252,Red,Red
49968,Red,Red


In [108]:
# Save model

filename = 'category_rf.sklearn'
pickle.dump(rf, open(filename, 'wb'))

In [158]:
input_list = ['ripe', 'crisp', 'soft', 'US', 'Washington', 'White']

points_grouped = 4

input_dict = {}

for name in X.columns:
    if name in input_list:
        input_dict[name] = [1]
    else:
        input_dict[name] = [0]
        
for item in input_dict['points_grouped']:
    item += points_grouped


input_dict

{'White': [1],
 'Red': [0],
 'ripe': [1],
 'crisp': [1],
 'mature': [0],
 'tropical': [0],
 'rich': [0],
 'sweet': [0],
 'fresh': [0],
 'honeyed': [0],
 'fruity': [0],
 'smooth': [0],
 'soft': [1],
 'bright': [0],
 'dry': [0],
 'earthy': [0],
 'rubbery': [0],
 'savory': [0],
 'vanilla': [0],
 'bitter': [0],
 'intense': [0],
 'traditional': [0],
 'nutty': [0],
 'country_Argentina': [0],
 'country_Australia': [0],
 'country_Austria': [0],
 'country_Brazil': [0],
 'country_Bulgaria': [0],
 'country_Canada': [0],
 'country_Chile': [0],
 'country_Croatia': [0],
 'country_Cyprus': [0],
 'country_Czech Republic': [0],
 'country_Egypt': [0],
 'country_England': [0],
 'country_France': [0],
 'country_Georgia': [0],
 'country_Germany': [0],
 'country_Greece': [0],
 'country_Hungary': [0],
 'country_India': [0],
 'country_Israel': [0],
 'country_Italy': [0],
 'country_Lebanon': [0],
 'country_Luxembourg': [0],
 'country_Macedonia': [0],
 'country_Mexico': [0],
 'country_Moldova': [0],
 'country_M

In [142]:
test_df = pd.DataFrame.from_dict(input_dict, orient='columns')

test_df.head()

Unnamed: 0,White,Red,ripe,crisp,mature,tropical,rich,sweet,fresh,honeyed,...,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Württemberg,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa,points_grouped
0,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
test_scaled = X_scaler.transform(test_df)

In [144]:
rf_predict = rf.predict(test_scaled)

rf_predict

array(['Chardonnay'], dtype=object)