In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from nltk.corpus import wordnet as wn

## sklearn basics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import pydot

## pickle
import pickle
from pickle import dump
import json

# Load Angelica's Wine Category Dataset

In [None]:
wine_class = pd.read_csv('Resources/Wine_varieties_classified.csv')
del wine_class['Unnamed: 0']
wine_class['Red'] = wine_class['is_red'] + 1

wine_class = wine_class[['variety', 'is_white', 'Red']]
wine_class.columns = ['variety', 'White', 'Red']

wine_class = wine_class.fillna(0)

wine_class.head()

# Convert wine variety to list to filter main dataset

In [None]:
wines_to_keep = wine_class.variety.tolist()

# Load the Reviews Dataset

In [None]:
full_df = pd.read_csv('Resources/Wine_Reviews_ML.csv')
full_df.head()

# Filter the dataset by the list of wine varieties

In [None]:
subset = full_df.loc[full_df.variety.isin(wines_to_keep)]

subset.head()

# Merge datasets and save a dictionary of country and provinces as json

In [None]:
merge_df = subset.merge(wine_class, on='variety', suffixes = ("",""))

countries = merge_df.country.tolist()

country_province = {}

for country in countries:
    temp_df = merge_df.loc[merge_df.country == country]
    provinces = temp_df.province.tolist()
    provinces_cleaned = []
    for province in provinces:
        if province not in provinces_cleaned:
            provinces_cleaned.append(province)
    country_province[country] = provinces_cleaned
    
with open('Resources/Country_Province.json', 'w') as fp:
    json.dump(country_province, fp)

merge_df.head()

# Get synonym list of most frequent adjectives from Wordnet

In [None]:
adjective_list = ["ripe", "crisp", "mature", "tropical", "rich", "sweet", "fresh", "honeyed", "fruity", "smooth", 
                  "soft", "bright", "dry", "earthy", "rubbery", "savory", "vanilla", "bitter", "intense", 
                  "traditional", "nutty"]

refined_dict = {}

for item in adjective_list:
    synonym = wn.synsets(item, pos=wn.ADJ)
    temp_list = []
    for lemma in synonym:
        l = lemma.name()
        l = l.split(".")[0]
        if l not in temp_list:
            temp_list.append(l)
            
    refined_dict[item] = temp_list
    
refined_dict

# Iterate through dataframe rows to create columns for each adjective on the list and record frequency for that adjective or its synonym

In [None]:
for item in adjective_list:
    merge_df[item] = 0

for item in adjective_list:
    syn_list = refined_dict[item]
    for index, row in merge_df.iterrows():
        for syn in syn_list:
            merge_df[item][index] += merge_df['description_split'][index].count(syn)
        

# Drop description rows now that we have frequencies

In [None]:
merge_df = merge_df.drop(["description", "description_split"], axis=1)

merge_df = pd.get_dummies(merge_df, columns=['country','province'], prefix='', prefix_sep='')

def f(row):
    if (row['points'] <= 100) and (row['points'] >= 95):
        val = 4
    elif (row['points'] < 95) and (row['points'] >= 90):
        val = 3
    elif (row['points'] < 90) and (row['points'] >= 85):
        val = 2
    else:
        val = 1
    return val

merge_df['points_grouped'] = merge_df.apply(f, axis=1)

merge_df = merge_df.drop(["points"], axis=1)

merge_df.to_csv("Resources/Filtered_ML_matrix_forVar.csv", index=False)

merge_df.head()

In [2]:
##Reopen DF

merge_df = pd.read_csv('Resources/Filtered_ML_matrix_forVar.csv')
merge_df.head()

Unnamed: 0,variety,White,Red,ripe,crisp,mature,tropical,rich,sweet,fresh,...,Western Australia,Western Cape,Wiener Gemischter Satz,Württemberg,Zenata,Österreichischer Perlwein,Österreichischer Sekt,Štajerska,Župa,points_grouped
0,White Blend,1.0,0.0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,White Blend,1.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,White Blend,1.0,0.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,White Blend,1.0,0.0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,White Blend,1.0,0.0,2,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


# Start machine learning!

## First, wine variety

In [3]:
##Set features. This will also be used as your x values.
X = merge_df.drop(["variety"], axis=1)
y = merge_df["variety"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,White,Red,ripe,crisp,mature,tropical,rich,sweet,fresh,honeyed,...,Western Australia,Western Cape,Wiener Gemischter Satz,Württemberg,Zenata,Österreichischer Perlwein,Österreichischer Sekt,Štajerska,Župa,points_grouped
23984,0.0,1.0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,2
12897,0.0,1.0,2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
27795,0.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
27344,0.0,1.0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
64132,0.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [4]:
##Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [5]:
##Random Forests
rf = RandomForestClassifier(n_estimators=400)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.5710699568985933

In [6]:
##Predict
rf_predict = rf.predict(X_test_scaled)

In [7]:
##Compare to actual
rf_actual = y_test

actual = np.array(rf_actual)
actual_df = pd.DataFrame(rf_actual)
actual_df["Predicted"] = rf_predict
comparison = actual_df.rename(columns = {0: "Actual"})
comparison.head(10)

Unnamed: 0,variety,Predicted
39750,Chardonnay,Chardonnay
24369,Cabernet Sauvignon,Cabernet Sauvignon
2901,Riesling,Chardonnay
42881,Malbec,Pinot Noir
71402,Zinfandel,Cabernet Sauvignon
5034,Riesling,Riesling
29238,Cabernet Sauvignon,Pinot Noir
3559,Riesling,Riesling
74252,Nebbiolo,Nebbiolo
49968,Red Blend,Red Blend


In [None]:
##Visualize

estimator = rf.estimators_[5]

# Export as dot file
export_graphviz(estimator, out_file='tree_var.dot', 
                feature_names = X.columns,
                rounded = True, precision = 1)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree_var.dot', '-o', 'tree_var.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree_var.png')

In [None]:
##Save model

filename = 'variety_rf.h5'
pickle.dump(rf, open(filename, 'wb'))

##Save the scaler
pickle.dump(X_scaler, open('X_scaler.h5', 'wb'))

## Predict with input data

In [12]:
variables = X.columns.tolist()

variables.remove('points_grouped')

input_list = ['dry', 'crisp', 'White', 'France']

points_grouped = 4

input_dict = {}

for name in variables:
    if name in input_list:
        input_dict[name] = [1]
    else:
        input_dict[name] = [0]
        
input_dict['points_grouped'] = [points_grouped]

In [13]:
test_df = pd.DataFrame.from_dict(input_dict, orient='columns')

test_df.head()

Unnamed: 0,White,Red,ripe,crisp,mature,tropical,rich,sweet,fresh,honeyed,...,Western Australia,Western Cape,Wiener Gemischter Satz,Württemberg,Zenata,Österreichischer Perlwein,Österreichischer Sekt,Štajerska,Župa,points_grouped
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [14]:
test_scaled = X_scaler.transform(test_df)

In [15]:
rf_predict = rf.predict(test_scaled)

rf_predict

array(['Chardonnay'], dtype=object)

In [None]:
merge_df['White'] = merge_df['White'].map({1: 'White', 0: 'Red'})
merge_df = merge_df.rename(columns={'White': 'category'})

del merge_df['Red']

merge_df = pd.get_dummies(merge_df, columns=['variety'], prefix=None)

merge_df.to_csv("Resources/Filtered_ML_matrix_forCat.csv", index=False)

merge_df = pd.read_csv("Resources/Filtered_ML_matrix_forCat.csv")

merge_df.head()

In [None]:
##Set features

X = merge_df.drop(["category"], axis=1)
y = merge_df["category"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

In [None]:
##Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
rf = RandomForestClassifier(n_estimators=400)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
##Predict

rf_predict = rf.predict(X_test_scaled)

In [None]:
##Compare to actual
rf_actual = y_test

actual = np.array(rf_actual)
actual_df = pd.DataFrame(rf_actual)
actual_df["Predicted"] = rf_predict
comparison = actual_df.rename(columns = {0: "Actual"})
comparison.head(10)

In [None]:
rf_predict = rf.predict(X_test_scaled)

In [None]:
rf_actual = y_test

actual = np.array(rf_actual)
actual_df = pd.DataFrame(rf_actual)
actual_df["Predicted"] = rf_predict
comparison = actual_df.rename(columns = {0: "Actual"})
comparison.head(10)