In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## sklearn basics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

## Decision trees
from sklearn import tree

## Random Forest
from sklearn.ensemble import RandomForestClassifier

## knn
from sklearn.neighbors import KNeighborsClassifier


In [12]:
wine_class = pd.read_csv('Resources/Wine_varieties_classified.csv')
del wine_class['Unnamed: 0']
wine_class['Red'] = wine_class['is_red'] + 1

wine_class = wine_class[['variety', 'is_white', 'Red']]
wine_class.columns = ['variety', 'White', 'Red']

wine_class.fillna(0)

Unnamed: 0,variety,White,Red
0,Agiorgitiko,0.0,1.0
1,Aglianico,0.0,1.0
2,Alvarinho,1.0,0.0
3,Arneis,1.0,0.0
4,Assyrtiko,1.0,0.0
...,...,...,...
86,Vilana,1.0,0.0
87,Viognier,1.0,0.0
88,Viura,1.0,0.0
89,White Blend,1.0,0.0


In [5]:
wines_to_keep = wine_class.variety.tolist()
wines_to_keep

['Agiorgitiko',
 'Aglianico',
 'Alvarinho',
 'Arneis',
 'Assyrtiko',
 'Barbera',
 'Bonarda',
 'Brachetto',
 'Cabernet Franc',
 'Cabernet Sauvignon',
 'Cannonau',
 'Carignan',
 'Carricante',
 'Catarratto',
 'Chambourcin',
 'Charbono',
 'Chardonnay',
 'Chenin Blanc',
 'Cinsault',
 'Coda di Volpe',
 'Cortese',
 'Corvina',
 'Dolcetto',
 'Dornfelder',
 'Falanghina',
 'Feteasca Neagra',
 'Fiano',
 'Frappato',
 'Friulano',
 'Furmint',
 'Gaglioppo',
 'Gamay',
 'Garganega',
 'Godello',
 'Graciano',
 'Grenache',
 'Inzolia',
 'Lagrein',
 'Macabeo',
 'Malbec',
 'Malvasia',
 'Marsanne',
 'Merlot',
 'Monica',
 'Montepulciano',
 'Nebbiolo',
 'Negroamaro',
 'Nerello Mascalese',
 "Nero d'Avola",
 'Norton',
 'Pecorino',
 'Petit Verdot',
 'Petite Sirah',
 'Picpoul',
 'Piedirosso',
 'Pinot Blanc',
 'Pinot Meunier',
 'Pinot Noir',
 'Pinotage',
 'Plavac Mali',
 'Primitivo',
 'Red Blend',
 'Refosco',
 'Ribolla Gialla',
 'Riesling',
 'Rkatsiteli',
 'Roditis',
 'Roussanne',
 'Sagrantino',
 'Sangiovese',
 'Sape

In [6]:
full_df = pd.read_csv('Resources/Wine_Reviews_ML.csv')
full_df

Unnamed: 0,country,description,points,province,variety,description_split
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br..."
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,Douro,Portuguese Red,"['is', 'ripe', 'fruity', 'wine', 'is', 'smooth..."
2,US,"Tart and snappy, the flavors of lime flesh and...",87,Oregon,Pinot Gris,"['Tart', 'snappy', 'flavors', 'lime', 'flesh',..."
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling,"['Pineapple', 'rind', 'lemon', 'pith', 'orange..."
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir,"['Much', 'like', 'regular', 'bottling', '2012'..."
...,...,...,...,...,...,...
129728,Germany,Notes of honeysuckle and cantaloupe sweeten th...,90,Mosel,Riesling,"['Notes', 'honeysuckle', 'cantaloupe', 'sweete..."
129729,US,Citation is given as much as a decade of bottl...,90,Oregon,Pinot Noir,"['Citation', 'given', 'much', 'a', 'decade', '..."
129730,France,Well-drained gravel soil gives this wine its c...,90,Alsace,Gewürztraminer,"['Well', 'drained', 'gravel', 'soil', 'gives',..."
129731,France,"A dry style of Pinot Gris, this is crisp with ...",90,Alsace,Pinot Gris,"['dry', 'style', 'Pinot', 'Gris', 'is', 'crisp..."


In [7]:
subset = full_df.loc[full_df.variety.isin(wines_to_keep)]

subset

Unnamed: 0,country,description,points,province,variety,description_split
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br..."
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,Michigan,Riesling,"['Pineapple', 'rind', 'lemon', 'pith', 'orange..."
4,US,"Much like the regular bottling from 2012, this...",87,Oregon,Pinot Noir,"['Much', 'like', 'regular', 'bottling', '2012'..."
6,Italy,"Here's a bright, informal red that opens with ...",87,Sicily & Sardinia,Frappato,"['Here', 'a', 'bright', 'informal', 'red', 'op..."
10,US,"Soft, supple plum envelopes an oaky structure ...",87,California,Cabernet Sauvignon,"['Soft', 'supple', 'plum', 'envelopes', 'oaky'..."
...,...,...,...,...,...,...
129723,Italy,"Intense aromas of wild cherry, baking spice, t...",90,Sicily & Sardinia,Frappato,"['Intense', 'aromas', 'wild', 'cherry', 'bakin..."
129724,Italy,"Blackberry, cassis, grilled herb and toasted a...",90,Sicily & Sardinia,Nero d'Avola,"['Blackberry', 'cassis', 'grilled', 'herb', 't..."
129725,Israel,"A bouquet of black cherry, tart cranberry and ...",90,Galilee,Cabernet Sauvignon,"['bouquet', 'black', 'cherry', 'tart', 'cranbe..."
129728,Germany,Notes of honeysuckle and cantaloupe sweeten th...,90,Mosel,Riesling,"['Notes', 'honeysuckle', 'cantaloupe', 'sweete..."


In [13]:
merge_df = subset.merge(wine_class, on='variety', suffixes = ("",""))

merge_df

Unnamed: 0,country,description,points,province,variety,description_split,White,Red
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br...",1.0,
1,Italy,Delicate aromas recall white flower and citrus...,87,Sicily & Sardinia,White Blend,"['Delicate', 'aromas', 'recall', 'white', 'flo...",1.0,
2,Italy,Pretty aromas of yellow flower and stone fruit...,87,Sicily & Sardinia,White Blend,"['Pretty', 'aromas', 'yellow', 'flower', 'ston...",1.0,
3,Italy,"Part of the extended Calanìca series, this Gri...",86,Sicily & Sardinia,White Blend,"['Part', 'extended', 'Calanìca', 'series', 'Gr...",1.0,
4,Italy,Made predominantly with Trebbiano and Malvasia...,87,Tuscany,White Blend,"['Made', 'predominantly', 'Trebbiano', 'Malvas...",1.0,
...,...,...,...,...,...,...,...,...
84447,Greece,This restina balances traditional flavor with ...,90,Retsina,Savatiano,"['restina', 'balances', 'traditional', 'flavor...",1.0,
84448,Greece,This retsina offers a traditional style with i...,84,Attica,Savatiano,"['retsina', 'offers', 'traditional', 'style', ...",1.0,
84449,Greece,The Savatiano-whisperer Papagiannakos continue...,90,Markopoulo,Savatiano,"['Savatiano', 'whisperer', 'Papagiannakos', 'c...",1.0,
84450,Greece,"Crushed thyme, pine resin and lemon start this...",86,Attica,Savatiano,"['Crushed', 'thyme', 'pine', 'resin', 'lemon',...",1.0,


In [15]:
adjective_list = ["ripe", "crisp", "mature", "tropical", "rich", "sweet", "herbal", "full-bodied", "fresh", "exotic",
                  "floral", "honeyed", "fruity", "smooth", "soft", "bright", "vibrant", "elegant", "dry", "earthy",
                  "rubbery", "tannic", "aromatic", "savory", "meaty", "vanilla", "candied", "toasted", "rare", 
                  "delicate", "smoky", "bitter", "ginger", "fragrant", "layered", "intense", "traditional", "nutty",
                  "balanced", "lemony", "citric", "rose", "well-known", "little-known"]

In [16]:
for item in adjective_list:
    merge_df[item] = 0

for item in adjective_list:
    for index, row in merge_df.iterrows():
        merge_df[item][index] += merge_df['description_split'][index].count(item)
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
del merge_df['description']

merge_df.to_csv('Resources/Filtered_ML.csv', index=False)

In [2]:
merge_df = pd.read_csv('Resources/Filtered_ML.csv')
merge_df.head()

Unnamed: 0,country,points,province,variety,description_split,White,Red,ripe,crisp,mature,...,layered,intense,traditional,nutty,balanced,lemony,citric,rose,well-known,little-known
0,Italy,87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br...",1.0,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Italy,87,Sicily & Sardinia,White Blend,"['Delicate', 'aromas', 'recall', 'white', 'flo...",1.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Italy,87,Sicily & Sardinia,White Blend,"['Pretty', 'aromas', 'yellow', 'flower', 'ston...",1.0,,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Italy,86,Sicily & Sardinia,White Blend,"['Part', 'extended', 'Calanìca', 'series', 'Gr...",1.0,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Italy,87,Tuscany,White Blend,"['Made', 'predominantly', 'Trebbiano', 'Malvas...",1.0,,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
country = merge_df.groupby('country').sum().index.tolist()

for c in country:
    merge_df[c] = 0
    for index, row in merge_df.iterrows():
        if merge_df['country'][index] == c:
            merge_df[c][index] += 1
    

merge_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,country,points,province,variety,description_split,White,Red,ripe,crisp,mature,...,Serbia,Slovakia,Slovenia,South Africa,Spain,Switzerland,Turkey,US,Ukraine,Uruguay
0,Italy,87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br...",1.0,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Italy,87,Sicily & Sardinia,White Blend,"['Delicate', 'aromas', 'recall', 'white', 'flo...",1.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Italy,87,Sicily & Sardinia,White Blend,"['Pretty', 'aromas', 'yellow', 'flower', 'ston...",1.0,,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Italy,86,Sicily & Sardinia,White Blend,"['Part', 'extended', 'Calanìca', 'series', 'Gr...",1.0,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Italy,87,Tuscany,White Blend,"['Made', 'predominantly', 'Trebbiano', 'Malvas...",1.0,,2,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84447,Greece,90,Retsina,Savatiano,"['restina', 'balances', 'traditional', 'flavor...",1.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84448,Greece,84,Attica,Savatiano,"['retsina', 'offers', 'traditional', 'style', ...",1.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84449,Greece,90,Markopoulo,Savatiano,"['Savatiano', 'whisperer', 'Papagiannakos', 'c...",1.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84450,Greece,86,Attica,Savatiano,"['Crushed', 'thyme', 'pine', 'resin', 'lemon',...",1.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
province = merge_df.groupby('province').sum().index.tolist()

for p in province:
    merge_df[p] = 0
    for index, row in merge_df.iterrows():
        if merge_df['province'][index] == p:
            merge_df[p][index] += 1
    

merge_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [22]:
# Set features. This will also be used as your x values.
merge_df = merge_df.drop(["country", "province", "description_split"], axis=1)
merge_df.to_csv("Filtered_ML_matrix.csv", index=False)



Unnamed: 0,country,points,province,description_split,White,Red,ripe,crisp,mature,tropical,...,layered,intense,traditional,nutty,balanced,lemony,citric,rose,well-known,little-known
23984,US,87,Washington,"['wine', 'displays', 'aromas', 'green', 'herbs...",,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12897,Chile,86,Leyda Valley,"['Cool', 'climate', 'aromas', 'latex', 'rubber...",,1.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27795,US,87,California,"['is', 'appealing', 'Cabernet', 'spicy', 'cola...",,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27344,Chile,92,Maipo Valley,"['Plum', 'currant', 'aromas', 'supported', 'as...",,1.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64132,US,87,California,"['is', 'highly', 'structured', 'wine', 'an', '...",,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X = merge_df.drop(["variety"], axis=1)
y = subset["variety"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

In [23]:
# Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'US'

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)