In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## sklearn basics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

## Decision trees
from sklearn import tree

## Random Forest
from sklearn.ensemble import RandomForestClassifier

## knn
from sklearn.neighbors import KNeighborsClassifier


In [None]:
wine_class = pd.read_csv('Resources/Wine_varieties_classified.csv')
del wine_class['Unnamed: 0']
wine_class['Red'] = wine_class['is_red'] + 1

wine_class = wine_class[['variety', 'is_white', 'Red']]
wine_class.columns = ['variety', 'White', 'Red']

wine_class.fillna(0)

In [None]:
wines_to_keep = wine_class.variety.tolist()
wines_to_keep

In [None]:
full_df = pd.read_csv('Resources/Wine_Reviews_ML.csv')
full_df

In [None]:
subset = full_df.loc[full_df.variety.isin(wines_to_keep)]

subset

In [None]:
merge_df = subset.merge(wine_class, on='variety', suffixes = ("",""))

merge_df

In [None]:
adjective_list = ["ripe", "crisp", "mature", "tropical", "rich", "sweet", "herbal", "full-bodied", "fresh", "exotic",
                  "floral", "honeyed", "fruity", "smooth", "soft", "bright", "vibrant", "elegant", "dry", "earthy",
                  "rubbery", "tannic", "aromatic", "savory", "meaty", "vanilla", "candied", "toasted", "rare", 
                  "delicate", "smoky", "bitter", "ginger", "fragrant", "layered", "intense", "traditional", "nutty",
                  "balanced", "lemony", "citric", "rose", "well-known", "little-known"]

In [None]:
for item in adjective_list:
    merge_df[item] = 0

for item in adjective_list:
    for index, row in merge_df.iterrows():
        merge_df[item][index] += merge_df['description_split'][index].count(item)
        

In [None]:
del merge_df['description']

merge_df.to_csv('Resources/Filtered_ML.csv', index=False)

In [3]:
merge_df = pd.read_csv('Resources/Filtered_ML.csv')
merge_df.head()

Unnamed: 0,country,points,province,variety,description_split,White,Red,ripe,crisp,mature,...,layered,intense,traditional,nutty,balanced,lemony,citric,rose,well-known,little-known
0,Italy,87,Sicily & Sardinia,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br...",1.0,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Italy,87,Sicily & Sardinia,White Blend,"['Delicate', 'aromas', 'recall', 'white', 'flo...",1.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Italy,87,Sicily & Sardinia,White Blend,"['Pretty', 'aromas', 'yellow', 'flower', 'ston...",1.0,,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Italy,86,Sicily & Sardinia,White Blend,"['Part', 'extended', 'Calanìca', 'series', 'Gr...",1.0,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Italy,87,Tuscany,White Blend,"['Made', 'predominantly', 'Trebbiano', 'Malvas...",1.0,,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
merge_df = pd.get_dummies(merge_df, columns=['country','province'], prefix=None)

merge_df.head()

Unnamed: 0,points,variety,description_split,White,Red,ripe,crisp,mature,tropical,rich,...,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Württemberg,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa
0,87,White Blend,"['Aromas', 'include', 'tropical', 'fruit', 'br...",1.0,,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,87,White Blend,"['Delicate', 'aromas', 'recall', 'white', 'flo...",1.0,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,87,White Blend,"['Pretty', 'aromas', 'yellow', 'flower', 'ston...",1.0,,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,86,White Blend,"['Part', 'extended', 'Calanìca', 'series', 'Gr...",1.0,,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,87,White Blend,"['Made', 'predominantly', 'Trebbiano', 'Malvas...",1.0,,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Set features. This will also be used as your x values.
merge_df = merge_df.drop(["description_split"], axis=1)
merge_df['White'] = merge_df['White'].fillna(0)
merge_df['Red'] = merge_df['Red'].fillna(0)


merge_df.to_csv("Resources/Filtered_ML_matrix_forVar.csv", index=False)

merge_df.head()

In [None]:
X = merge_df.drop(["variety"], axis=1)
y = merge_df["variety"]

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)


X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=42)

X_train.head()

In [None]:
# Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [15]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [21]:
merge_df['White'] = merge_df['White'].map({1: 'White', 0: 'Red'})
merge_df = merge_df.rename(columns={'White': 'Category'})

del merge_df['Red']


In [29]:
merge_df = pd.get_dummies(merge_df, columns=['variety'], prefix=None)

merge_df.to_csv("Resources/Filtered_ML_matrix_forCat.csv", index=False)


In [32]:
X = merge_df.drop(["Category"], axis=1)
y = merge_df["Category"]

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)


X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=42)

X_train.head()

Unnamed: 0,points,ripe,crisp,mature,tropical,rich,sweet,herbal,full-bodied,fresh,...,variety_Verdejo,variety_Verdicchio,variety_Vermentino,variety_Vernaccia,variety_Vidal Blanc,variety_Vilana,variety_Viognier,variety_Viura,variety_White Blend,variety_Zinfandel
23984,87,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12897,86,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
27795,87,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27344,92,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
64132,87,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Scale your data

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [34]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

In [35]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9994789939847487

In [None]:
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")