In [None]:
## pickle
import pickle

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## sklearn basics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

## Decision Tree
from sklearn import tree

## Random Forest
from sklearn.ensemble import RandomForestClassifier

## knn
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the variety classification table and reduce it to three columns:
# the variety name plus a White flag and a Red flag.
wine_class = pd.read_csv('Resources/Wine_varieties_classified.csv')
del wine_class['Unnamed: 0']

# NOTE(review): Red is derived as `is_red + 1`; presumably is_red holds 0 for
# red varieties and is missing otherwise -- confirm against the CSV.
wine_class['Red'] = wine_class['is_red'] + 1

wine_class = wine_class[['variety', 'is_white', 'Red']]
wine_class.columns = ['variety', 'White', 'Red']

# fillna returns a new frame, so the result must be assigned back or the
# missing flags stay NaN. Only the two flag columns are filled so that a
# missing variety name is never turned into the value 0.
wine_class = wine_class.fillna({'White': 0, 'Red': 0})

wine_class

In [None]:
# Names of every classified variety; used to filter the review data set.
wines_to_keep = wine_class['variety'].tolist()
wines_to_keep

In [None]:
# Read the complete wine review data set from disk.
full_df = pd.read_csv(filepath_or_buffer='Resources/Wine_Reviews_ML.csv')
full_df

In [None]:
# Keep only the reviews whose variety appears in the classification table.
subset = full_df[full_df['variety'].isin(wines_to_keep)]

subset

In [None]:
# Attach the White / Red flags to every review by joining on the variety name.
merge_df = pd.merge(subset, wine_class, on='variety', suffixes=("", ""))

merge_df

In [None]:
# Descriptor words that are counted in each review; one feature column is
# created per entry.
adjective_list = [
    "ripe", "crisp", "mature", "tropical", "rich",
    "sweet", "herbal", "full-bodied", "fresh", "exotic",
    "floral", "honeyed", "fruity", "smooth", "soft",
    "bright", "vibrant", "elegant", "dry", "earthy",
    "rubbery", "tannic", "aromatic", "savory", "meaty",
    "vanilla", "candied", "toasted", "rare", "delicate",
    "smoky", "bitter", "ginger", "fragrant", "layered",
    "intense", "traditional", "nutty", "balanced", "lemony",
    "citric", "rose", "well-known", "little-known",
]

In [None]:
# Add one integer column per adjective holding how often that adjective occurs
# in the row's `description_split` value.
#
# Each column is built in one pass and assigned as a whole. Writing cell by
# cell through `merge_df[item][index]` is chained assignment: pandas warns
# about it, and with Copy-on-Write enabled the write lands on a temporary
# object, leaving every count at 0. Row-wise `iterrows` is also very slow.
#
# `.count` is called on each element as-is, so a string value counts substring
# occurrences while a list value counts exactly-equal elements.
for item in adjective_list:
    merge_df[item] = (
        merge_df['description_split']
        .map(lambda description, adjective=item: description.count(adjective))
        .astype(int)
    )
        

In [None]:
# Drop the 'description' column, then persist the frame without its index.
merge_df.drop(columns='description', inplace=True)

merge_df.to_csv(path_or_buf='Resources/Filtered_ML.csv', index=False)

In [None]:
# Reload the filtered reviews from the saved CSV and preview the first rows.
merge_df = pd.read_csv(filepath_or_buffer='Resources/Filtered_ML.csv')
merge_df.head(n=5)

In [None]:
# One-hot encode the two location columns; all other columns pass through.
location_columns = ['country', 'province']
merge_df = pd.get_dummies(data=merge_df, columns=location_columns)

merge_df.head(n=5)

In [None]:
# Finalise the matrix used for the variety model: remove the
# 'description_split' column and replace missing White / Red flags with 0,
# then write the result to disk without the index.
merge_df = merge_df.drop(columns=["description_split"])
merge_df = merge_df.fillna({'White': 0, 'Red': 0})

merge_df.to_csv(path_or_buf="Resources/Filtered_ML_matrix_forVar.csv", index=False)

In [None]:
# Reload the variety feature matrix from disk and preview the first rows.
merge_df = pd.read_csv(filepath_or_buffer="Resources/Filtered_ML_matrix_forVar.csv")

merge_df.head(n=5)

In [None]:
# The target is the variety name; the features are every remaining column.
y = merge_df["variety"]
X = merge_df.drop(columns=["variety"])

# Encode the variety names as integer class ids.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

# Split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    random_state=42,
)

X_train.head(n=5)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
X_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a decision tree on the unscaled training features and report its
# accuracy on the test split.
# A fixed random_state (matching the seed used for train_test_split) makes the
# fitted tree, and therefore the score, identical on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Fit a 200-tree random forest on the unscaled training features and report
# its accuracy on the test split.
# The forest draws random bootstrap samples and feature subsets, so the seed
# is pinned (same value as train_test_split) to keep the score reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Sweep odd neighbour counts from 1 to 19 and record the kNN accuracy on the
# scaled train and test splits for each value of k.
train_scores, test_scores = [], []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)

    train_score, test_score = (
        knn.score(X_train_scaled, y_train),
        knn.score(X_test_scaled, y_test),
    )
    train_scores += [train_score]
    test_scores += [test_score]

    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Turn the numeric White flag into a text label column named 'Category'
# (1 -> 'White', 0 -> 'Red') and remove the now redundant Red flag.
merge_df = merge_df.rename(columns={'White': 'Category'})
merge_df['Category'] = merge_df['Category'].map({1: 'White', 0: 'Red'})

merge_df.drop(columns='Red', inplace=True)


In [None]:
# One-hot encode the variety column, then save the category feature matrix
# to disk without the index.
merge_df = pd.get_dummies(data=merge_df, columns=['variety'])

merge_df.to_csv(path_or_buf="Resources/Filtered_ML_matrix_forCat.csv", index=False)


In [None]:
# Reload the category feature matrix from disk and preview the first rows.
merge_df = pd.read_csv(filepath_or_buffer="Resources/Filtered_ML_matrix_forCat.csv")

merge_df.head(n=5)

In [None]:
# The target is the Category label; the features are every remaining column.
y = merge_df.loc[:, "Category"]
X = merge_df.drop(columns=["Category"])

# Encode the Category labels as integer class ids.
label_encoder = LabelEncoder().fit(y)
encoded_y = label_encoder.transform(y)

# Split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    random_state=42,
)

X_train.head(n=5)

In [None]:
# Standardise the features: fit the scaler on the training split only, then
# apply that same fitted scaler to both splits.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Fit a decision tree on the unscaled training features and report its
# accuracy on the test split.
# A fixed random_state (matching the seed used for train_test_split) makes the
# fitted tree, and therefore the score, identical on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Fit a 200-tree random forest on the unscaled training features and report
# its accuracy on the test split.
# The forest draws random bootstrap samples and feature subsets, so the seed
# is pinned (same value as train_test_split) to keep the score reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Save model
#
# Each object is pickled inside a `with` block so the file handle is flushed
# and closed as soon as the dump finishes.
#
# NOTE(review): `count_vect` and `tfidf_transformer` are not defined in any
# cell visible in this notebook; on a fresh kernel this cell raises NameError
# unless they are created elsewhere -- confirm where they should come from.
# NOTE(review): the file names look swapped relative to the variable names
# (`vectorizer_file` -> "tokenizer.sklearn", `tokenizer_file` ->
# "vectorizer.sklearn"). They are left unchanged because other code may load
# these files by name -- confirm before renaming.

vectorizer_file = "tokenizer.sklearn"
with open(vectorizer_file, 'wb') as handle:
    pickle.dump(count_vect, handle)

tokenizer_file = "vectorizer.sklearn"
with open(tokenizer_file, 'wb') as handle:
    pickle.dump(tfidf_transformer, handle)

NBModel = 'sentiment_scoring.sklearn'
with open(NBModel, 'wb') as handle:
    pickle.dump(clf, handle)

In [None]:
# Reusing model
#
# Each pickle is read inside a `with` block so the file handle is closed as
# soon as loading finishes.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load files that were written by a trusted source.
with open(vectorizer_file, 'rb') as handle:
    vectorizer = pickle.load(handle)
with open(tokenizer_file, 'rb') as handle:
    tokenizer = pickle.load(handle)
with open(NBModel, 'rb') as handle:
    nbModel = pickle.load(handle)

# NOTE(review): the words are passed as six separate items, so presumably one
# prediction is produced per word rather than one for the whole description
# -- confirm this is intended.
user_input = ['fruity', 'apple', 'dry', 'chocolate', 'red', 'italy']
X_new = vectorizer.transform(user_input)
X_new = tokenizer.transform(X_new)
result = nbModel.predict(X_new)
print(result)