# ML Component
kNN and k-means will be used

In [1]:
import pandas as pd
import numpy as np
import csv
from nltk.tokenize import TreebankWordTokenizer 

In [2]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.decomposition import PCA
from collections import Counter
import matplotlib.pyplot as plt

In [3]:
seasons = pd.read_csv('4300/categories-only_seasons.csv')
fixed = pd.read_csv('4300/fixed_dataset.csv')
temperature = pd.read_csv('4300/dataset_newest.csv')

## Fixed dataset details

In [4]:
fixed.columns   

Index(['base_spirits', 'name', 'url', 'name_words', 'description', 'image',
       'ingredients', 'Length', 'url.1', 'num_reviews', 'rating',
       'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')

In [5]:
fixed.head(3)

Unnamed: 0,base_spirits,name,url,name_words,description,image,ingredients,Length,url.1,num_reviews,rating,Unnamed: 11,Unnamed: 12
0,['gin'],50-50 martini,https://www.thespruceeats.com/50-50-gin-martin...,"['50-50', 'martini']",The 50-50 martini is the mix for those who emb...,https://www.thespruceeats.com/thmb/PNxhU3pehj-...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...",3,https://www.thespruceeats.com/50-50-gin-martin...,24,4.4,,
1,['gin'],abbey cocktail,https://www.thespruceeats.com/abbey-cocktail-r...,"['abbey', 'cocktail']",The Abbey Cocktail makes an excellent brunch c...,https://www.thespruceeats.com/thmb/gc7MpJ85PeA...,"['2 ounces gin', '1 1/2 ounces orange juice', ...",4,https://www.thespruceeats.com/abbey-cocktail-r...,14,3.9,,
2,['absinthe'],absinthe cocktail,https://www.thespruceeats.com/absinthe-cocktai...,"['absinthe', 'cocktail']",This absinthe cocktail is truly a classic cock...,https://www.thespruceeats.com/thmb/dkkcc3vR008...,"['1 ounce absinthe', '1 ounce ice water (cold)...",4,https://www.thespruceeats.com/absinthe-cocktai...,7,4.4,,


In [6]:
# remove col 8,11,12
fixed = fixed.drop(columns = ['url.1', 'Unnamed: 11', 'Unnamed: 12'])

In [7]:
fixed.columns

Index(['base_spirits', 'name', 'url', 'name_words', 'description', 'image',
       'ingredients', 'Length', 'num_reviews', 'rating'],
      dtype='object')

In [8]:
fixed.head(3)

Unnamed: 0,base_spirits,name,url,name_words,description,image,ingredients,Length,num_reviews,rating
0,['gin'],50-50 martini,https://www.thespruceeats.com/50-50-gin-martin...,"['50-50', 'martini']",The 50-50 martini is the mix for those who emb...,https://www.thespruceeats.com/thmb/PNxhU3pehj-...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...",3,24,4.4
1,['gin'],abbey cocktail,https://www.thespruceeats.com/abbey-cocktail-r...,"['abbey', 'cocktail']",The Abbey Cocktail makes an excellent brunch c...,https://www.thespruceeats.com/thmb/gc7MpJ85PeA...,"['2 ounces gin', '1 1/2 ounces orange juice', ...",4,14,3.9
2,['absinthe'],absinthe cocktail,https://www.thespruceeats.com/absinthe-cocktai...,"['absinthe', 'cocktail']",This absinthe cocktail is truly a classic cock...,https://www.thespruceeats.com/thmb/dkkcc3vR008...,"['1 ounce absinthe', '1 ounce ice water (cold)...",4,7,4.4


# Categories/Seaons dataset details

In [9]:
seasons.columns  

Index(['Unnamed: 0', 'url', 'categories', 'season'], dtype='object')

In [10]:
seasons.head(3)

Unnamed: 0.1,Unnamed: 0,url,categories,season
0,0,https://www.thespruceeats.com/50-50-gin-martin...,"['Gin Cocktails', 'American Food']",
1,1,https://www.thespruceeats.com/abbey-cocktail-r...,"['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",
2,2,https://www.thespruceeats.com/absinthe-cocktai...,"['Cocktail Basics', 'Cocktails', 'American Food']",


In [11]:
# remove col 1
seasons = seasons.drop(['Unnamed: 0'], axis=1)

In [12]:
seasons.head(3)

Unnamed: 0,url,categories,season
0,https://www.thespruceeats.com/50-50-gin-martin...,"['Gin Cocktails', 'American Food']",
1,https://www.thespruceeats.com/abbey-cocktail-r...,"['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",
2,https://www.thespruceeats.com/absinthe-cocktai...,"['Cocktail Basics', 'Cocktails', 'American Food']",


# Temperature dataset details

In [13]:
temperature.columns

Index(['Unnamed: 0', 'base_spirits', 'name', 'name_words', 'description',
       'image', 'ingredients', 'url.1', 'num_reviews', 'rating', 'categories',
       'Iced', 'Hot'],
      dtype='object')

In [14]:
temperature.head(3)

Unnamed: 0.1,Unnamed: 0,base_spirits,name,name_words,description,image,ingredients,url.1,num_reviews,rating,categories,Iced,Hot
0,0,['gin'],50-50 martini,"['50-50', 'martini']",The 50-50 martini is the mix for those who emb...,https://www.thespruceeats.com/thmb/PNxhU3pehj-...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...",https://www.thespruceeats.com/50-50-gin-martin...,24,4.4,"['Gin Cocktails', 'American Food']",False,False
1,1,['gin'],abbey cocktail,"['abbey', 'cocktail']",The Abbey Cocktail makes an excellent brunch c...,https://www.thespruceeats.com/thmb/gc7MpJ85PeA...,"['2 ounces gin', '1 1/2 ounces orange juice', ...",https://www.thespruceeats.com/abbey-cocktail-r...,14,3.9,"['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",False,False
2,2,['absinthe'],absinthe cocktail,"['absinthe', 'cocktail']",This absinthe cocktail is truly a classic cock...,https://www.thespruceeats.com/thmb/dkkcc3vR008...,"['1 ounce absinthe', '1 ounce ice water (cold)...",https://www.thespruceeats.com/absinthe-cocktai...,7,4.4,"['Cocktail Basics', 'Cocktails', 'American Food']",True,False


In [15]:
# grab only the url.1, Iced, Hot columns
temperature = pd.DataFrame(temperature, columns = ['url.1', 'Iced', 'Hot'])

#rename url.1 to url
temperature.rename(columns = {'url.1':'url'}, inplace = True)

In [16]:
temperature.head(3)

Unnamed: 0,url,Iced,Hot
0,https://www.thespruceeats.com/50-50-gin-martin...,False,False
1,https://www.thespruceeats.com/abbey-cocktail-r...,False,False
2,https://www.thespruceeats.com/absinthe-cocktai...,True,False


# Join the three datasets on url 

In [18]:
# first merge seasons with fixed
result = fixed.merge(seasons, how="inner", on="url")
# result = result.drop(columns = ['Unnamed: 0'])
result.head(3)

Unnamed: 0,base_spirits,name,url,name_words,description,image,ingredients,Length,num_reviews,rating,categories,season
0,['gin'],50-50 martini,https://www.thespruceeats.com/50-50-gin-martin...,"['50-50', 'martini']",The 50-50 martini is the mix for those who emb...,https://www.thespruceeats.com/thmb/PNxhU3pehj-...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...",3,24,4.4,"['Gin Cocktails', 'American Food']",
1,['gin'],abbey cocktail,https://www.thespruceeats.com/abbey-cocktail-r...,"['abbey', 'cocktail']",The Abbey Cocktail makes an excellent brunch c...,https://www.thespruceeats.com/thmb/gc7MpJ85PeA...,"['2 ounces gin', '1 1/2 ounces orange juice', ...",4,14,3.9,"['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",
2,['absinthe'],absinthe cocktail,https://www.thespruceeats.com/absinthe-cocktai...,"['absinthe', 'cocktail']",This absinthe cocktail is truly a classic cock...,https://www.thespruceeats.com/thmb/dkkcc3vR008...,"['1 ounce absinthe', '1 ounce ice water (cold)...",4,7,4.4,"['Cocktail Basics', 'Cocktails', 'American Food']",


In [19]:
# merge result with temperature
result = result.merge(temperature, how="inner", on="url")
result.head(3)

Unnamed: 0,base_spirits,name,url,name_words,description,image,ingredients,Length,num_reviews,rating,categories,season,Iced,Hot
0,['gin'],50-50 martini,https://www.thespruceeats.com/50-50-gin-martin...,"['50-50', 'martini']",The 50-50 martini is the mix for those who emb...,https://www.thespruceeats.com/thmb/PNxhU3pehj-...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...",3,24,4.4,"['Gin Cocktails', 'American Food']",,False,False
1,['gin'],abbey cocktail,https://www.thespruceeats.com/abbey-cocktail-r...,"['abbey', 'cocktail']",The Abbey Cocktail makes an excellent brunch c...,https://www.thespruceeats.com/thmb/gc7MpJ85PeA...,"['2 ounces gin', '1 1/2 ounces orange juice', ...",4,14,3.9,"['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",,False,False
2,['absinthe'],absinthe cocktail,https://www.thespruceeats.com/absinthe-cocktai...,"['absinthe', 'cocktail']",This absinthe cocktail is truly a classic cock...,https://www.thespruceeats.com/thmb/dkkcc3vR008...,"['1 ounce absinthe', '1 ounce ice water (cold)...",4,7,4.4,"['Cocktail Basics', 'Cocktails', 'American Food']",,True,False


In [20]:
# count how many drinks have seasons and how many don't have seasons
noseason = 0
summer = []
winter = []
spring = []
fall = []
for idx,i in enumerate(result['season']):
  if (pd.isna(i)):
    noseason+=1
  else:
    # clean up the categories column 
    x = result['categories'][idx]
    x = x[1:-1]
    x = x.replace("'", "")
    x = x.split(", ")
    if i=='summer':
      for s in x:
        summer.append(s)
    if i=='winter':
      for s in x:
        winter.append(s)
    if i=='spring':
      for s in x:
        spring.append(s)
    if i=='fall':
      for s in x:
        fall.append(s)
    
print(noseason, (len(result['season']) - noseason))
print(summer)
print(winter)
print(spring)
print(fall)

621 135
['Gin Cocktails', 'Cocktails', 'July 4th Recipes', 'July 4th Drinks', 'Rum Cocktails', 'American Food', 'Banana Recipes', 'Citrus Recipes', 'Summer Cocktails', 'Drinks & Cocktails', '"Fathers Day Recipes"', 'Cocktails', 'Whiskey Cocktails', 'BBQ Food', 'Rum Cocktails', 'American Food', 'Fruit Recipes', 'Pineapple Recipes', 'Summer Cocktails', 'Liqueurs', 'Vodka Cocktails', 'Summer Cocktails', 'July 4th Drinks', 'Vodka Cocktails', 'American Food', 'Pineapple Recipes', 'July 4th Drinks', 'Liqueurs', 'Beer', 'Citrus Recipes', 'Summer Cocktails', 'Tequila Cocktails', 'Mexican Food', 'Pineapple Recipes', 'Summer Cocktails', 'Vodka Cocktails', 'Citrus Recipes', 'Summer Cocktails', 'Rum Cocktails', 'Caribbean Food', 'Citrus Recipes', 'Pineapple Recipes', 'Summer Cocktails', 'Liqueurs', 'Rum Cocktails', 'Citrus Recipes', 'Summer Cocktails', 'Vodka Cocktails', 'American Food', 'Summer Cocktails', 'Rum Cocktails', 'July 4th Recipes', 'July 4th Drinks', 'Summer Cocktails', 'Drinks & Cockt

# One hot encoding

In [21]:
matrix = pd.DataFrame(result, columns = ['name', 'description', 'ingredients', 'categories', 'season', 'Iced', 'Hot'])
matrix.head(3)

Unnamed: 0,name,description,ingredients,categories,season,Iced,Hot
0,50-50 martini,The 50-50 martini is the mix for those who emb...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...","['Gin Cocktails', 'American Food']",,False,False
1,abbey cocktail,The Abbey Cocktail makes an excellent brunch c...,"['2 ounces gin', '1 1/2 ounces orange juice', ...","['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",,False,False
2,absinthe cocktail,This absinthe cocktail is truly a classic cock...,"['1 ounce absinthe', '1 ounce ice water (cold)...","['Cocktail Basics', 'Cocktails', 'American Food']",,True,False


In [27]:
# one-hot encode temperature - 1 if iced, 0 if not  
temp = pd.DataFrame(0, index=np.arange(len(matrix['Iced'])), columns=["iced", "hot"])
temp.head()
for idx,i in enumerate(matrix['Iced']):
  if i==True: 
    temp['iced'][idx] = 1
  else:
    temp['hot'][idx] = 1
temp.head()

Unnamed: 0,iced,hot
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1


In [28]:
# intialize tokenizer
# print(matrix['season'][101:110])
treebank_tokenizer = TreebankWordTokenizer()
tokenizer=treebank_tokenizer

In [29]:
# one-hot encode top 5 ingredients from each season
summer_ing = []
winter_ing = []
spring_ing = []
fall_ing = []

for idx,i in enumerate(matrix["season"]):
  if pd.notna(i):   # has a season
    # tokenize ingredients
    x = matrix['ingredients'][idx]
    x = x[1:-1]
    x = x.replace("'", "")
    x = tokenizer.tokenize(x)

    if i=='summer':
      for w in x:
        if w.isalpha():
          summer_ing.append(w.lower())

    if i=='winter':
      for w in x:
        if w.isalpha():
          winter_ing.append(w.lower())
    if i=='spring':
      for w in x:
        if w.isalpha():
          spring_ing.append(w.lower())

    if i=='fall':
      for w in x:
        if w.isalpha():
          fall_ing.append(w.lower())

# find the top 5 most frequent ingredients for each season, excluding stop words
summer_count = Counter(summer_ing)
# display(summer_count.most_common(15))
print('5 most common summer ingredients are: lime, orange, rum, pineapple, lemon')

winter_count = Counter(winter_ing)
# display(winter_count.most_common(15))
print('5 most common winter ingredients are: orange, cinnamon, sugar, nutmeg, hot')

spring_count = Counter(spring_ing)
# display(spring_count.most_common(20))
print('5 most common spring ingredients are: lime, cream, pepper, bitters, lemon, irish')

fall_count = Counter(fall_ing)
# display(fall_count.most_common(15))
print('5 most common fall ingredients are: cinnamon, pumpkin, apple, orange, lemon')

5 most common summer ingredients are: lime, orange, rum, pineapple, lemon
5 most common winter ingredients are: orange, cinnamon, sugar, nutmeg, hot
5 most common spring ingredients are: lime, cream, pepper, bitters, lemon, irish
5 most common fall ingredients are: cinnamon, pumpkin, apple, orange, lemon


In [60]:
# the most frequent ingredients for drinks with seasons
ingr_list = ["lime", "orange", "rum", "pineapple", "lemon", "cinnamon", "sugar", "nutmeg", "hot", "cream", "pepper", "bitters", "irish", "pumpkin", "apple"]

# one-hot encode temperature - 1 if it has the ingredient, 0 if not  
tok_ingr = []
for idx,i in enumerate(matrix['ingredients']):
    x = i
    x = x[1:-1]
    x = x.replace("'", "")
    x = tokenizer.tokenize(x)
    tok_ingr.append([w.lower() for w in x if w.isalpha()])

ingr = pd.DataFrame(0, index=np.arange(len(matrix['ingredients'])), columns=ingr_list)
ingr.head()

for i in ingr_list:
  for idx,tok_list in enumerate(tok_ingr):
    if i in tok_list:
      # print(i)
      ingr[i][idx] = 1

ingr.head(3)

Unnamed: 0,lime,orange,rum,pineapple,lemon,cinnamon,sugar,nutmeg,hot,cream,pepper,bitters,irish,pumpkin,apple
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [97]:
# one-hot encode season - 1 if it has that season, 0 if not
seas = pd.DataFrame(0, index=np.arange(len(matrix['season'])), columns=["summer", "winter", "spring", "fall"])
for idx,i in enumerate(matrix['season']):
  if pd.notna(i):
    if i=='summer':
      seas['summer'][idx] = 1
    elif i=='winter':
      seas['winter'][idx] = 1
    elif i=='spring':
      seas['spring'][idx] = 1
    elif i=='fall':
      seas['fall'][idx] = 1
seas.head()

Unnamed: 0,summer,winter,spring,fall
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [137]:
# one-hot encode top 10 words from all descriptions

# get a list of all description words
dir_words = []
for dir in matrix['description']:
  dir_tok = tokenizer.tokenize(dir.lower())
  for w in dir_tok:
    if w.isalpha:
      dir_words.append(w)

# find the 10 most common words used in all descriptions
dir_counter = Counter(dir_words)

# the most frequent ingredients for drinks with seasons
top_dir = ["simple", "easy", "sweet", "classic", "fresh", "fantastic", "refreshing", "sour", "ginger", "fruit"]

# one-hot encode description words - 1 if it has, 0 if not  
tok_dir = []
for idx,i in enumerate(matrix['description']):
    dir_tok = tokenizer.tokenize(i.lower())
    tok_dir.append([w for w in dir_tok if w.isalpha()])
tok_dir

dir = pd.DataFrame(0, index=np.arange(len(matrix['description'])), columns=top_dir)
dir.head()

for i in top_dir:
  for idx,tok_list in enumerate(tok_dir):
    if i in tok_list:
      dir[i][idx] = 1

dir.head()

Unnamed: 0,simple,easy,sweet,classic,fresh,fantastic,refreshing,sour,ginger,fruit
0,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0
4,0,1,0,1,0,0,0,0,0,0


# Create the feature matrix

In [141]:
# merge two one-hot-encoded temp df and ingr df and seas df and dir df
df = pd.concat([matrix['name'], dir, ingr, temp, seas, matrix['season']], axis=1)
df

Unnamed: 0,name,simple,easy,sweet,classic,fresh,fantastic,refreshing,sour,ginger,fruit,lime,orange,rum,pineapple,lemon,cinnamon,sugar,nutmeg,hot,cream,pepper,bitters,irish,pumpkin,apple,iced,hot.1,summer,winter,spring,fall,season
0,50-50 martini,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
1,abbey cocktail,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,
2,absinthe cocktail,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,
3,accomplice,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
4,adonis,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,yellow submarine,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
752,zesty irishman,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,
753,zombie,0,0,0,1,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,
754,zombie punch,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,fall


# K-means - new version

In [142]:
# get the X_train (drinks with season)
X_train_idx = []
X_test_idx = []

for idx,i in enumerate(df["season"]):
  if pd.notna(i): 
    X_train_idx.append(idx)
  else:
    X_test_idx.append(idx)


X_train = pd.DataFrame(df, index=X_train_idx)
X_test = pd.DataFrame(df, index=X_test_idx)

X_train.head()

Unnamed: 0,name,simple,easy,sweet,classic,fresh,fantastic,refreshing,sour,ginger,fruit,lime,orange,rum,pineapple,lemon,cinnamon,sugar,nutmeg,hot,cream,pepper,bitters,irish,pumpkin,apple,iced,hot.1,summer,winter,spring,fall,season
10,almost a collins,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,winter
12,american collins,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,summer
13,american dream,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,summer'
19,apple cider martini,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,fall
27,april rain,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,spring


In [143]:
# initiate kmeans with 4 clusters
kmeans =  KMeans(n_clusters = 4)
kmeans.fit(X_train.iloc[:,1:-1])   # don't train on the first and last columns since those are name and season

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [147]:
labels = kmeans.labels_    # 3 winter, 1 summer, 2 fall, 0 spring

In [148]:
predictions = kmeans.predict(X_test.iloc[:,1:-1])
predictions

array([1, 0, 2, 1, 2, 1, 1, 2, 3, 2, 3, 3, 3, 3, 1, 3, 0, 1, 3, 0, 1, 2,
       1, 2, 1, 1, 1, 3, 0, 3, 1, 0, 1, 2, 3, 0, 2, 1, 3, 3, 2, 1, 2, 3,
       2, 3, 2, 0, 2, 3, 2, 2, 3, 3, 3, 3, 2, 2, 0, 3, 2, 2, 2, 2, 3, 2,
       2, 2, 2, 3, 2, 2, 3, 1, 3, 3, 1, 0, 2, 3, 0, 2, 3, 1, 3, 2, 2, 2,
       1, 2, 0, 1, 1, 1, 1, 1, 3, 1, 0, 0, 2, 2, 3, 1, 2, 1, 2, 1, 3, 1,
       0, 1, 2, 0, 2, 3, 3, 0, 2, 2, 2, 2, 3, 3, 3, 2, 3, 1, 2, 0, 3, 2,
       1, 2, 3, 1, 0, 2, 1, 1, 0, 2, 2, 0, 3, 1, 2, 3, 2, 2, 3, 1, 3, 2,
       2, 1, 3, 0, 2, 2, 2, 1, 3, 0, 3, 3, 3, 2, 2, 3, 3, 2, 2, 3, 1, 2,
       3, 3, 2, 2, 1, 2, 2, 2, 1, 2, 3, 2, 0, 1, 0, 2, 2, 1, 0, 3, 3, 1,
       3, 3, 2, 3, 3, 1, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 1, 2, 1, 1,
       2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 0, 2, 2, 1, 2, 3, 3, 2, 2, 1, 0, 2,
       3, 3, 1, 0, 2, 2, 0, 2, 2, 1, 3, 2, 3, 1, 3, 2, 2, 0, 2, 3, 2, 1,
       2, 2, 3, 1, 2, 3, 0, 0, 3, 3, 3, 2, 2, 2, 3, 3, 1, 2, 0, 0, 0, 1,
       3, 3, 2, 0, 0, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1,

In [150]:
matrix.head()

Unnamed: 0,name,description,ingredients,categories,season,Iced,Hot
0,50-50 martini,The 50-50 martini is the mix for those who emb...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...","['Gin Cocktails', 'American Food']",,False,False
1,abbey cocktail,The Abbey Cocktail makes an excellent brunch c...,"['2 ounces gin', '1 1/2 ounces orange juice', ...","['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",,False,False
2,absinthe cocktail,This absinthe cocktail is truly a classic cock...,"['1 ounce absinthe', '1 ounce ice water (cold)...","['Cocktail Basics', 'Cocktails', 'American Food']",,True,False
3,accomplice,Combine a fine Champagne and a great vodka wit...,"['3 strawberries (sliced)', '1/2 ounce simple ...","['Vodka Cocktails', 'American Food', 'Berry Re...",,False,False
4,adonis,The Adonis is one of the classic cocktails tha...,"['1-ounce dry sherry', '1/2 ounce sweet vermou...","['Liqueurs', 'Cocktails', 'American Food']",,False,False


In [177]:
season_dic = {0: 'spring', 1: 'summer', 2: 'fall', 3:'winter'}
X_test_names = [i for i in X_test['name']]

predictions_labels = {}

for idx,i in enumerate(predictions):
  predictions_labels[X_test_names[idx]] = season_dic[i]

names = [i for i in matrix['name']]
names_dic = {}
for idx,name in enumerate(names):
  names_dic[name] = idx
names_dic

# add the predicted seasons back into the original matrix df
for name,season in predictions_labels.items():
  # get index 
  index = names_dic[name]
  # update seasons field in matrix with season
  # print(matrix["season"][idx])
  matrix["season"][index] = season

matrix.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,name,description,ingredients,categories,season,Iced,Hot
0,50-50 martini,The 50-50 martini is the mix for those who emb...,"['2 ounces gin', '2 ounces dry vermouth', 'Gar...","['Gin Cocktails', 'American Food']",summer,False,False
1,abbey cocktail,The Abbey Cocktail makes an excellent brunch c...,"['2 ounces gin', '1 1/2 ounces orange juice', ...","['Gin Cocktails', 'Breakfast & Brunch', 'Ameri...",spring,False,False
2,absinthe cocktail,This absinthe cocktail is truly a classic cock...,"['1 ounce absinthe', '1 ounce ice water (cold)...","['Cocktail Basics', 'Cocktails', 'American Food']",fall,True,False
3,accomplice,Combine a fine Champagne and a great vodka wit...,"['3 strawberries (sliced)', '1/2 ounce simple ...","['Vodka Cocktails', 'American Food', 'Berry Re...",summer,False,False
4,adonis,The Adonis is one of the classic cocktails tha...,"['1-ounce dry sherry', '1/2 ounce sweet vermou...","['Liqueurs', 'Cocktails', 'American Food']",fall,False,False


# Update dataset_new with predicted seasons to create a final dataset called final.csv

In [179]:
predicted_seasons_df = pd.DataFrame(matrix, columns = ['name', 'season'])
# predicted_seasons_df.to_csv('predicted_seasons.csv', index = True)

In [188]:
pred_seasons = pd.read_csv('4300/predicted_seasons.csv')
pred_seasons = pred_seasons.drop(columns = ['Unnamed: 0'])

old_df = pd.read_csv('4300/dataset_newest.csv')
old_df = old_df.drop(columns = ['Unnamed: 0'])

final_df = old_df.merge(pred_seasons, how="inner", on="name")
# final_df.to_csv('final.csv', index = True)

#K-means - old version

In [None]:
# use k-means on categories

sumW = [w.lower() for w in summer] 
winW = [w.lower() for w in winter] 
sprW = [w.lower() for w in spring] 
faW = [w.lower() for w in fall] 

print(len(sumW), len(winW), len(sprW), len(faW))

In [None]:
# create features list with top 50 words from each of the four seasons
fdist = Counter(sumW + winW + sprW + faW) # for all seasons
top50,_ = zip(*fdist.most_common(50)) # top 50 from combined set
print (top50)

In [None]:
# create M for all authors
M = []
for corp in [sumW, winW, sprW, faW]:
    for i in range(0,len(corp)-9,10):  #14, #17, #7, #14
        fdist = Counter(corp[i:i+10])
        M.append([fdist[w] for w in top50])
M = np.array(M).astype(np.float64)

In [None]:
len(M[0]), len(M)

In [None]:
def plot_format(ylab=''):
    plt.xticks(range(50), top50, rotation=270)
    plt.xlim(-1,50)
    plt.grid(axis='x', linestyle='dotted')
    plt.ylabel(ylab)
    plt.legend()

plt.figure(figsize=(12,9))

# plot of 61 lines
plt.subplot(211)
plt.plot(M[:14].T, 'C0') # summer in blue
plt.plot(M[14:31].T, 'C1') # winter in orange
plt.plot(M[31:38].T, 'C2') # spring in green
plt.plot(M[38:].T, 'C3') # fall in red

# legend
plt.plot([-1],[4],label='Summer (14)') 
plt.plot([-1],[4],label='Winter (17)')      
plt.plot([-1],[4],label='Spring (7)') 
plt.plot([-1],[4],label='Fall (14)')  
plot_format('counts per 10 word block')

# plot of averages
plt.subplot(212)
plt.plot(M[:14].mean(0), '.-', label='Summer') # summer in blue
plt.plot(M[14:31].mean(0), '.-', label='Winter') # winter in orange
plt.plot(M[31:38].mean(0), '.-', label='Spring') # spring in green
plt.plot(M[38:].mean(0), '.-', label='Fall') # fall in red
plot_format('means')

In [None]:
# initialize scalar and create M_scaled array
scaler = preprocessing.StandardScaler()
M_scaled = scaler.fit_transform(M)

In [None]:
# dimensionally reduce to 3 dimensions
pca = PCA(n_components=3)  
M_new = pca.fit_transform(M_scaled)
evr = pca.explained_variance_ratio_
print ('explained variance ratio:', evr, sum(evr)) 

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
x = []
y = []
z = []

for i in M_new:
    x.append(i[0])
    y.append(i[1])
    z.append(i[2])

In [None]:
fig=plt.figure(figsize=(6,6))
ax = fig.add_subplot(111, projection='3d', elev=30, azim=-60)

ax.scatter(x[:14], y[:14], z[:14], color='blue') # summer in blue
ax.scatter(x[14:31], y[14:31], z[14:31], color='orange') # winter in orange
ax.scatter(x[31:38], y[31:38], z[31:38], color='green') # spring in green
ax.scatter(x[38:], y[38:], z[38:], color='red') # fall in red

In [None]:
# K-means
# initialize km4 to find 4 clusters
km4=KMeans(4).fit(M_new)
means4=km4.cluster_centers_

In [None]:
means4

In [None]:
noSW = pd.DataFrame(result, columns = ['name', 'categories', 'season'])
season_idx = [] # list of indexes for drinks with seasons already 

# fix categories so they are in list form 
for idx,i in enumerate(noSW['categories']):
    if pd.isna(result['season'][idx]):   
        x = i
        x = x[1:-1]
        x = x.replace("'", "")
        x = x.split(", ")
        noSW['categories'][idx] = x
    else:
        season_idx.append(idx)
print(len(season_idx))

In [None]:
# test 1 - on drinks with no season 
# get categories of drinks with no season
noS = noSW[noSW["season"].isna()]
print(noS.shape)
noS.head()

In [None]:
noS['categories'][0]

In [None]:
noSW = []
for words in noS['categories']:
  for w in words:
    noSW.append(w.lower())

test = []

for corp in [noSW]:
    # 200 more blocks of 10 words
    for i in range(0,len(corp)-9,10):  
        fdist = Counter(corp[i:i+10])
        # print(fdist)
        test.append([fdist[w] for w in top50])
test = np.array(test).astype(np.float64)

In [None]:
len(test), test[0]

In [None]:
test = pca.transform(scaler.transform(test))
print(km4.predict(test)) 

In [None]:
noSW[:10], km4.predict(test)[:10]

In [None]:
x2 = []
y2 = []
z2 = []

for i in test:
    x2.append(i[0])
    y2.append(i[1])
    z2.append(i[2])

In [None]:
# redrawn plot with additional datapoints
fig=plt.figure(figsize=(6,6))
ax = fig.add_subplot(111, projection='3d', elev=30, azim=-60)

# original data points
ax.scatter(x[:14], y[:14], z[:14], color='blue') # summer in blue
ax.scatter(x[14:31], y[14:31], z[14:31], color='orange') # winter in orange
ax.scatter(x[31:38], y[31:38], z[31:38], color='green') # spring in green
ax.scatter(x[38:], y[38:], z[38:], color='red') # fall in red

# added data points
ax.scatter(x2[:], y2[:], z2[:], color='grey') # new in grey


In [None]:
# centroid for each season cluster
means4

In [None]:
# find the distance between each grey dot and cluster center
distance = {0: [], 1: [], 2: [], 3: []}
for idx,i in enumerate(test):
  d1 = np.sqrt( (means4[0][0]-i[0])**2 + (means4[0][1]-i[1])**2 + (means4[0][2]-i[2])**2 )  # summer centroid
  d2 = np.sqrt( (means4[1][0]-i[0])**2 + (means4[1][1]-i[1])**2 + (means4[1][2]-i[2])**2 )  # winter centroid
  d3 = np.sqrt( (means4[2][0]-i[0])**2 + (means4[2][1]-i[1])**2 + (means4[2][2]-i[2])**2 )  # spring centroid
  d4 = np.sqrt( (means4[3][0]-i[0])**2 + (means4[3][1]-i[1])**2 + (means4[3][2]-i[2])**2 )  # fall centroid
  d_score = np.array([d1,d2,d3,d4])
  min = np.min(d_score)
  label = np.where(d_score == min)[0]
  # print(label, label[0][0])
  distance[label[0]].append( (idx, min) )

print(len(distance[0]), distance[0])
print(len(distance[1]), distance[1])
print(len(distance[2]), distance[2])
print(len(distance[3]), distance[3])

In [None]:
# sort key based on min value, assign season for any with a distance less than threshold value
threshold = 1.5
filtered = {0: [], 1: [], 2: [], 3: []}

for i in distance:
  distance[i] = sorted(distance[i], key = lambda x: x[1])
  filtered[i] = list(filter(lambda ele: ele[1] <= threshold, distance[i]))

print(len(filtered[0]), filtered[0])
print(len(filtered[1]),filtered[1])
print(len(filtered[2]),filtered[2])
print(len(filtered[3]),filtered[3])

In [None]:
noS

In [None]:
for key,value in filtered.items():
  for s in value:
    print(s)
    print(test[s[0]])
    # index = s[0]
    # print(key, noS[index])