# Imports

In [18]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from category_encoders import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder

# Read in the data

In [19]:
df = pd.read_csv('psuedo_ohe.csv')
df.head()

Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,Hungry,Energetic,None.1,Aroused,Euphoric,Dry Mouth,Tingly,Type,Description,Strain
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,hybrid,$100 OG is a 50/50 hybrid strain that packs a ...,100-Og
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,hybrid,The ‘98 Aloha White Widow is an especially pot...,98-White-Widow
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,sativa,1024 is a sativa-dominant hybrid bred in Spain...,1024
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,hybrid,13 Dawgs is a hybrid of G13 and Chemdawg genet...,13-Dawgs
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,hybrid,"Also known as Kosher Tangie, 24k Gold is a 60%...",24K-Gold


In [20]:
df.isnull().sum()

Pepper          0
Spicy/Herbal    0
Pine            0
Grapefruit      0
Apricot         0
               ..
Dry Mouth       0
Tingly          0
Type            0
Description     0
Strain          0
Length: 68, dtype: int64

# Encode/Clean

In [21]:
df.Type.replace({'hybrid': 0, 'sativa': 1, 'indica':2}, inplace=True)
df.head(1)

Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,Hungry,Energetic,None.1,Aroused,Euphoric,Dry Mouth,Tingly,Type,Description,Strain
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,$100 OG is a 50/50 hybrid strain that packs a ...,100-Og


In [22]:
df.drop_duplicates('Strain', inplace=True)
df.shape

(2204, 68)

In [32]:
dtm = tfidf.fit_transform(df.Description)

In [35]:
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=.97,
    min_df=3,
)
dtm = tfidf.fit_transform(df['Description'])
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
dtm.head()

Unnamed: 0,09,10,100,11,12,13,14,15,16,17,...,yellow,yield,yielders,yielding,yields,zealand,zest,zestful,zesty,zion
0,0.0,0.0,0.442511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.567069,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Clean base df
df.drop('Description', axis=1, inplace=True)
df.head()

Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,Creative,Hungry,Energetic,None.1,Aroused,Euphoric,Dry Mouth,Tingly,Type,Strain
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,100-Og
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,98-White-Widow
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1024
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,13-Dawgs
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,24K-Gold


In [37]:
print(df.shape, dtm.shape)
final_df = pd.concat([df, dtm], axis=1)
print(final_df.shape)
final_df.head()

(2204, 67) (2204, 3461)
(2205, 3528)


Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,yellow,yield,yielders,yielding,yields,zealand,zest,zestful,zesty,zion
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
final_df.dropna(inplace=True)

In [42]:
nn = NearestNeighbors(n_neighbors=4, n_jobs=-1, algorithm='kd_tree')
nn.fit(final_df.drop('Strain', axis=1));

# What wee need for the api

In [154]:
# you need to read in the data as df in the top
# and unpickle the nn model and tfidf

In [43]:
effect = {'Aroused','Creative','Dry Mouth',
           'Energetic','Euphoric','Focused',
           'Giggly','Happy','Hungry',
           'None','Relaxed','Sleepy',
           'Talkative','Tingly','Uplifted'}

flavor = {'Ammonia','Apple','Apricot','Berry',
          'Blue','Blueberry','Butter','Cheese',
          'Chemical','Chestnut','Citrus','Coffee',
          'Diesel','Earthy','Flowery','Fruit','Grape',
          'Grapefruit','Honey','Lavender','Lemon',
          'Lime','Mango','Menthol','Mint','Minty',
          'None','Nutty','Orange','Peach','Pear','Pepper',
          'Pine','Pineapple','Plum','Pungent','Rose','Sage',
          'Skunk','Spicy/Herbal','Strawberry','Sweet','Tar',
          'Tea','Tobacco','Tree','Tropical','Vanilla','Violet','Woody'}

In [152]:
def predict(effects: list, flavors: list, type_: str, description: str):
    types = {'hybrid': 0, 'sativa': 1, 'indica':2}
    
    temp = pd.DataFrame(data=[[0] * 66],columns = ['Pepper', 'Spicy/Herbal', 'Pine', 'Grapefruit', 'Apricot', 'Peach',
                                                   'Mint', 'Tea', 'Tar', 'Cheese', 'Vanilla', 'None', 'Minty', 'Diesel',
                                                   'Woody', 'Citrus', 'Sage', 'Ammonia', 'Fruit', 'Violet', 'Skunk',
                                                   'Butter', 'Flowery', 'Blueberry', 'Rose', 'Pineapple', 'Pear', 'Lime',
                                                   'Strawberry', 'Coffee', 'Berry', 'Sweet', 'Earthy', 'Nutty', 'Blue',
                                                   'Chemical', 'Pungent', 'Orange', 'Plum', 'Tropical', 'Apple', 'Tobacco',
                                                   'Honey', 'Chestnut', 'Mango', 'Menthol', 'Lemon', 'Tree', 'Grape',
                                                   'Lavender', 'Focused', 'Giggly', 'Sleepy', 'Uplifted', 'Relaxed',
                                                   'Happy', 'Talkative', 'Creative', 'Hungry', 'Energetic', 'None.1',
                                                   'Aroused', 'Euphoric', 'Dry Mouth', 'Tingly', 'Type'])
    for i in effects:
        if i.capitalize() in effect:
            temp[i][0] = 1
    
    for i in flavors:
        if i.capitalize() in flavor:
            temp[i][0] = 1
            
    temp['Type'] = types[type_.lower()]
    dtm = pd.DataFrame(np.array(tfidf.transform(['This is a test']).todense()[0]))
    temp = pd.concat([temp, dtm], axis=1)
    neighbors = nn.kneighbors([temp.iloc[0]])[1][0]
    return {"predictions": [df.iloc[i]['Strain'] for i in neighbors]}

In [153]:
predict(['Aroused', 'Creative', 'Dry Mouth'], ['Pepper'], 'sativa', 'Make"s me hyper and aroused')

['Blueberry-Trainwreck', 'Boggle-Gum', 'Proper-Pho-Shatter-S', 'Velvet-Bud']

# Serialization

In [155]:
import pickle

In [157]:
with open('vect', 'wb') as file:
    pickle.dump(tfidf, file)
with open('nearest',  'wb') as file:
    pickle.dump(nn, file)