### Laymen Nearest Neighbors model (knn01_model)

Simple nearest-neighbors model that matches a user's preferred strain type, effects, and flavors to the most similar cannabis strains in the dataset.

In [2]:
#> Import statements
import pandas as pd
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pickle

In [3]:
# Read the raw strain table from the working directory.
raw_csv_path = "cannabis_raw.csv"
data = pd.read_csv(raw_csv_path)

# Report (rows, columns) and then preview the first rows.
print(data.shape)
data.head()

(2351, 6)


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
#> Cleaning data features and observations
# Map the raw column headers to snake_case names used by the rest of the notebook.
column_renames = {
    'Strain': 'strain_name',
    'Type': 'strain_type',
    'Rating': 'strain_rating',
    'Effects': 'effect_profile',
    'Flavor': 'flavor_profile',
    'Description': 'strain_description',
}
data = data.rename(columns=column_renames)

In [5]:
# Normalise the casing of the strain type labels (first letter upper-case, rest lower-case).
data = data.assign(strain_type=data['strain_type'].str.capitalize())

# Show the distinct strain types that remain after normalisation.
data['strain_type'].unique()

array(['Hybrid', 'Sativa', 'Indica'], dtype=object)

In [6]:
# List the distinct rating values in order of first appearance.
# NOTE(review): a rating of 0 shows up among the values; presumably it marks unrated strains -- confirm.
pd.unique(data['strain_rating'])

array([4. , 4.7, 4.4, 4.2, 4.6, 0. , 4.5, 4.3, 5. , 3.8, 4.1, 4.8, 3.4,
       3.7, 3.9, 3. , 4.9, 3.6, 2.8, 3.3, 3.5, 3.2, 2. , 1. , 3.1, 2.5])

In [7]:
# Work on a copy, discard every row that has a missing value, and renumber
# the remaining rows from 0 so row labels match row positions.
data = data.copy().dropna().reset_index(drop=True)

# Persist the cleaned table.
# NOTE(review): the output name has no ".csv" extension and the index is
# written as an extra column (to_csv default) -- confirm that is what the
# consumer of this file expects.
data.to_csv('cannabis_dropna')

In [8]:
# (rows, columns) of the table after rows with missing values were dropped.
data.shape

(2277, 6)

In [9]:
# Create a master profile feature: one comma-separated string per strain,
# made of its type, its effects and its flavors (in that order).
data['strain_profile'] = data['strain_type'].str.cat(
    [data['effect_profile'], data['flavor_profile']],
    sep=',',
)

In [10]:
# Vectorizer object
# Build a blank English spaCy pipeline and a Tokenizer on its vocabulary.
# NOTE(review): `tokenizer` is not passed to the TfidfVectorizer below, so in
# the code visible here it has no effect on the features -- confirm whether it
# is still needed.
nlp=English()
tokenizer = Tokenizer(nlp.vocab)

# TF-IDF vectorizer using scikit-learn's built-in English stop-word list and
# its default tokenization.
tf = TfidfVectorizer(stop_words='english')

In [11]:
# Create a document-term matrix: one row per strain, one TF-IDF column per
# vocabulary term found in the strain profiles.
tfidf_matrix = tf.fit_transform(data['strain_profile'].values.astype('U'))

# get_feature_names() was deprecated and later removed from scikit-learn;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old name otherwise so the cell runs on both.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# toarray() yields a plain ndarray (todense() yields the legacy np.matrix type).
dtm = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
dtm.head()

Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,...,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.47909,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.358413,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.691452,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.161282,0.0,0.0,0.357784
3,0.0,0.0,0.650341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.290334,0.0,0.0,0.0,0.143473,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.211369,0.0,0.0,0.0


In [12]:
# Nearest-neighbour index over the TF-IDF rows: return 5 neighbours per query,
# searched with a ball tree.
knn_params = {'n_neighbors': 5, 'algorithm': 'ball_tree'}
nn = NearestNeighbors(**knn_params)
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [13]:
#> Testing model
# The matrix columns are the vocabulary terms a user query can be built from.
profile_list = dtm.columns.tolist()
profile_list_array = np.array(profile_list)
profile_list_array

array(['ammonia', 'apple', 'apricot', 'aroused', 'berry', 'blue',
       'blueberry', 'butter', 'cheese', 'chemical', 'chestnut', 'citrus',
       'coffee', 'creative', 'diesel', 'dry', 'earthy', 'energetic',
       'euphoric', 'flowery', 'focused', 'fruit', 'giggly', 'grape',
       'grapefruit', 'happy', 'herbal', 'honey', 'hungry', 'hybrid',
       'indica', 'lavender', 'lemon', 'lime', 'mango', 'menthol', 'mint',
       'minty', 'mouth', 'nutty', 'orange', 'peach', 'pear', 'pepper',
       'pine', 'pineapple', 'plum', 'pungent', 'relaxed', 'rose', 'sage',
       'sativa', 'skunk', 'sleepy', 'spicy', 'strawberry', 'sweet',
       'talkative', 'tar', 'tea', 'tingly', 'tobacco', 'tree', 'tropical',
       'uplifted', 'vanilla', 'violet', 'woody'], dtype='<U10')

In [28]:
# Example query: a single free-text profile made of vocabulary terms.
user01_input = ['peach, pear, aroused, sativa, indica']

# Project the query into the fitted TF-IDF space (result is a sparse matrix).
user01_dense = tf.transform(user01_input)

# Query with a DataFrame that carries the same columns the model was fitted
# on. toarray() gives a plain ndarray; the np.matrix returned by todense() is
# rejected by recent scikit-learn releases. Only the neighbour indices are
# needed, so distances are not requested (this also avoids rebinding `_`).
user01_query = pd.DataFrame(user01_dense.toarray(), columns=dtm.columns)
user01_output = nn.kneighbors(user01_query, return_distance=False)

In [33]:
# Indices of the nearest neighbours returned by nn.kneighbors for the query
# (one row per query, n_neighbors entries per row).
user01_output

array([[1693, 2017, 2189, 1362,   34]], dtype=int64)

In [16]:
# kneighbors returns positional row indices into the fitted matrix, so flatten
# them into one list and look the strains up by position (iloc) rather than by
# label; this stays correct even if the frame's index is not 0..n-1.
list_strains = list(user01_output.ravel())

for position in list_strains:
    print(f"{data.iloc[position]}\n")

strain_name                                                      Rafael
strain_type                                                      Sativa
strain_rating                                                         5
effect_profile                                                     None
flavor_profile                                                    Peach
strain_description    Rafael (or Raphael) is a sativa-dominant strai...
strain_profile                                        Sativa,None,Peach
Name: 1693, dtype: object

strain_name                                              Whitaker-Blues
strain_type                                                      Indica
strain_rating                                                       4.2
effect_profile                   Relaxed,Sleepy,Happy,Uplifted,Euphoric
flavor_profile                                         Sweet,Apple,Pear
strain_description    Whitaker Blues is a classic indica-dominant st...
strain_profile        Indica,Relaxed,

In [18]:
# Pickle the dtm and tf for web use.
# Context managers guarantee each file is flushed and closed, even if
# pickling raises part-way through.
with open('knn01_dtm.pkl', 'wb') as dtm_file:
    pickle.dump(dtm, dtm_file)

with open('knn01_tf.pkl', 'wb') as tf_file:
    pickle.dump(tf, tf_file)