In [1]:
# Load Data
import pandas as pd
import numpy as np
import os

data_path = '../csvs/lab_data_final.csv'
df = pd.read_csv(data_path)

In [2]:
def choose_description(row):
    # if description is not a string, set length to zero
    try:
        length_1 = len(row['Description 1'].split())
    except AttributeError:
        length_1 = 0
    try:
        length_2 = len(row['Description 1'].split())
    except AttributeError:
        length_2 = 0
    
    # return NaN if under 10 words in longest description
    if length_1 < 10 and length_2 < 10:
        return np.nan
    
    if length_1 >= length_2:
        return row['Description 1']
    
    return row['Description 2']

df['Description'] = df.apply(choose_description, axis=1)

In [3]:
def combine_descriptions(row):
    desc_1 = row['Description 1']
    desc_2 = row['Description 2']
    
    # if description is not a string, set it to empty string
    if isinstance(desc_1, float):
        desc_1 = ''
    if isinstance(desc_2, float):
        desc_2 = ''
    
    # return combined descrtiption unless length is zero, then return NaN
    combined_desc = desc_1 + ' ' + desc_2
    return combined_desc if combined_desc != ' ' else np.nan

df['Generated Description'] = df.apply(combine_descriptions, axis=1)

In [4]:
df.head()

Unnamed: 0,id,strain,effect,medical_effect,medical_effect_plain,flavor,terpene,Type,Percent Indica,Percent Sativa,...,CBDV,CBDV-A,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs,Description
0,1,sugar-cane,"Body High, Cerebral, Creative, Energetic, Rela...","Analgesic, Antibacterial, Anti inflammatory","Pain relief, Inhibits bacteria, Reduces inflam...","Candy, Earthy, Fruity, Grape, Herbal, Pungent,...","fruity, sweet, earthy, spicy, other",hybrid,0.4,0.6,...,,,0.66,,0.02,,1.0,0.0,0.0,Sugar Cane is a rare slightly sativa dominant ...
1,2,chemdawg,"Cerebral, Creative, Euphoric, Happy, Relaxed, ...","Analgesic, Antibacterial, Antidepressant, Anti...","Pain relief, Inhibits bacteria, Reduces depres...","Chemical, Diesel, Earthy, Pine, Pungent","chemical, earthy, other",hybrid,0.55,0.45,...,,,0.561875,0.315,0.069,,19.0,0.0,0.0,With a near-even balance between sativa and in...
2,3,jack-herer,"Body High, Cerebral, Creative, Energetic, Euph...","Analgesic, Antibacterial, Antidepressant, Anti...","Pain relief, Inhibits bacteria, Reduces depres...","Earthy, Herbal, Lemon, Pine, Spicy, Sweet, Woody","citrus, sweet, earthy, spicy",sativa,,,...,0.0,0.0,0.67875,0.283824,0.046667,13.0,114.0,1.0,2.0,Jack Herer is easily one of the best-known str...
3,4,green-dream,"Creative, Energetic, Euphoric, Happy, Hungry, ...","Analgesic, Antiepileptic, Anti inflammatory, A...","Pain relief, Antiepileptic, Reduces inflammati...","Blueberry, Citrus, Earthy, Floral, Fruity, Pin...","fruity, berry, sweet, earthy, other",hybrid,0.5,0.5,...,0.0,0.0,0.2,0.1225,0.016667,,5.0,0.0,1.0,"A sativa dominant hybrid, Green Dream is a cro..."
4,5,lemon-skunk,"Energetic, Euphoric, Focused, Giggly, Happy, R...","Analgesic, Antibacterial, Antidepressant, Anti...","Pain relief, Inhibits bacteria, Reduces depres...","Citrus, Earthy, Lemon, Pungent, Skunk, Sour, S...","citrus, sweet, earthy, other",hybrid,0.4,0.6,...,0.0,0.0,0.95,0.450455,0.054054,10.0,53.0,2.0,2.0,"With THC levels that reach 22% in some tests, ..."


In [5]:
df.columns

Index(['id', 'strain', 'effect', 'medical_effect', 'medical_effect_plain',
       'flavor', 'terpene', 'Type', 'Percent Indica', 'Percent Sativa',
       'THC Percent', 'Description 1', 'Description 2',
       'Generated Description', 'Rating', 'labs', 'cis-Nerolidol',
       'trans-Nerolidol', 'trans-Nerolidol 1', 'trans-Nerolidol 2',
       'trans-Ocimene', '3-Carene', 'Camphene', 'Caryophyllene Oxide',
       'Eucalyptol', 'Geraniol', 'Guaiol', 'Isopulegol', 'Linalool', 'Ocimene',
       'Terpinolene', 'alpha-Bisabolol', 'alpha-Humulene', 'alpha-Pinene',
       'alpha-Terpinene', 'beta-Caryophyllene', 'beta-Myrcene', 'beta-Ocimene',
       'beta-Pinene', 'delta-Limonene', 'gamma-Terpinene', 'p-Cymene',
       'delta-9 THC-A', 'delta-9 THC', 'delta-8 THC', 'THC-A', 'THCV', 'CBN',
       'CBD-A', 'CBD', 'CBDV', 'CBDV-A', 'delta-9 CBG-A', 'delta-9 CBG', 'CBC',
       'Moisture Content', 'ana360', 'psilabs', 'sclabs', 'Description'],
      dtype='object')

In [6]:
# Load Spacy Model
import spacy

nlp = spacy.load("en_core_web_sm")

def tokenize_text(text):
    return nlp(text)

In [7]:
df.flavor = df.flavor.fillna(' ')
df.effect = df.effect.fillna(' ')
df.medical_effect = df.medical_effect.fillna(' ')
df.medical_effect_plain = df.medical_effect_plain.fillna(' ')

def strip_comma(x):
    return x.strip(',')

df.flavor = df.flavor.apply(strip_comma)
df.effect = df.effect.apply(strip_comma)
df.medical_effect = df.medical_effect.apply(strip_comma)
df.medical_effect_plain = df.medical_effect_plain.apply(strip_comma)

df['mass_text'] = (df.strain + df.effect + df.flavor + df.Type + 
                   df.medical_effect + df.medical_effect_plain + 
                   df['Generated Description'])

df.mass_text = df.mass_text.apply(tokenize_text)
df.mass_text[0:2]

0    (sugar, -, caneBody, High, ,, Cerebral, ,, Cre...
1    (chemdawgCerebral, ,, Creative, ,, Euphoric, ,...
Name: mass_text, dtype: object

In [8]:
def get_vector_from_doc(x):
    return x.vector

df['mass_vector'] = df.mass_text.apply(get_vector_from_doc)

vectors = df.mass_vector.apply(pd.Series)

vectors.shape

(1928, 96)

In [9]:
# Create Tree
from sklearn.neighbors import KDTree

kdtree = KDTree(vectors, leaf_size=30, metric='euclidean')

In [18]:
kdtree.valid_metrics

['euclidean',
 'l2',
 'minkowski',
 'p',
 'manhattan',
 'cityblock',
 'l1',
 'chebyshev',
 'infinity']

In [14]:
test_string = """Originating from the Hindu Kush mountains near the Afghanistan-Pakistan border, 
Afghan Kush is super relaxing and sleep-inducing. This, too, can help you feel hungry if you’re 
experiencing a lack of appetite, and can relieve pain.""" 

input_vector = get_vector_from_doc(
    tokenize_text(test_string)
)

input_vector = input_vector.reshape(1,-1)

num_matches = 5

dist, ind = kdtree.query(input_vector, k=num_matches)

In [15]:
response = df.iloc[ind[0]]
response

Unnamed: 0,id,strain,effect,medical_effect,medical_effect_plain,flavor,terpene,Type,Percent Indica,Percent Sativa,...,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs,Description,mass_text,mass_vector
658,659,yogi-diesel,"Creative, Energetic, Euphoric, Focused, Happy,...","Antidepressant, Antifungal, Anti inflammatory,...","Reduces depression, Inhibits fungal growth, Re...","Chemical, Citrus, Diesel, Earthy, Herbal, Lemo...","chemical, citrus, earthy, minty, spicy",hybrid,0.1,0.9,...,,0.19,,,1.0,0.0,0.0,The mix of Sour Diesel and Northern Lights #5 ...,"(yogi, -, dieselCreative, ,, Energetic, ,, Eup...","[0.016228417, 0.627516, -0.46759322, -0.319592..."
347,348,burmese-kush,"Creative, Energetic, Euphoric, Focused, Happy,...","Analgesic, Anti inflammatory, Anti insomnia, A...","Pain relief, Reduces inflammation, Aids sleep,...","Berry, Earthy, Grapefruit, Herbal, Lemon, Pine...","fruity, berry, citrus, sweet, earthy, spicy, o...",hybrid,0.5,0.5,...,0.393333,0.184,0.025,,13.0,0.0,1.0,Burmese Kush is a hybrid strain with an incred...,"(burmese, -, kushCreative, ,, Energetic, ,, Eu...","[0.25065, 0.8385978, -0.45358747, -0.5841868, ..."
47,48,dr-who,"Body High, Creative, Euphoric, Happy, Relaxed,...","Analgesic, Antiepileptic, Antifungal, Anti inf...","Pain relief, Antiepileptic, Inhibits fungal gr...","Berry, Citrus, Earthy, Fruity, Grape, Pineappl...","fruity, berry, tropical, sweet, earthy, other",hybrid,0.6,0.4,...,0.475714,0.11,0.054444,,14.0,0.0,0.0,Dr. Who is mostly offered in the form of small...,"(dr, -, whoBody, High, ,, Creative, ,, Euphori...","[0.298657, 0.6828496, -0.5641302, -0.50520897,..."
1700,1701,platinum-bubba-kush,"Creative, Euphoric, Happy, Hungry, Relaxed, Sl...","Analgesic, Antiepileptic, Antifungal, Anti inf...","Pain relief, Antiepileptic, Inhibits fungal gr...","Earthy, Floral, Kush, Pungent, Spicy, Sweet, W...","sweet, earthy, flowery, spicy, other",hybrid,0.8,0.2,...,0.27,0.11,0.06,,5.0,0.0,0.0,"An intense and powerful indica strain, Platinu...","(platinum, -, bubba, -, kushCreative, ,, Eupho...","[0.24639967, 0.37602234, -0.6537138, -0.253788..."
1854,1855,lemon-drop,"Energetic, Euphoric, Focused, Giggly, Happy, H...","Analgesic, Antidepressant, Antiepileptic, Anti...","Pain relief, Reduces depression, Antiepileptic...","Citrus, Dank, Earthy, Grapefruit, Lemon, Pine,...","fruity, tropical, citrus, sweet, earthy, other",hybrid,0.4,0.6,...,0.58,0.23,0.085,,3.0,0.0,0.0,Lemon Drop is a sativa dominant strain with a ...,"(lemon, -, dropEnergetic, ,, Euphoric, ,, Focu...","[0.23421429, 0.8101261, -0.7043219, -0.5070593..."


In [16]:
import pickle

with open('kdtree_model_1.2.pkl', 'wb') as f:
    pickle.dump(kdtree, f)

In [17]:
recommend = np.stack((dist, ind), axis=-1)
recommend

array([[[   2.68888238,  658.        ],
        [   2.7022204 ,  347.        ],
        [   2.70237595,   47.        ],
        [   2.70481701, 1700.        ],
        [   2.70938608, 1854.        ]]])