In [1]:
# Load Data
import pandas as pd
import numpy as np
import os

data_path = '../csvs/lab_data.csv'
df = pd.read_csv(data_path)

In [2]:
def choose_description(row):
    # if description is not a string, set length to zero
    try:
        length_1 = len(row['Description 1'].split())
    except AttributeError:
        length_1 = 0
    try:
        length_2 = len(row['Description 1'].split())
    except AttributeError:
        length_2 = 0
    
    # return NaN if under 10 words in longest description
    if length_1 < 10 and length_2 < 10:
        return np.nan
    
    if length_1 >= length_2:
        return row['Description 1']
    
    return row['Description 2']

df['Description'] = df.apply(choose_description, axis=1)

In [3]:
def combine_descriptions(row):
    desc_1 = row['Description 1']
    desc_2 = row['Description 2']
    
    # if description is not a string, set it to empty string
    if isinstance(desc_1, float):
        desc_1 = ''
    if isinstance(desc_2, float):
        desc_2 = ''
    
    # return combined descrtiption unless length is zero, then return NaN
    combined_desc = desc_1 + ' ' + desc_2
    return combined_desc if combined_desc != ' ' else np.nan

df['Generated Description'] = df.apply(combine_descriptions, axis=1)

In [4]:
df.head()

Unnamed: 0,Strain,Type,Percent Indica,Percent Sativa,THC Percent,Description 1,Description 2,Generated Description,Flavor,Effects,...,CBDV,CBDV-A,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs,Description
0,sugar-cane,hybrid,0.4,0.6,0.2,Sugar Cane is a rare slightly sativa dominant ...,,Sugar Cane is a rare slightly sativa dominant ...,Earthy Sweet Candy Grape Spicy Fruity Herbal P...,Body High Cerebral Creative Energizing Relaxin...,...,,,0.66,,0.02,,1.0,0.0,0.0,Sugar Cane is a rare slightly sativa dominant ...
1,chemdawg,hybrid,0.55,0.45,0.19,With a near-even balance between sativa and in...,Chemdawg has developed quite the name for itse...,With a near-even balance between sativa and in...,Earthy Pungent Chemical Diesel Pine Diesel Ear...,Cerebral Creative Euphoria Happy Relaxing Cere...,...,,,0.561875,0.315,0.069,,19.0,0.0,0.0,With a near-even balance between sativa and in...
2,jack-herer,sativa,,,0.23,Jack Herer is easily one of the best-known str...,Jack Herer is a sativa-dominant cannabis strai...,Jack Herer is easily one of the best-known str...,Earthy Sweet Spicy Herbal Lemon Pine Woody Ear...,Body High Cerebral Creative Euphoria Happy Bod...,...,0.0,0.0,0.67875,0.283824,0.046667,13.0,114.0,1.0,2.0,Jack Herer is easily one of the best-known str...
3,green-dream,hybrid,0.5,0.5,0.235,"A sativa dominant hybrid, Green Dream is a cro...",Green Dream is a sativa-dominant hybrid cross ...,"A sativa dominant hybrid, Green Dream is a cro...",Earthy Citrus Blueberry Sweet Fruity Skunky Pi...,Creative Energizing Euphoria Happy Sociable Up...,...,0.0,0.0,0.2,0.1225,0.016667,,5.0,0.0,1.0,"A sativa dominant hybrid, Green Dream is a cro..."
4,lemon-skunk,hybrid,0.4,0.6,0.185,"With THC levels that reach 22% in some tests, ...",Lemon Skunk was conceived from two separate Sk...,"With THC levels that reach 22% in some tests, ...",Earthy Citrus Sweet Sour Lemon Skunky Pungent ...,Energizing Euphoria Giggly Happy Sociable Upli...,...,0.0,0.0,0.95,0.450455,0.054054,10.0,53.0,2.0,2.0,"With THC levels that reach 22% in some tests, ..."


In [5]:
# Load Spacy Model
import spacy

nlp = spacy.load("en_core_web_md")

def tokenize_text(text):
    return nlp(text)

In [6]:
df.Flavor = df.Flavor.fillna(' ')
df.Effects = df.Effects.fillna(' ')

df['mass_text'] = df.Strain + df.Effects + df.Flavor + df['Generated Description']

df.mass_text = df.mass_text.apply(tokenize_text)
df.mass_text[0:2]

0    (sugar, -, caneBody, High, Cerebral, Creative,...
1    (chemdawgCerebral, Creative, Euphoria, Happy, ...
Name: mass_text, dtype: object

In [7]:
def get_vector_from_doc(x):
    return x.vector

df['mass_vector'] = df.mass_text.apply(get_vector_from_doc)

vectors = df.mass_vector.apply(pd.Series)

vectors.shape

(1928, 300)

In [8]:
# Create Tree
from sklearn.neighbors import KDTree

kdtree = KDTree(vectors, leaf_size=30)

In [279]:
test_string = """Originating from the Hindu Kush mountains near the Afghanistan-Pakistan border, 
Afghan Kush is super relaxing and sleep-inducing. This, too, can help you feel hungry if you’re 
experiencing a lack of appetite, and can relieve pain.""" 

input_vector = get_vector_from_doc(
    tokenize_text(test_string)
)

input_vector = input_vector.reshape(1,-1)

num_matches = 5

dist, ind = kdtree.query(input_vector, k=num_matches, return_distance=True)

In [280]:
response = df.iloc[ind[0]]
response

Unnamed: 0,Strain,Type,Percent Indica,Percent Sativa,THC Percent,Description 1,Description 2,Generated Description,Flavor,Effects,...,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs,Description,mass_text,mass_vector
299,hindu-kush,indica,,,0.27,Hindu Kush is a 100% pure indica strain that i...,Hindu Kush is a pure indica strain named after...,Hindu Kush is a 100% pure indica strain that i...,Earthy Sweet Spicy Herbal Lemon Fragrant Pine ...,Euphoria Happy Hungry Relaxing Sleepy Euphoria...,...,0.486923,0.462,0.138,17.0,28.0,1.0,2.0,Hindu Kush is a 100% pure indica strain that i...,"(hindu, -, kushEuphoria, Happy, Hungry, Relaxi...","[-0.035556253, 0.21288761, -0.18429257, -0.031..."
632,super-kush,hybrid,0.5,0.5,0.185,First-time medical marijuana patients and occa...,"When you’re looking to wind down and relax, Su...",First-time medical marijuana patients and occa...,Earthy Citrus Floral Pine Mint Earthy Minty Fl...,Euphoria Focus Happy Relaxing Euphoria Focus H...,...,0.34,0.115,0.015,,0.0,0.0,2.0,First-time medical marijuana patients and occa...,"(super, -, kushEuphoria, Focus, Happy, Relaxin...","[-0.07780653, 0.22479416, -0.13820887, -0.0758..."
937,pure-afghan,indica,,,0.12,"As its name suggests, Pure Afghan is a pure in...",The Pure Afghan is an exclusive landrace from ...,"As its name suggests, Pure Afghan is a pure in...",Earthy Sweet Spicy Pine Mint Woody Earthy Wood...,Creative Focus Happy Hungry Relaxing Sleepy Cr...,...,,,,,1.0,0.0,0.0,"As its name suggests, Pure Afghan is a pure in...","(pure, -, afghanCreative, Focus, Happy, Hungry...","[-0.103919365, 0.23234372, -0.11760093, -0.076..."
1838,tahoe-og,hybrid,0.9,0.1,0.2,Tahoe OG offers a powerful mix of heady india ...,Tahoe OG is the perfect rainy day strain. Stro...,Tahoe OG offers a powerful mix of heady india ...,Earthy Pungent Spicy Lemon Pine Woody Earthy P...,"Sleepy Sleepy Relaxed,Sleepy,Happy,Euphoric,Hu...",...,0.595341,0.319,0.046515,13.113333,82.0,3.0,9.0,Tahoe OG offers a powerful mix of heady india ...,"(tahoe, -, ogSleepy, Sleepy, Relaxed, ,, Sleep...","[-0.085191116, 0.25720388, -0.12403674, -0.060..."
96,bubba-kush,indica,,,0.27,If you're craving a sweet smoke that leaves yo...,Bubba Kush is an indica strain that has gained...,If you're craving a sweet smoke that leaves yo...,Earthy Coffee Sweet Kush Spicy Herbal Pungent ...,Body High Cerebral Energizing Euphoria Happy H...,...,0.508085,0.121765,0.054,,107.0,0.0,8.0,If you're craving a sweet smoke that leaves yo...,"(bubba, -, kushBody, High, Cerebral, Energizin...","[-0.03654497, 0.1912716, -0.152698, -0.0827364..."


In [281]:
import pickle

with open('kdtree_model.pkl', 'wb') as f:
    pickle.dump(kdtree, f)

In [286]:
recommend = np.stack((dist, ind), axis=-1)
recommend

array([[[9.13212178e-01, 2.99000000e+02],
        [9.28226000e-01, 6.32000000e+02],
        [9.30978214e-01, 9.37000000e+02],
        [9.37172563e-01, 1.83800000e+03],
        [9.46582681e-01, 9.60000000e+01]]])