In [3]:
# Load Data
import pandas as pd
import numpy as np
import os

data_path = '../csvs/lab_data.csv'
df = pd.read_csv(data_path)

In [4]:
def choose_description(row):
    # if description is not a string, set length to zero
    try:
        length_1 = len(row['Description 1'].split())
    except AttributeError:
        length_1 = 0
    try:
        length_2 = len(row['Description 1'].split())
    except AttributeError:
        length_2 = 0
    
    # return NaN if under 10 words in longest description
    if length_1 < 10 and length_2 < 10:
        return np.nan
    
    if length_1 >= length_2:
        return row['Description 1']
    
    return row['Description 2']

df['Description'] = df.apply(choose_description, axis=1)

In [5]:
def combine_descriptions(row):
    desc_1 = row['Description 1']
    desc_2 = row['Description 2']
    
    # if description is not a string, set it to empty string
    if isinstance(desc_1, float):
        desc_1 = ''
    if isinstance(desc_2, float):
        desc_2 = ''
    
    # return combined descrtiption unless length is zero, then return NaN
    combined_desc = desc_1 + ' ' + desc_2
    return combined_desc if combined_desc != ' ' else np.nan

df['Generated Description'] = df.apply(combine_descriptions, axis=1)

In [6]:
df.head()

Unnamed: 0,Strain,Type,Percent Indica,Percent Sativa,THC Percent,Description 1,Description 2,Generated Description,Flavor,Effects,...,CBDV,CBDV-A,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs,Description
0,sugar-cane,hybrid,0.4,0.6,0.2,Sugar Cane is a rare slightly sativa dominant ...,,Sugar Cane is a rare slightly sativa dominant ...,Earthy Sweet Candy Grape Spicy Fruity Herbal P...,Body High Cerebral Creative Energizing Relaxin...,...,,,0.66,,0.02,,1.0,0.0,0.0,Sugar Cane is a rare slightly sativa dominant ...
1,chemdawg,hybrid,0.55,0.45,0.19,With a near-even balance between sativa and in...,Chemdawg has developed quite the name for itse...,With a near-even balance between sativa and in...,Earthy Pungent Chemical Diesel Pine Diesel Ear...,Cerebral Creative Euphoria Happy Relaxing Cere...,...,,,0.561875,0.315,0.069,,19.0,0.0,0.0,With a near-even balance between sativa and in...
2,jack-herer,sativa,,,0.23,Jack Herer is easily one of the best-known str...,Jack Herer is a sativa-dominant cannabis strai...,Jack Herer is easily one of the best-known str...,Earthy Sweet Spicy Herbal Lemon Pine Woody Ear...,Body High Cerebral Creative Euphoria Happy Bod...,...,0.0,0.0,0.67875,0.283824,0.046667,13.0,114.0,1.0,2.0,Jack Herer is easily one of the best-known str...
3,green-dream,hybrid,0.5,0.5,0.235,"A sativa dominant hybrid, Green Dream is a cro...",Green Dream is a sativa-dominant hybrid cross ...,"A sativa dominant hybrid, Green Dream is a cro...",Earthy Citrus Blueberry Sweet Fruity Skunky Pi...,Creative Energizing Euphoria Happy Sociable Up...,...,0.0,0.0,0.2,0.1225,0.016667,,5.0,0.0,1.0,"A sativa dominant hybrid, Green Dream is a cro..."
4,lemon-skunk,hybrid,0.4,0.6,0.185,"With THC levels that reach 22% in some tests, ...",Lemon Skunk was conceived from two separate Sk...,"With THC levels that reach 22% in some tests, ...",Earthy Citrus Sweet Sour Lemon Skunky Pungent ...,Energizing Euphoria Giggly Happy Sociable Upli...,...,0.0,0.0,0.95,0.450455,0.054054,10.0,53.0,2.0,2.0,"With THC levels that reach 22% in some tests, ..."


In [7]:
# Load Spacy Model
import spacy

nlp = spacy.load("en_core_web_lg")

def tokenize_text(text):
    return nlp(text)

In [8]:
df.Flavor = df.Flavor.fillna(' ')
df.Effects = df.Effects.fillna(' ')

df['mass_text'] = df.Strain + df.Effects + df.Flavor + df['Generated Description']

df.mass_text = df.mass_text.apply(tokenize_text)
df.mass_text[0:2]

0    (sugar, -, caneBody, High, Cerebral, Creative,...
1    (chemdawgCerebral, Creative, Euphoria, Happy, ...
Name: mass_text, dtype: object

In [9]:
def get_vector_from_doc(x):
    return x.vector

df['mass_vector'] = df.mass_text.apply(get_vector_from_doc)

vectors = df.mass_vector.apply(pd.Series)

vectors.shape

(1928, 300)

In [10]:
# Create Tree
from sklearn.neighbors import KDTree

kdtree = KDTree(vectors, leaf_size=30)

In [11]:
test_string = """The strain produces a citrus sweet, often described as red grapefruit,
flavor that is tinged with just a bit of diesel. Such a rare taste delivers a powerful
high that most often energizes users and activates their minds. """ 

input_vector = get_vector_from_doc(
    tokenize_text(test_string)
)

input_vector = input_vector.reshape(1,-1)

num_matches = 5

dist, ind = kdtree.query(input_vector, k=num_matches)

In [12]:
response = df.iloc[ind[0]]
response

Unnamed: 0,Strain,Type,Percent Indica,Percent Sativa,THC Percent,Description 1,Description 2,Generated Description,Flavor,Effects,...,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs,Description,mass_text,mass_vector
1715,cookie-og,hybrid,0.5,0.5,0.2,Cookie OG is a hybrid strain containing a mixt...,,Cookie OG is a hybrid strain containing a mixt...,Sweet Blueberry Grape Fruity Herbal Berry,Euphoria Happy Relaxing Uplifting Euphoria Hap...,...,0.65,0.13,0.0,,0.0,0.0,1.0,Cookie OG is a hybrid strain containing a mixt...,"(cookie, -, ogEuphoria, Happy, Relaxing, Uplif...","[-0.062157538, 0.22732162, -0.13716613, -0.081..."
917,glass-slipper,hybrid,0.5,0.5,0.2,This sativa dominant hybrid is a genetic breed...,A cross between Cinderella 99 and Pineapple 99...,This sativa dominant hybrid is a genetic breed...,Citrus Sweet Pungent Fruity Berry Pine Sweet E...,Creative Energizing Euphoria Happy Relaxing Up...,...,0.395,,0.035,,2.0,0.0,0.0,This sativa dominant hybrid is a genetic breed...,"(glass, -, slipperCreative, Energizing, Euphor...","[-0.060501177, 0.23199397, -0.099958934, -0.06..."
1857,c4,hybrid,0.5,0.5,0.185,C4 is a hybrid strain and is a cross between S...,An indica-leaning cross between Cotton Candy a...,C4 is a hybrid strain and is a cross between S...,Citrus Sweet Fruity Lemon Cheesy Pine Earthy F...,Cerebral Creative Energizing Euphoria Focus Ha...,...,0.185,0.31,0.033333,,5.0,0.0,0.0,C4 is a hybrid strain and is a cross between S...,"(c4Cerebral, Creative, Energizing, Euphoria, F...","[-0.06593755, 0.24660586, -0.1487888, -0.08672..."
206,chernobyl,hybrid,0.2,0.8,0.19,Chernobyl is a sativa dominant strain with a 2...,Chernobyl is a sativa-dominant hybrid strain t...,Chernobyl is a sativa dominant strain with a 2...,Earthy Sweet Citrus Sour Tropical Cherry Lemon...,Cerebral Energizing Euphoria Uplifting Cerebra...,...,0.438854,0.180263,0.065135,11.44,107.0,4.0,5.0,Chernobyl is a sativa dominant strain with a 2...,"(chernobylCerebral, Energizing, Euphoria, Upli...","[-0.046459015, 0.23314486, -0.11908292, -0.066..."
588,lemon-lime-kush,hybrid,0.5,0.5,,"Rated as second grade reefer by users, the Lem...",,"Rated as second grade reefer by users, the Lem...",Spicy Lemon Citrus Lime,Euphoria Focus Happy Euphoria Focus Happy,...,0.41,,0.01,,1.0,0.0,0.0,"Rated as second grade reefer by users, the Lem...","(lemon, -, lime, -, kushEuphoria, Focus, Happy...","[-0.049964868, 0.195065, -0.12459607, -0.08451..."


In [13]:
import pickle

with open('kdtree_model.pkl', 'wb') as f:
    pickle.dump(kdtree, f)