In [1]:
!pip install spacy



In [2]:
"""
Preditive model for predicting best strains to match input variables for
Med-Cab-2020
"""


# IMPORTS
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
import en_core_web_sm
from spacy import load

In [3]:
df = pd.read_csv('cannabis_strain.csv')

In [4]:
# Dropping duplicated column
df = df.drop('Unnamed: 0', axis= 1)

In [5]:
df.head(3)

Unnamed: 0,strain_id,strain,type,Rating,effects,description,Flavors
0,1838,silver nina,hybrid,5.0,"creative, energetic, focused, happy, aroused",silver nina by colorado seed inc. is the stabi...,citrus
1,1834,silver calyx,hybrid,5.0,"aroused, energetic, euphoric, focused, relaxed",silver calyx by calyx garden is a balanced hyb...,"lemon, tree, fruit, pine"
2,1779,royal tree sherbet,hybrid,5.0,relaxed,sherbet by royal tree gardens is another genet...,"tropical, sweet, berry"


In [6]:
nlp= en_core_web_sm.load()

In [7]:
# tokenized function
def tokenizer(text):
    doc=nlp(text)
    return [token.lemma_ for token in doc if ((token.is_stop == False) and
    (token.is_punct == False)) and (token.pos_ != 'PRON')]

In [8]:
# builds the model
model = TfidfVectorizer(stop_words = 'english',
                        ngram_range = (1,2),
                        max_df = .95,
                        min_df = 3,
                        tokenizer = tokenizer)

In [9]:
# Fit and transform the data:
dtm = model.fit_transform(df['effects'])

# Get features:
dtm = pd.DataFrame(dtm.todense(), columns = model.get_feature_names())

nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [10]:
filename = 'model.pkl'
# Open the file to save as pkl file
model_pkl = open(filename, 'wb')
pickle.dump(nn, model_pkl)
# Close the pickle instances
model_pkl.close()

In [11]:
def train(user_input):
    '''
    Gets str input from user.
    Vectorized the input. Use pikeled model
    to get into the databse and then return the nearest 
    neighbour
    '''
    model_pkl = open(filename, 'rb')
    nn2 = pickle.load(model_pkl)
    #nn2 = load('model.joblib')
    sample = [user_input]
    vec = model.transform(sample)
    dense = vec.todense()
    similar = nn2.kneighbors(dense, return_distance=False)
    similar.T

    output = []
    for i in range(5):
        elem = similar[0][i]
        output.append(elem)


    return output[0]

In [16]:
train('i want something to help me feel happy')

2308

In [17]:
df[df['strain_id'] == 2308]

Unnamed: 0,strain_id,strain,type,Rating,effects,description,Flavors
198,2308,zellys gift,sativa,5.0,"happy, uplifted, energetic, relaxed, euphoric",zelly’s gift by dragonfly earth medicine is a ...,"sweet, citrus, lime"
