# Explore Cannabis Data

In [2]:
# ALL IMPORTS
import pandas as pd
import pickle
import spacy
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("en_core_web_md")

In [2]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.1/en_core_web_md-2.3.1.tar.gz (50.8 MB)
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_md')


### Import Data

In [28]:
# Read in data
df = pd.read_csv('cannabis.csv')
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


### Wrangle Data

In [4]:
def list_effects(data=df.Effects):
    """Aggregate all effects associated with Effects column of DataFrame.
    
    Args:
        data (pandas.Series): DataFrame column to break down. Default set to 'Effects'
    Returns:
        effects_list (list): list of all effects under the Effects column
    """
    effects_list = []
    
    # Split each list of effects and append each, lowered effect to list
    for i in range(0, len(df)):
        effects = data[i].split(",")
        for effect in effects:
            effects_list.append(effect.lower())
    
    return effects_list

In [5]:
# Get unique list of all effects
all_effects = set(list_effects())
all_effects

{'aroused',
 'creative',
 'dry',
 'energetic',
 'euphoric',
 'focused',
 'giggly',
 'happy',
 'hungry',
 'mouth',
 'none',
 'relaxed',
 'sleepy',
 'talkative',
 'tingly',
 'uplifted'}

In [30]:
# Remove rows with NaN description values
df = df[~df["Description"].isnull()].reset_index()

In [35]:
# Remove rows with 'none' values from description
df = df[~(df.Description == 'None')].reset_index()

### Natural Language Processing

In [9]:
def preprocessor(doc):
    doc = nlp(doc)
    
    return " ".join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])

In [36]:
vect = TfidfVectorizer(preprocessor=preprocessor)
vect.fit(df['Description'])
dtm = vect.transform(df['Description'])

In [37]:
my_desc = ["I want something that tastes fruity and is very potent. I need something that will help with pain relief and help me feel calm."]
my_desc_vect = vect.transform(my_desc)

In [38]:
### Model
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=10, n_jobs=-1)
nn.fit(dtm)



NearestNeighbors(algorithm='kd_tree', n_jobs=-1, n_neighbors=10)

In [39]:
dist, ind = nn.kneighbors(my_desc_vect)

In [40]:
dist

array([[1.26576484, 1.26750438, 1.26808735, 1.28384111, 1.28600223,
        1.28867392, 1.2946406 , 1.29605286, 1.29626243, 1.29645915]])

In [41]:
ind

array([[ 161,  708, 1638,  314, 2034, 1531,   76, 1011, 1784, 2304]],
      dtype=int64)

In [43]:
df["Description"][708]

'Dreamer’s Glass is a hybrid cannabis strain that delivers dreamy euphoric effects alongside heavy body effects that let you settle deeply in relaxation. This strain is great for patients needing potent relief of nausea and mild pain, but expect to feel more and more anchored to your couch as you increase the dose.'

In [45]:
## Pickle objects
pickle.dump(all_effects, open('../med-cabinet/static/data/effects_list.pkl', 'wb'))
pickle.dump(vect, open('../vectorizer.pkl', 'wb'))
pickle.dump(nn, open('../model.pkl', 'wb'))