# Explore Cannabis Data

In [2]:
# ALL IMPORTS
import pandas as pd
import pickle
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import cloudpickle

nlp = spacy.load("en_core_web_md")

In [3]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.0.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


### Import Data

In [3]:
# Read in data
df = pd.read_csv('cannabis.csv')
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


### Wrangle Data

In [4]:
def list_effects(data=df.Effects):
    """Aggregate all effects associated with Effects column of DataFrame.
    
    Args:
        data (pandas.Series): DataFrame column to break down. Default set to 'Effects'
    Returns:
        effects_list (list): list of all effects under the Effects column
    """
    effects_list = []
    
    # Split each list of effects and append each, lowered effect to list
    for i in range(0, len(df)):
        effects = data[i].split(",")
        for effect in effects:
            effects_list.append(effect.lower())
    
    return effects_list

In [5]:
# Get unique list of all effects
all_effects = set(list_effects())
all_effects

{'aroused',
 'creative',
 'dry',
 'energetic',
 'euphoric',
 'focused',
 'giggly',
 'happy',
 'hungry',
 'mouth',
 'none',
 'relaxed',
 'sleepy',
 'talkative',
 'tingly',
 'uplifted'}

In [6]:
# Remove rows with NaN description values
df = df[~df["Description"].isnull()].reset_index()

In [7]:
# Remove rows with 'none' values from description
df = df[~(df.Description == 'None')].reset_index()

### Natural Language Processing

In [8]:
def preprocessor(doc):
    doc = nlp(doc)
    
    return " ".join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])

df["processed"] = df["Description"].apply(preprocessor)

In [17]:
df["processed"]

0       $ 100 og 50/50 hybrid strain pack strong punch...
1       98 aloha white widow especially potent cut whi...
2       1024 sativa dominant hybrid breed spain medica...
3       13 dawgs hybrid g13 chemdawg genetic breed can...
4       know kosher tangie 24k gold 60 indica dominant...
                              ...                        
2309    zeus og   hybrid cross pineapple og deadhead o...
2310    zkittlez indica dominant mix grape ape grapefr...
2311    zombie kush ripper seeds come different kush g...
2312    look transform flesh eat monster zombie og cho...
2313    zoom pie know zombie pie heavy indica dominant...
Name: processed, Length: 2314, dtype: object

In [9]:
vect = TfidfVectorizer()
vect.fit(df['processed'])
dtm = vect.transform(df['processed'])

In [10]:
my_desc = ["I want something that tastes fruity and is very potent. I need something that will help with pain relief and help me feel calm."]
my_desc_vect = vect.transform(my_desc)

In [11]:
### Model
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=10, n_jobs=-1)
nn.fit(dtm)



NearestNeighbors(algorithm='kd_tree', n_jobs=-1, n_neighbors=10)

In [12]:
my_desc_vect

<1x7097 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [13]:
dist, ind = nn.kneighbors(my_desc_vect)

In [14]:
dist

array([[1.24141098, 1.29306533, 1.3192707 , 1.32066469, 1.3281847 ,
        1.3284485 , 1.3349045 , 1.33531259, 1.33604288, 1.33969241]])

In [15]:
ind

array([[ 300,  761,  161,  708, 1638,  314, 1531, 1784, 1011, 2304]],
      dtype=int64)

In [16]:
df["Description"][300]

'The fusion of Blueberry\xa0and Northern Lights, Blue Lights is\xa0an indica\xa0strain with a noteworthy THC content. This flower emits a blueberry pungency thanks to the Blueberry\xa0influence, and provides a combination of sedative and euphoric long-lasting effects. Although fitting for day or nighttime use, this strain may be the perfect choice for end of the day stress and/or pain relief.'

In [18]:
## Pickle objects
# preprocessor.__module__ = "predict"
# dill.dump(processor, open('../processor.pkl', 'wb'))
# pickle.dump(all_effects, open('../med-cabinet/static/data/effects_list.pkl', 'wb'))
pickle.dump(vect, open('../vectorizer.pkl', 'wb'))
pickle.dump(nn, open('../model.pkl', 'wb'))

In [15]:
import dill