# Explore Cannabis Data

In [1]:
# Standard Library Imports
import pickle

# Third-Party Imports
import spacy
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download spaCy model
# !python -m spacy download en_core_web_md

Collecting en-core-web-md==3.0.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [33]:
# Load spaCy model
nlp = spacy.load('en_core_web_md')

### Import Data

In [2]:
## Read in data from local csv
# Location if not present: https://www.kaggle.com/kingburrito666/cannabis-strains
df = pd.read_csv('cannabis.csv')
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


### Wrangle Data

In [19]:
def list_effects(data=df.Effects):
    """Aggregate all unique effects associated with Effects column of DataFrame.
    
    Args:
        data (pandas.Series): DataFrame column to break down. Default set to 'Effects'
    Returns:
        effects_list (list): list of all effects under the Effects column
    """
    effects_list = []
    
    # Split each list of effects and append each, lowered effect to list
    for i in range(0, len(df)):
        effects = data[i].split(",")
        for effect in effects:
            effects_list.append(effect.lower())
    
    return set(effects_list)

In [21]:
# Get unique list of all effects
all_effects = list_effects()
all_effects

{'aroused',
 'creative',
 'dry',
 'energetic',
 'euphoric',
 'focused',
 'giggly',
 'happy',
 'hungry',
 'mouth',
 'none',
 'relaxed',
 'sleepy',
 'talkative',
 'tingly',
 'uplifted'}

In [22]:
# Remove some unnecessary effects
bad_effects = ['dry', 'mouth', 'aroused', 'none']
for effect in bad_effects:
    all_effects.remove(effect)

In [27]:
# Capitalize each effect for future presentation
effects_list = []

for effect in all_effects:
    effects_list.append(effect.capitalize())

['Euphoric',
 'Focused',
 'Uplifted',
 'Tingly',
 'Energetic',
 'Creative',
 'Relaxed',
 'Happy',
 'Hungry',
 'Talkative',
 'Sleepy',
 'Giggly']

In [38]:
# Remove rows with NaN description values
df = df[~df['Description'].isnull()].reset_index()

In [39]:
# Remove rows with 'none' description values
df = df[~(df.Description == 'None')].reset_index()

### Natural Language Processing

In [40]:
def preprocessor(doc):
    """Preprocess input text data using spaCy functionality.

    Args:
        doc (list): List of input data to be processed
    Returns:
        new_text (str): New processed document
    """
    doc = nlp(doc)
    new_text = " ".join([token.lemma_.lower() for token in doc if not
                         token.is_stop and not token.is_punct])

    return new_text

# Apply preprocess to data and save to new column
df['Processed'] = df['Description'].apply(preprocessor)

In [60]:
# Instantiate vectorizer and fit it with processed data
vect = TfidfVectorizer(ngram_range=(1, 1))
vect.fit(df['Processed'])

# Transform processed data for fitting of nn model
dtm = vect.transform(df['Processed'])

In [61]:
# Instatiate model and fit it with vectorized data
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=10, n_jobs=-1)
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', n_jobs=-1, n_neighbors=10)

In [62]:
# Create fake description to test model performance
my_desc = "I want something that tastes fruity and is very potent. I need something that will help with pain relief and help me feel calm."

# Process and vectorize fake description
my_desc = preprocessor(my_desc)
my_desc_vect = vect.transform([my_desc])

In [63]:
# Find nearest neighbors of fake description
dist, ind = nn.kneighbors(my_desc_vect)

In [64]:
# List out neighbor distances
dist

array([[1.26711912, 1.26878204, 1.27096056, 1.28309896, 1.28430284,
        1.29239124, 1.29313372, 1.29376705, 1.29490009, 1.30055682]])

In [65]:
# List out neighbor indices
ind

array([[1638,  161,  708,  314, 2034,   76, 1531, 1784, 1011, 2304]],
      dtype=int64)

In [66]:
# Compare model results with fake description
df["Description"][1638]

'When it comes to knocking out pain, no medical strain hits the target quite like Purple Arrow. This hybrid provides effective relief for severe pain while simultaneously inducing a sense of uplift and euphoria. Extremely well-rounded, Purple Arrow is potent without causing that over-medicated feeling of some pain relief strains. The uniqueness of this strain is complemented by its earthy aroma. Fragrant, herbal, and a little sweet, this strain tastes almost as good as it feels. When you need immediate relief and would like to stay off the couch, Purple Arrow is a fantastic choice.'

In [29]:
## Pickle objects
pickle.dump(effects_list, open('../med-cabinet/static/data/effects_list.pkl', 'wb'))
pickle.dump(vect, open('../vectorizer.pkl', 'wb'))
pickle.dump(nn, open('../model.pkl', 'wb'))