In [4]:
import pandas as pd

In [6]:
# Load the strain dataset; the path is relative to the notebook's working directory
df = pd.read_csv('../csv/cannabis.csv')

In [7]:
df.head()

Unnamed: 0,id,name,medical,positive,negative,flavor,type,rating,description
0,0,Afpak,"Depression,Insomnia,Pain,Stress,Lack of Appetite","Relaxed,Creative,Focused,Sleepy,Happy",Dizzy,"Pine,Spicy/Herbal,Earthy",hybrid,4.2,"Afpak, named for its direct Afghani and Pakist..."
1,1,African,"Depression,Pain,Stress,Lack of Appetite,Nausea...","Euphoric,Energetic,Aroused,Tingly,Creative",Dry Mouth,"Spicy/Herbal,Pungent,Pepper",sativa,3.9,African refers to the indigenous varieties of ...
2,2,Afternoon-Delight,"Depression,Insomnia,Pain,Stress,Cramps,Headache","Talkative,Relaxed,Uplifted,Tingly,Creative","Dizzy,Dry Mouth,Paranoid","Pungent,Citrus,Tropical",hybrid,4.8,"Afternoon Delight, created by Colorado Seed In..."
3,3,Afwreck,"Pain,Stress,Headache,Fatigue,Headaches,Muscle ...","Euphoric,Happy,Uplifted,Relaxed,Sleepy","Dizzy,Dry Mouth,Paranoid,Dry Eyes","Earthy,Pine,Pungent",hybrid,4.2,Afwreck is a hybrid cross of Afghani and Train...
4,4,Agent-Orange,"Depression,Pain,Stress,Nausea,Headache,Headaches","Happy,Uplifted,Relaxed,Energetic,Euphoric","Dizzy,Dry Mouth,Paranoid,Dry Eyes","Citrus,Orange,Earthy",hybrid,4.2,Don’t let the name scare you! The only herbici...


In [8]:
df.shape

(2802, 9)

In [9]:
# Count rows whose description is missing
df['description'].isnull().sum()

484

In [10]:
# Drop incomplete rows, then renumber so index labels match row positions.
# text_list and the row numbers returned by kneighbors are positional; without
# the reset, df.loc[i] would refer to a different strain than text_list[i].
# NOTE(review): dropna() with no `subset` removes rows missing *any* column, not
# just `description` -- confirm that is intended, since only descriptions are modelled.
df = df.dropna().reset_index(drop=True)

In [11]:
# Collect the description strings into a plain Python list
text_list = list(df['description'])
len(text_list)

1198

In [12]:
text_list[0]

'Afpak, named for its direct Afghani and Pakistani landrace heritage, is a beautiful indica-dominant hybrid with light green and deep bluish purple leaves. The taste and aroma are floral with a touch of lemon, making the inhale light and smooth. Its effects start in the stomach by activating the appetite. There is also a potent relaxation that starts in the head and face, and gradually sinks down into the body. Enjoy this strain if you’re suffering from stress, mild physical discomfort, or having difficulty eating. \xa0'

In [13]:
import spacy
import re

# Define tokenization function
def tokenize(doc):
    """Split a description into lowercase alphanumeric tokens.

    Whitespace of any kind (tabs, newlines, non-breaking spaces U+00A0, ...)
    is first normalised to a plain space so that neighbouring words stay
    separate. Every remaining character other than an ASCII letter, digit or
    space is then deleted (so "you're" becomes "youre"), and the text is
    lowercased and split on spaces.

    Args:
        doc: The raw text to tokenize.

    Returns:
        A list of lowercase token strings; empty when ``doc`` contains no
        letters or digits.
    """
    # Non-space whitespace used to be deleted outright by the character
    # filter below, fusing the words on either side; map it to a space first.
    spaced = re.sub(r'\s+', ' ', doc)
    cleaned = re.sub('[^a-zA-Z 0-9]', '', spaced)
    return cleaned.lower().split()

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [15]:
# Instantiate vectorizer object:
#   - unigrams and bigrams, English stop words removed
#   - min_df=7   -> ignore terms found in fewer than 7 descriptions
#   - max_df=0.6 -> ignore terms found in more than 60% of descriptions
tfidf = TfidfVectorizer(stop_words='english',
                        tokenizer=tokenize,
                        ngram_range=(1, 2),
                        min_df=7,
                        max_df=0.6)

# Create vocab and tf-idf scores (sparse matrix, one row per description)
dtm_sparse = tfidf.fit_transform(text_list)

# Get feature names to use as df column headers.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() gives a plain ndarray (todense() returns the legacy np.matrix type)
dtm = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)

# Show feature matrix as dataframe
print(dtm.shape)
dtm.head()

(1198, 1637)


Unnamed: 0,1,10,10 weeks,11,11 weeks,12,13,14,15,1980s,...,world,worth,wreck,x,years,yield,yields,youre,youre looking,zesty
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176862,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.223492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Pickle dtm

In [None]:
# import pickle

# with open('nlp_dtm.pkl', 'wb') as nlp_pkl_file:
#     pickle.dump(dtm, nlp_pkl_file)

In [2]:
# # Load in dtm from pickled format
# with open('nlp_dtm.pkl', 'rb') as nlp_pkl_file:
#     dtm = pickle.load(nlp_pkl_file)

## Try model with test description

In [16]:
# Free-text query used to exercise the model; kept as a one-element list
# because the vectorizer expects an iterable of documents
test_description = [
    "My name is Bobby. The only kind of cannabis I've used was a light green and I think it came from Northeast Asia. I don't remember what it was called."
]
test_description

["My name is Bobby. The only kind of cannabis I've used was a light green and I think it came from Northeast Asia. I don't remember what it was called."]

In [17]:
# Build a 5-neighbour kd-tree index over the tf-idf matrix
# NOTE(review): the default metric is Euclidean; a description whose tf-idf row
# is all zeros would sit at distance 1.0 from a unit-length query and may outrank
# real matches -- consider metric='cosine' with algorithm='brute'. TODO confirm.
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=5).fit(dtm)
nn

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [18]:
# Query data for similar descriptions
# Vectorize the query with the already-fitted vocabulary (transform only, no refit)
new = tfidf.transform(test_description)
new

<1x1637 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [21]:
# 5 most similar strain descriptions: distances and row positions in text_list.
# toarray() yields a plain ndarray; todense() returns np.matrix, which newer
# scikit-learn releases no longer accept as estimator input.
# Note: despite its name, `probs` holds distances (smaller = more similar).
probs, strain_nums = nn.kneighbors(new.toarray())

In [22]:
probs

array([[1.        , 1.25139974, 1.26037323, 1.26785543, 1.26988953]])

In [34]:
strain_nums[0]

array([922, 214, 719,   5, 501])

In [50]:
# Look at one of the matches -- Observation: Also a "light green" color
# Index into the query result (4th neighbour; row 5 in the recorded run) rather
# than hard-coding the row number, so the cell stays valid if the data changes.
text_list[strain_nums[0][3]]

'For those craving a cerebral buzz with a citrus kick, 3C Agent Tangie is perfect. The glittery colas are light green with a zesty, floral flavor. Its effects linger in the crown of the skull and disperse throughout the body in steady waves of invigoration. This strain may assist those suffering from perpetual procrastination, depression, and fatigue.\xa0'