In [39]:
import spacy
import pandas as pd
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
url = "https://raw.githubusercontent.com/MedicinalCabinet/DataScience/master/Data/cannabis.csv"

In [5]:
df = pd.read_csv(url)

In [8]:
df.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

In [64]:
df1 = df.dropna()
label = df1[df1["Description"]=="None"].index
#Drop all rows that have none values in all rows
df1 = df1.drop(index=label)

In [10]:
df1.isnull().sum()

Strain         0
Type           0
Rating         0
Effects        0
Flavor         0
Description    0
dtype: int64

In [12]:
df1.dtypes

Strain          object
Type            object
Rating         float64
Effects         object
Flavor          object
Description     object
dtype: object

None                        110
Earthy,Sweet,Pungent         25
Sweet,Earthy,Pungent         19
Earthy,Pungent,Sweet         19
Earthy,Pungent,Woody         19
                           ... 
Sweet,Minty,Earthy            1
Pineapple,Tropical,Sweet      1
Lime,Pine,Spicy/Herbal        1
Tobacco,Earthy                1
Lemon,Earthy                  1
Name: Flavor, Length: 1278, dtype: int64

In [13]:
df1.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [16]:
#Create the nlp object
nlp = spacy.load("en_core_web_lg")

# create tokenizer object
tokenizer = Tokenizer(nlp.vocab)

In [17]:
def tokenize(doc):
    """Return the tokens"""
    return [token.text for token in tokenizer(doc)]

def get_lemmas(text):
    """Return the Lemmas"""
    lemmas = []
    doc = nlp(text)
    
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

In [65]:
df1['alltext'] = df1['Effects'].str.cat(df1["Flavor"], sep=" ")

In [66]:
df1['alltext'] = df1['alltext'].str.cat(df1["Description"], sep=" ")

In [67]:
df1["alltext"][0]

'Creative,Energetic,Tingly,Euphoric,Relaxed Earthy,Sweet,Citrus $100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.'

In [68]:
text = df1["alltext"]

# Instantiate vectorizer object
tfidf = TfidfVectorizer(tokenizer=get_lemmas, min_df=0.025, max_df=.98, ngram_range=(1,2))

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(text) # Similiar to fit_predict

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2273, 440)


Unnamed: 0,Unnamed: 1,1,10,10 week,1st,20,7,70,8,8 9,...,way,week,white,white widow,widow,win,woody,x,yield,Unnamed: 21
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.048488,0.0,0.0,0.0,0.134054,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.385739,0.493677,0.490975,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.161058,0.0,0.0,0.0
3,0.085556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.156774,0.0,0.0,0.0
4,0.075374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
# Fit on TF-IDF Vectors
nn  = NearestNeighbors(n_neighbors=10, algorithm='kd_tree')
nn.fit(dtm)

review = [""" I have insominia and need something to help me fall asleep
"""]

new = tfidf.transform(review)
nn.kneighbors(new.todense())

(array([[1.16833771, 1.2222853 , 1.23234611, 1.23841935, 1.23874699,
         1.24745394, 1.25180746, 1.25417073, 1.25819497, 1.26487478]]),
 array([[1292, 1821, 1450,  255,  960,  557, 1635, 1514, 1187,  527]]))

In [97]:
df1[1292:1293]

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,alltext
1323,Madagascar,indica,4.1,"Sleepy,Relaxed,Hungry,Happy,Euphoric","Earthy,Sweet,Skunk","Madagascar is an indica with a clean, floral s...","Sleepy,Relaxed,Hungry,Happy,Euphoric Earthy,Sw..."


In [98]:
df1['Description'][1323]

'Madagascar is an indica with a clean, floral smell. In true indica spirit, this strain produces a heavy-bodied, quick, and powerful sensation that is great for putting you to sleep in a hurry. Madagascar’s effects aren’t the longest-lasting but they hit quickly, making this strain great for those who just need a little help tackling insomnia.'