In [1]:
# import statements
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import squarify

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_md")

In [2]:
df = pd.read_csv('./final_df_percentsep.csv', sep='%', index_col='Unnamed: 0')

In [3]:
df.head()

Unnamed: 0,name,flavors,race,positive_effects,negative_effects,medical_uses,Rating,Description
1,Afpak,"Earthy, Chemical, Pine, SpicyHerbal",hybrid,"Relaxed, Hungry, Happy, Sleepy, Creative, Focused",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...",4.2,Afpak named for its direct Afghani and Pakista...
2,African,"SpicyHerbal, Pungent, Earthy, Pepper",sativa,"Euphoric, Happy, Creative, Energetic, Talkativ...",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...",3.9,African refers to the indigenous varieties of ...
3,Afternoon Delight,"Pepper, Flowery, Pine, Pungent, Citrus, Tropical",hybrid,"Relaxed, Hungry, Euphoric, Uplifted, Tingly, T...","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...",4.8,Afternoon Delight created by Colorado Seed Inc...
4,Afwreck,"Pine, Earthy, Flowery, Pungent",hybrid,"Relaxed, Happy, Creative, Uplifted, Sleepy, Eu...","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...",4.2,Afwreck is a hybrid cross of Afghani and Train...
5,Agent Orange,"Citrus, Orange, Sweet, Earthy",hybrid,"Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...",4.2,Don’t let the name scare you! The only herbici...


In [6]:
df.isnull().sum()

name                  0
flavors               0
race                  0
positive_effects      0
negative_effects    263
medical_uses          0
Rating                0
Description          22
dtype: int64

In [7]:
df = df.fillna('none')

In [8]:
df.isnull().sum()

name                0
flavors             0
race                0
positive_effects    0
negative_effects    0
medical_uses        0
Rating              0
Description         0
dtype: int64

In [9]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

In [10]:
def tokenize(doc):
        """Return the tokens"""
        return [token.text for token in tokenizer(doc)]

In [11]:
# combine all text features into one string:

df['combined_text'] = df.name + " " + df.flavors +  " " + df.race + " " + df.positive_effects + " " + df.negative_effects + " " + df.medical_uses + " " + df.Description
# Removing punctuations from our string
df["combined_text"] = df['combined_text'].str.replace('[^\w\s]',' ')

In [12]:
df.head()

Unnamed: 0,name,flavors,race,positive_effects,negative_effects,medical_uses,Rating,Description,combined_text
1,Afpak,"Earthy, Chemical, Pine, SpicyHerbal",hybrid,"Relaxed, Hungry, Happy, Sleepy, Creative, Focused",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...",4.2,Afpak named for its direct Afghani and Pakista...,Afpak Earthy Chemical Pine SpicyHerbal hybr...
2,African,"SpicyHerbal, Pungent, Earthy, Pepper",sativa,"Euphoric, Happy, Creative, Energetic, Talkativ...",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...",3.9,African refers to the indigenous varieties of ...,African SpicyHerbal Pungent Earthy Pepper s...
3,Afternoon Delight,"Pepper, Flowery, Pine, Pungent, Citrus, Tropical",hybrid,"Relaxed, Hungry, Euphoric, Uplifted, Tingly, T...","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...",4.8,Afternoon Delight created by Colorado Seed Inc...,Afternoon Delight Pepper Flowery Pine Punge...
4,Afwreck,"Pine, Earthy, Flowery, Pungent",hybrid,"Relaxed, Happy, Creative, Uplifted, Sleepy, Eu...","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...",4.2,Afwreck is a hybrid cross of Afghani and Train...,Afwreck Pine Earthy Flowery Pungent hybrid ...
5,Agent Orange,"Citrus, Orange, Sweet, Earthy",hybrid,"Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...",4.2,Don’t let the name scare you! The only herbici...,Agent Orange Citrus Orange Sweet Earthy hyb...


In [15]:
df.iloc[259]

id                                                                325
name                                                     Cherry Skunk
flavors                                   Sweet, Berry, Skunk, Earthy
race                                                           hybrid
positive_effects    Relaxed, Euphoric, Creative, Uplifted, Focused...
negative_effects                           Dizzy, Dry Mouth, Dry Eyes
medical_uses        Depression, Pain, Stress, Lack of Appetite, He...
Rating                                                            4.4
Description         Cherry Skunk is an indica-dominant hybrid that...
combined_text       Cherry Skunk Sweet  Berry  Skunk  Earthy hybri...
Name: 325, dtype: object

In [16]:
def get_lemmas2(text):

    nlp = spacy.load("en_core_web_md")

    tokenizer = Tokenizer(nlp.vocab)

    STOP_WORDS = nlp.Defaults.stop_words.union(['  ', '-PRON-'])

    lemmas = []

    doc = nlp(text)


    for token in doc:
        lemmas.append(token.lemma_)

    lemma_summary = []


    working_set = ""
    for lemma in lemmas:
        working_set += lemma + ' '
    lemma_summary.append(working_set)


    description = [lemma_summary[0]]

    tokens = []

    for doc in tokenizer.pipe(description, batch_size=500):

        doc_tokens = []

        for token in doc:
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
                if token.text.lower() not in STOP_WORDS:
                    doc_tokens.append(token.text.lower())

        tokens.append(doc_tokens)

    token_summary = []

    for set_of_tokens in tokens:
        working_set = ""
        for variable in set_of_tokens:
            working_set += variable + ' '
        token_summary.append(working_set)

    return token_summary[0]

In [None]:
df['new_lemmas'] = df['combined_text'].apply(get_lemmas2)

In [10]:
def get_lemmas(text):
        """Return the Lemmas"""
        lemmas = []
        doc = nlp(text)
    
        for token in doc: 
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
                lemmas.append(token.lemma_)
    
        return lemmas

In [11]:
df['all_lemmas'] = df['combined_text'].apply(get_lemmas)

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,flavors,race,positive_effects,negative_effects,medical_uses,Rating,Description,combined_text,all_lemmas
0,1,Afpak,"Earthy, Chemical, Pine, SpicyHerbal",hybrid,"Relaxed, Hungry, Happy, Sleepy, Creative, Focused",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...",4.2,Afpak named for its direct Afghani and Pakista...,Afpak Earthy Chemical Pine SpicyHerbal hybr...,"[Afpak, Earthy, , Chemical, , Pine, , Spicy..."
1,2,African,"SpicyHerbal, Pungent, Earthy, Pepper",sativa,"Euphoric, Happy, Creative, Energetic, Talkativ...",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...",3.9,African refers to the indigenous varieties of ...,African SpicyHerbal Pungent Earthy Pepper s...,"[African, SpicyHerbal, , pungent, , Earthy, ..."
2,3,Afternoon Delight,"Pepper, Flowery, Pine, Pungent, Citrus, Tropical",hybrid,"Relaxed, Hungry, Euphoric, Uplifted, Tingly, T...","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...",4.8,Afternoon Delight created by Colorado Seed Inc...,Afternoon Delight Pepper Flowery Pine Punge...,"[afternoon, Delight, Pepper, , flowery, , Pi..."
3,4,Afwreck,"Pine, Earthy, Flowery, Pungent",hybrid,"Relaxed, Happy, Creative, Uplifted, Sleepy, Eu...","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...",4.2,Afwreck is a hybrid cross of Afghani and Train...,Afwreck Pine Earthy Flowery Pungent hybrid ...,"[Afwreck, Pine, , Earthy, , Flowery, , pung..."
4,5,Agent Orange,"Citrus, Orange, Sweet, Earthy",hybrid,"Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...",4.2,Don’t let the name scare you! The only herbici...,Agent Orange Citrus Orange Sweet Earthy hyb...,"[Agent, Orange, Citrus, , Orange, , sweet, ..."


In [14]:
tfidf = TfidfVectorizer(stop_words='english')

In [15]:
dtm = tfidf.fit_transform(df['combined_text'])

In [16]:
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [17]:
dtm.head()

Unnamed: 0,11,12,13,1974,43,44,47,51,69,91,...,zeta,zingers,zion,zipping,zkittlez,zombie,zombies,zone,zoning,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Fit on DTM
nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [19]:
test_input = ["Looking for something to help with depression and insomnia"]
user_input = tfidf.transform(test_input)
score, strain_index = nn.kneighbors(user_input.todense())

In [20]:
print(strain_index)

[[1309 1232   20  503 1261]]


In [21]:
df.loc[1309]

name                                                   Tangerine Haze
flavors                                         Citrus, Orange, Sweet
race                                                           hybrid
positive_effects    Euphoric, Happy, Creative, Energetic, Uplifted...
negative_effects                  Dizzy, Dry Mouth, Dry Eyes, Anxious
medical_uses        Depression, Pain, Stress, Headache, Fatigue, E...
Rating                                                            4.5
Description         Tangerine Haze is a sativa-dominant hybrid tha...
combined_text       Tangerine Haze Citrus  Orange  Sweet hybrid Eu...
Name: 1309, dtype: object

In [33]:
df[df['name'] == "Pure Power Plant"]

Unnamed: 0,name,flavors,race,positive_effects,negative_effects,medical_uses,Rating,Description,combined_text
1060,Pure Power Plant,"Citrus, Earthy, Pungent, Pine",hybrid,"Relaxed, Happy, Creative, Energetic, Focused, ...","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Headache, Fatigue, H...",4.4,If you’re searching for a potent sativa Pure P...,Pure Power Plant Citrus Earthy Pungent Pine...


In [22]:
strains = [df[['name', 'medical_uses']].loc[n] for n in strain_index]

In [23]:
print(strains)

[                     name                                       medical_uses
1309       Tangerine Haze  Depression, Pain, Stress, Headache, Fatigue, E...
1232     Strawberry Cough  Depression, Pain, Stress, Lack of Appetite, Fa...
20    Alien Hallucination  Depression, Pain, Stress, Headache, Fatigue, M...
503          Eugene Cream         Depression, Insomnia, Pain, Stress, Cramps
1261     Super Blue Dream  Depression, Stress, Lack of Appetite, Nausea, ...]


In [24]:
from sklearn.externals import joblib 
joblib.dump(nn, 'baseline.pkl')

['baseline.pkl']

In [25]:
joblib.dump(tfidf, "tfidf.pkl")

['tfidf.pkl']