In [2]:
import pandas as pd


In [73]:
# CLEANING STEPS >>>> MAKE FUNCTION! [done]
def clean_csv(csv_url):
  df = pd.read_csv(csv_url)
  # typos
  df.Effects = df.Effects.str.replace('Uplifted,Happy,Energentic\n', 'Uplifted,Happy,Energetic\n')
  df.Effects = df.Effects.str.replace('Dry,Mouth', '') # an outlier, only occurs once
  df.Effects = df.Effects.str.replace('None', '') # replace None with ''
  df.Flavor = df.Flavor.str.replace('/', ',') # handling inconsistent separators
  df['Description'] = df['Description'].str.replace('\xa0','') # this is a weird thing, doesn't seem relevant, let's drop it

  # change columns ['Effects', 'Flavor'] dtype from strings to lists of strings
  df['Effects'] = df['Effects'].str.split(',')
  df['Flavor'] = df['Flavor'].str.split(',')

  # creating unique IDs
  #df = df.reset_index()
  return df

In [74]:
df = clean_csv('https://raw.githubusercontent.com/bw-med-cabinet-1/DS/master/data/Cannabis_Strains_Features.csv')
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"[Creative, Energetic, Tingly, Euphoric, Relaxed]","[Earthy, Sweet, Citrus]",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"[Relaxed, Aroused, Creative, Happy, Energetic]","[Flowery, Violet, Diesel]",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"[Uplifted, Happy, Relaxed, Energetic, Creative]","[Spicy, Herbal, Sage, Woody]",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"[Tingly, Creative, Hungry, Relaxed, Uplifted]","[Apricot, Citrus, Grapefruit]",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"[Happy, Relaxed, Euphoric, Uplifted, Talkative]","[Citrus, Earthy, Orange]","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [6]:
# TODO: 
# create individual features from effects lists -- nvm just encode later before model
# useful features: effects, type 
# could do NLP on description and encode
#   top-used terms that seem most predictive

# 

In [75]:
# creates corpus of unique effects to encode as features later
Effects_corpus = set()
frequency_dictionary = []
for ix, row in df.iterrows():
  if len(row.Effects) > 0:
    for effect in row.Effects:
      if effect:
        Effects_corpus.add(effect.strip())
        frequency_dictionary.append(effect.strip())

# seems clean enough - has a lot of interesting domain-specific terms
# maybe some of them could be useful for prediction
# here i'll just use the same technique to get a list of unique terms in description
#   aside from stopwords/domain redundancy, and obscure or rarely used words, 
#   the meat in the middle should give some useful features to encode from descr
import string, re
Descr_corpus = set()
Descr_frequency_dictionary = []
for ix, row in df.iterrows():
  Descr_list = re.sub('[\W_]', ' ', row.Description)
  Descr_list = Descr_list.split(' ')
  if len(Descr_list) > 0:
    for word in Descr_list:
      if word: # eliminate empty strings
        Descr_corpus.add(word.strip().lower())
        Descr_frequency_dictionary.append(word.strip().lower())


In [8]:
Effects_corpus

{'Aroused',
 'Creative',
 'Energetic',
 'Euphoric',
 'Focused',
 'Giggly',
 'Happy',
 'Hungry',
 'Relaxed',
 'Sleepy',
 'Talkative',
 'Tingly',
 'Uplifted'}

In [9]:
freq_dict = pd.Series(frequency_dictionary)

In [10]:
# Descr_frequency_dictionary = pd.Series(Descr_frequency_dictionary)

In [11]:
# sorted = Descr_frequency_dictionary.value_counts()

In [12]:
# can augment the usefulness of these features?
# for example, "happy" is common feature - indicates it treats depression,
# so user can specify "symptom" to treat depression and encode happy as 1
#
# relaxed - treats anxiety
# euphoric - treats depression, pain
# uplifted - treats depression, fatigue
# creative - treats depression
# sleepy - treats insomnia, anxiety
# energetic - treats fatigue, pain, depression
# focused - treats fatigue, depression, brain fog?
# hungry - treats loss of appetite, nausea
# talkative - treats anxiety?
# tingly - i dunno sounds like a side effect not a medical treatment
# giggly - treats depression? 
# aroused 
# dry mouth - side effect, but an outlier

In [13]:
# I want a medication that...

# TREATS:
# anxiety, depression, pain, fatigue, insomnia, 
# brain fog, loss of appetite, nausea, low libido

# and makes me:
# happy, relaxed, euphoric, uplifted, creative, sleepy, 
# energized, focused, hungry, talkative, tingly, giggly

# optionally...
# I do NOT want to feel
# happy, relaxed, euphoric, uplifted, creative, sleepy, 
# energized, focused, hungry, talkative, tingly, giggly

In [14]:
# OPTIONAL: limit my results to
# indica, sativa, hybrid (can choose all or none)
# 2 or more stars, 3 or more stars, or 4 or more stars (one of these or none)

In [15]:
# USEFUL KEYWORDS FOUND IN DESCRIPTION
# body, potent, stress, relaxing, cerebral, mind, physical, uplifting, 
# relaxation, day, cbd, euphoria, anxiety, relief, mood, appetite, mental, 
# depression, energy, balanced, nausea, creative, insomnia, alien, good, help, 
# stimulating

# OK modeling cells to import to other nb

In [76]:
# this cell creates features from some keywords determined by frequency/relevance
kw = ['body', 'potent', 'stress', 'relaxing', 'cerebral', 'mind', 'physical', 
      'uplifting', 'relaxation', 'day', 'cbd', 'euphoria', 'anxiety', 'relief',
      'mood', 'appetite', 'mental', 'depression', 'energy', 'balanced', 
      'nausea', 'creative', 'insomnia', 'alien', 'good', 'help', 'stimulating']
# these lines create one hot for every keyword
for keyword in kw:
  df[keyword]=df.Description.str.contains(keyword)

In [77]:
df['anxiety'] = df.Effects.str.join(' ').str.contains('Relaxed'or'Sleepy' or'Talkative')
df['depression'] = df.Effects.str.join(' ').str.contains('euphoric'or'uplifted'or'creative'or
                                           'energetic'or'focused'or'giggly')
df['pain'] = df.Effects.str.join(' ').str.contains('euphoric'or'energetic')
df['fatigue'] = df.Effects.str.join(' ').str.contains('uplifted'or'energetic'or'focused')
df['insomnia'] = df.Effects.str.join(' ').str.contains('sleepy')
df['brain fog'] = df.Effects.str.join(' ').str.contains('focused'or'creative')
df['loss of appetite'] = df.Effects.str.join(' ').str.contains('hungry')
df['nausea'] = df.Effects.str.join(' ').str.contains('hungry')
df['low libido'] = df.Effects.str.join(' ').str.contains('aroused')

In [78]:
df.anxiety.value_counts()

True     1760
False     591
Name: anxiety, dtype: int64

In [79]:
unique_types = df.Type.unique()
for straintype in unique_types:
  df[straintype]=df.Type.str.contains(straintype)

In [80]:
# cell creates one hot for every effect
for effect in Effects_corpus:
  for ix, row in df.iterrows():
    df[effect] = effect in row.Effects

In [81]:
# creates list of possible flavors
Flavor_corpus = set()
for ix, strain in df.iterrows():
  if len(strain.Flavor) > 0:
    for flavor in strain.Flavor:
      if flavor and flavor != "None":
        Flavor_corpus.add(flavor.strip())
Flavor_corpus

{'Ammonia',
 'Apple',
 'Apricot',
 'Berry',
 'Bluberry',
 'Blue',
 'Blueberry',
 'Bubblegum',
 'Butter',
 'Cheese',
 'Chemical',
 'Chestnut',
 'Citrus',
 'Coffee',
 'Diesel',
 'Earthy',
 'Flowery',
 'Fruit',
 'Fruity',
 'Grape',
 'Grapefruit',
 'Grapes',
 'Herbal',
 'Honey',
 'Lavender',
 'Lemon',
 'Lime',
 'Mango',
 'Menthol',
 'Mint',
 'Minty',
 'Nutty',
 'Orange',
 'Peach',
 'Pear',
 'Pepper',
 'Pine',
 'Pineapple',
 'Plum',
 'Pungent',
 'Rose',
 'Sage',
 'Skunk',
 'Spicy',
 'Strawberry',
 'Sweet',
 'Tar',
 'Tea',
 'Tobacco',
 'Tree',
 'Tree Fruit',
 'Tropical',
 'Vanilla',
 'Violet',
 'Woody'}

In [82]:
# cell adds one hot for every flavor
for flavor in Flavor_corpus:
  for ix, row in df.iterrows():
    df[flavor] = flavor in row.Flavor


In [83]:
df.anxiety.value_counts()

True     1760
False     591
Name: anxiety, dtype: int64

In [135]:
df.replace(False, 0, inplace=True)

In [136]:
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,body,potent,stress,relaxing,cerebral,mind,physical,uplifting,relaxation,day,cbd,euphoria,anxiety,relief,mood,appetite,mental,depression,energy,balanced,nausea,creative,insomnia,alien,good,help,stimulating,pain,fatigue,brain fog,loss of appetite,low libido,hybrid,sativa,...,Pepper,Fruit,Apricot,Mango,Tea,Vanilla,Berry,Strawberry,Menthol,Blue,Honey,Blueberry,Minty,Pine,Lavender,Flowery,Orange,Nutty,Grapes,Woody,Tropical,Peach,Grape,Diesel,Spicy,Mint,Sweet,Coffee,Chemical,Cheese,Tar,Ammonia,Bubblegum,Pineapple,Lemon,Plum,Earthy,Violet,Pungent,Lime
0,100-Og,hybrid,4.0,"[Creative, Energetic, Tingly, Euphoric, Relaxed]","[Earthy, Sweet, Citrus]",$100 OG is a 50/50 hybrid strain that packs a ...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,True,0.0
1,98-White-Widow,hybrid,4.7,"[Relaxed, Aroused, Creative, Happy, Energetic]","[Flowery, Violet, Diesel]",The ‘98 Aloha White Widow is an especially pot...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,True,0.0
2,1024,sativa,4.4,"[Uplifted, Happy, Relaxed, Energetic, Creative]","[Spicy, Herbal, Sage, Woody]",1024 is a sativa-dominant hybrid bred in Spain...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,True,0.0
3,13-Dawgs,hybrid,4.2,"[Tingly, Creative, Hungry, Relaxed, Uplifted]","[Apricot, Citrus, Grapefruit]",13 Dawgs is a hybrid of G13 and Chemdawg genet...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,True,0.0
4,24K-Gold,hybrid,4.6,"[Happy, Relaxed, Euphoric, Uplifted, Talkative]","[Citrus, Earthy, Orange]","Also known as Kosher Tangie, 24k Gold is a 60%...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0,True,0.0


In [137]:
# high cardinality/useless for prediction:
# [Strain, Rating, Effects, Flavor, Description]
# useful:
# all one-hot, plus type

df_train = df.drop(columns={'Strain', 'Rating', 'Effects', 'Flavor', 'Description', 'Type'})

In [138]:
df_train.shape

(2351, 103)

In [139]:
from sklearn.neighbors import NearestNeighbors

nn_model = NearestNeighbors(n_neighbors=5)

nn_model.fit(df_train)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [140]:
example = df_train.iloc[[0]]
#example = example.to_dict()
example.shape

(1, 103)

In [141]:
#example.to_json(orient='records')

In [142]:
nn_model.kneighbors(example)

(array([[0., 0., 1., 1., 1.]]), array([[   0,  213, 1730, 1226, 1248]]))

In [181]:
example_results = nn_model.kneighbors(example)
similarity = list(example_results[0][0])
idx = list(example_results[1][0])
results = dict(zip(idx, similarity))
results

{0: 0.0, 213: 0.0, 1226: 1.0, 1248: 1.0, 1730: 1.0}

In [144]:
recs = [df.iloc[result] for result in example_results]
recs

[Strain                                               100-Og
 Type                                                 hybrid
 Rating                                                    4
 Effects    [Creative, Energetic, Tingly, Euphoric, Relaxed]
 Flavor                              [Earthy, Sweet, Citrus]
                                  ...                       
 Plum                                                      0
 Earthy                                                 True
 Violet                                                    0
 Pungent                                                True
 Lime                                                      0
 Name: 0, Length: 109, dtype: object,
 Strain                                        Bio-Jesus
 Type                                             hybrid
 Rating                                              4.6
 Effects    [Happy, Relaxed, Euphoric, Sleepy, Uplifted]
 Flavor                       [Earthy, Chemical, Diesel]
      

In [145]:
from joblib import dump, load
dump(nn_model, 'nn_model.joblib', compress=True)

['nn_model.joblib']

In [146]:
!ls -lh nn_model.joblib

-rw-r--r-- 1 root root 88K Sep 22 23:35 nn_model.joblib


In [147]:
df_train.columns

Index(['body', 'potent', 'stress', 'relaxing', 'cerebral', 'mind', 'physical',
       'uplifting', 'relaxation', 'day',
       ...
       'Tar', 'Ammonia', 'Bubblegum', 'Pineapple', 'Lemon', 'Plum', 'Earthy',
       'Violet', 'Pungent', 'Lime'],
      dtype='object', length=103)

In [148]:
unique_types

array(['hybrid', 'sativa', 'indica'], dtype=object)

In [149]:
Effects_corpus

{'Aroused',
 'Creative',
 'Energetic',
 'Euphoric',
 'Focused',
 'Giggly',
 'Happy',
 'Hungry',
 'Relaxed',
 'Sleepy',
 'Talkative',
 'Tingly',
 'Uplifted'}

In [150]:
conditions = ['anxiety', 'depression', 'pain', 'fatigue', 'insomnia',
              'brain fog', 'loss of appetite', 'nausea', 'low libido']

In [151]:
conditions

['anxiety',
 'depression',
 'pain',
 'fatigue',
 'insomnia',
 'brain fog',
 'loss of appetite',
 'nausea',
 'low libido']

In [158]:
for column in df_train.columns:
  print(f'"{column}",')

"body",
"potent",
"stress",
"relaxing",
"cerebral",
"mind",
"physical",
"uplifting",
"relaxation",
"day",
"cbd",
"euphoria",
"anxiety",
"relief",
"mood",
"appetite",
"mental",
"depression",
"energy",
"balanced",
"nausea",
"creative",
"insomnia",
"alien",
"good",
"help",
"stimulating",
"pain",
"fatigue",
"brain fog",
"loss of appetite",
"low libido",
"hybrid",
"sativa",
"indica",
"Focused",
"Happy",
"Aroused",
"Uplifted",
"Creative",
"Hungry",
"Sleepy",
"Giggly",
"Relaxed",
"Tingly",
"Energetic",
"Euphoric",
"Talkative",
"Grapefruit",
"Pear",
"Tree",
"Tobacco",
"Apple",
"Herbal",
"Citrus",
"Sage",
"Butter",
"Bluberry",
"Fruity",
"Tree Fruit",
"Rose",
"Chestnut",
"Skunk",
"Pepper",
"Fruit",
"Apricot",
"Mango",
"Tea",
"Vanilla",
"Berry",
"Strawberry",
"Menthol",
"Blue",
"Honey",
"Blueberry",
"Minty",
"Pine",
"Lavender",
"Flowery",
"Orange",
"Nutty",
"Grapes",
"Woody",
"Tropical",
"Peach",
"Grape",
"Diesel",
"Spicy",
"Mint",
"Sweet",
"Coffee",
"Chemical",
"Cheese",
"Tar",
"Ammonia",
"Bubbl