# Med Cab Unit 4 Notebook
Notebook for Data analysis for DS17 Unit 4 Build week

In [25]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import en_core_web_sm
import spacy
from spacy import load

In [2]:
url = 'https://raw.githubusercontent.com/build-week-ft-med-cabinet-3/datascience/master/data/toking.csv'

df = pd.read_csv(url)

df.head()

Unnamed: 0,name,type,flavors,positive_effects,negative_effects,ailment,search
0,Afpak,hybrid,"Earthy, Chemical, Pine","Relaxed, Hungry, Happy, Sleepy",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...","Afpak,Relaxed, Hungry, Happy, Sleepy,Earthy, C..."
1,African,sativa,"Spicy/Herbal, Pungent, Earthy","Euphoric, Happy, Creative, Energetic, Talkative",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...","African,Euphoric, Happy, Creative, Energetic, ..."
2,Afternoon Delight,hybrid,"Pepper, Flowery, Pine","Relaxed, Hungry, Euphoric, Uplifted, Tingly","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...","Afternoon Delight,Relaxed, Hungry, Euphoric, U..."
3,Afwreck,hybrid,"Pine, Earthy, Flowery","Relaxed, Happy, Creative, Uplifted, Sleepy","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...","Afwreck,Relaxed, Happy, Creative, Uplifted, Sl..."
4,Agent Orange,hybrid,"Citrus, Orange, Sweet","Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...","Agent Orange,Relaxed, Euphoric, Happy, Energet..."


In [3]:
df.shape

(1970, 7)

### List of TODOs:
- Tokenze relevant columns
  - Are we just using ailments?  Or do we also want to include Positive effects?  Negative?
- Nearest Neighbors model
  - Initially will only play with ailments
  - Explore the `search` column


In [4]:
df['ailment'] = df['ailment'].astype(str)

In [5]:
for tokens in df['ailment']:
  if type(tokens) == float:
    print(tokens)

In [6]:
# Tokenize columns
nlp= en_core_web_sm.load()

In [7]:
def tokenizer(text):
    doc=nlp(str(text).lower())
    return [token.lemma_ for token in doc if ((token.is_stop == False) and
    (token.is_punct == False)) and (token.pos_ != 'PRON')]

In [8]:
df['tokens'] = df['ailment'].apply(tokenizer)

In [9]:
df.head()

Unnamed: 0,name,type,flavors,positive_effects,negative_effects,ailment,search,tokens
0,Afpak,hybrid,"Earthy, Chemical, Pine","Relaxed, Hungry, Happy, Sleepy",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...","Afpak,Relaxed, Hungry, Happy, Sleepy,Earthy, C...","[depression, insomnia, pain, stress, lack, app..."
1,African,sativa,"Spicy/Herbal, Pungent, Earthy","Euphoric, Happy, Creative, Energetic, Talkative",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...","African,Euphoric, Happy, Creative, Energetic, ...","[depression, pain, stress, lack, appetite, nau..."
2,Afternoon Delight,hybrid,"Pepper, Flowery, Pine","Relaxed, Hungry, Euphoric, Uplifted, Tingly","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...","Afternoon Delight,Relaxed, Hungry, Euphoric, U...","[depression, insomnia, pain, stress, cramp, he..."
3,Afwreck,hybrid,"Pine, Earthy, Flowery","Relaxed, Happy, Creative, Uplifted, Sleepy","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...","Afwreck,Relaxed, Happy, Creative, Uplifted, Sl...","[pain, stress, headache, fatigue, headache, mu..."
4,Agent Orange,hybrid,"Citrus, Orange, Sweet","Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...","Agent Orange,Relaxed, Euphoric, Happy, Energet...","[depression, pain, stress, nausea, headache, h..."


In [10]:
df.dropna(inplace=True)

In [11]:
tokenset = set()

for tokens in df['tokens']:
  for token in tokens:
    tokenset.add(token)

In [12]:
tokenset

{'appetite',
 'cramp',
 'depression',
 'eye',
 'fatigue',
 'headache',
 'inflammation',
 'insomnia',
 'lack',
 'muscle',
 'nan',
 'nausea',
 'pain',
 'pressure',
 'seizure',
 'spasm',
 'spasms',
 'spasticity',
 'stress'}

### Combining datasets


In [13]:
url2 = 'https://raw.githubusercontent.com/med-cab-1/data_engineer/master/Data/cannabis_new.csv'

df2 = pd.read_csv(url2)
df2.head()

Unnamed: 0,Index,Strain,Type,Rating,Effects,Description,Flavors,Nearest
0,0,Kelly Hill Gold,indica,5.0,"Happy,Energetic,Euphoric,Talkative,Aroused",Cultivated by Joseph Arthur Botanicals in Colo...,"Pepper,Earthy,Coffee",9928974391841877
1,1,Spyder Mon,hybrid,5.0,"Uplifted,Creative,Focused,Happy,Relaxed",Spyder Mon is an uplifting CBD strain with a g...,"Citrus,Earthy,Sweet",12181571627223742
2,2,Mochi,hybrid,5.0,"Sleepy,Happy,Hungry,Relaxed,Tingly",Mochi by Sherbinski is another strain that lea...,"Pungent,Minty,Flowery",2614457208705130
3,3,Molokai Purpz,indica,5.0,"Aroused,Creative,Euphoric,Relaxed,Sleepy",Moloka’i Purpz is a luscious Hawaiian landrace...,"Berry,Grape,Sweet",31478562106350390
4,4,Monolith,indica,5.0,"Relaxed,Sleepy,Tingly,Euphoric,Focused",Monolith is an indica-dominant strain with Afg...,"Pungent,Earthy,Pine",413978621094214141


In [14]:
df.head()

Unnamed: 0,name,type,flavors,positive_effects,negative_effects,ailment,search,tokens
0,Afpak,hybrid,"Earthy, Chemical, Pine","Relaxed, Hungry, Happy, Sleepy",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...","Afpak,Relaxed, Hungry, Happy, Sleepy,Earthy, C...","[depression, insomnia, pain, stress, lack, app..."
1,African,sativa,"Spicy/Herbal, Pungent, Earthy","Euphoric, Happy, Creative, Energetic, Talkative",Dry Mouth,"Depression, Pain, Stress, Lack of Appetite, Na...","African,Euphoric, Happy, Creative, Energetic, ...","[depression, pain, stress, lack, appetite, nau..."
2,Afternoon Delight,hybrid,"Pepper, Flowery, Pine","Relaxed, Hungry, Euphoric, Uplifted, Tingly","Dizzy, Dry Mouth, Paranoid","Depression, Insomnia, Pain, Stress, Cramps, He...","Afternoon Delight,Relaxed, Hungry, Euphoric, U...","[depression, insomnia, pain, stress, cramp, he..."
3,Afwreck,hybrid,"Pine, Earthy, Flowery","Relaxed, Happy, Creative, Uplifted, Sleepy","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Pain, Stress, Headache, Fatigue, Headaches, Mu...","Afwreck,Relaxed, Happy, Creative, Uplifted, Sl...","[pain, stress, headache, fatigue, headache, mu..."
4,Agent Orange,hybrid,"Citrus, Orange, Sweet","Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes","Depression, Pain, Stress, Nausea, Headache, He...","Agent Orange,Relaxed, Euphoric, Happy, Energet...","[depression, pain, stress, nausea, headache, h..."


In [15]:
df2['name'] = df2['Strain']

In [16]:
df2.reset_index()

full_df = df.merge(df2, on='name', how='left')

full_df.head(1)

Unnamed: 0,name,type,flavors,positive_effects,negative_effects,ailment,search,tokens,Index,Strain,Type,Rating,Effects,Description,Flavors,Nearest
0,Afpak,hybrid,"Earthy, Chemical, Pine","Relaxed, Hungry, Happy, Sleepy",Dizzy,"Depression, Insomnia, Pain, Stress, Lack of Ap...","Afpak,Relaxed, Hungry, Happy, Sleepy,Earthy, C...","[depression, insomnia, pain, stress, lack, app...",1667.0,Afpak,hybrid,3.18,"Relaxed,Creative,Focused,Sleepy,Happy","Afpak, named for its direct Afghani and Pakist...","Pine,Spicy/Herbal,Earthy",1667428378100810521968


In [17]:
full_df.shape

(1968, 16)

In [21]:
df = full_df[['name', 'Rating', 'type', 'ailment', 'positive_effects', 'negative_effects', 'Description', 'Effects', 'flavors']]

df.head()

Unnamed: 0,name,Rating,type,ailment,positive_effects,negative_effects,Description,Effects,flavors
0,Afpak,3.18,hybrid,"Depression, Insomnia, Pain, Stress, Lack of Ap...","Relaxed, Hungry, Happy, Sleepy",Dizzy,"Afpak, named for its direct Afghani and Pakist...","Relaxed,Creative,Focused,Sleepy,Happy","Earthy, Chemical, Pine"
1,African,2.5,sativa,"Depression, Pain, Stress, Lack of Appetite, Na...","Euphoric, Happy, Creative, Energetic, Talkative",Dry Mouth,African refers to the indigenous varieties of ...,"Euphoric,Energetic,Aroused,Tingly,Creative","Spicy/Herbal, Pungent, Earthy"
2,Afternoon Delight,4.55,hybrid,"Depression, Insomnia, Pain, Stress, Cramps, He...","Relaxed, Hungry, Euphoric, Uplifted, Tingly","Dizzy, Dry Mouth, Paranoid","Afternoon Delight, created by Colorado Seed In...","Talkative,Relaxed,Uplifted,Tingly,Creative","Pepper, Flowery, Pine"
3,Afwreck,3.18,hybrid,"Pain, Stress, Headache, Fatigue, Headaches, Mu...","Relaxed, Happy, Creative, Uplifted, Sleepy","Dizzy, Dry Mouth, Paranoid, Dry Eyes",Afwreck is a hybrid cross of Afghani and Train...,"Euphoric,Happy,Uplifted,Relaxed,Sleepy","Pine, Earthy, Flowery"
4,Agent Orange,3.18,hybrid,"Depression, Pain, Stress, Nausea, Headache, He...","Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes",Don’t let the name scare you! The only herbici...,"Happy,Uplifted,Relaxed,Energetic,Euphoric","Citrus, Orange, Sweet"


In [22]:
df['strain_id'] = df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
df.head()

Unnamed: 0,name,Rating,type,ailment,positive_effects,negative_effects,Description,Effects,flavors,strain_id
0,Afpak,3.18,hybrid,"Depression, Insomnia, Pain, Stress, Lack of Ap...","Relaxed, Hungry, Happy, Sleepy",Dizzy,"Afpak, named for its direct Afghani and Pakist...","Relaxed,Creative,Focused,Sleepy,Happy","Earthy, Chemical, Pine",0
1,African,2.5,sativa,"Depression, Pain, Stress, Lack of Appetite, Na...","Euphoric, Happy, Creative, Energetic, Talkative",Dry Mouth,African refers to the indigenous varieties of ...,"Euphoric,Energetic,Aroused,Tingly,Creative","Spicy/Herbal, Pungent, Earthy",1
2,Afternoon Delight,4.55,hybrid,"Depression, Insomnia, Pain, Stress, Cramps, He...","Relaxed, Hungry, Euphoric, Uplifted, Tingly","Dizzy, Dry Mouth, Paranoid","Afternoon Delight, created by Colorado Seed In...","Talkative,Relaxed,Uplifted,Tingly,Creative","Pepper, Flowery, Pine",2
3,Afwreck,3.18,hybrid,"Pain, Stress, Headache, Fatigue, Headaches, Mu...","Relaxed, Happy, Creative, Uplifted, Sleepy","Dizzy, Dry Mouth, Paranoid, Dry Eyes",Afwreck is a hybrid cross of Afghani and Train...,"Euphoric,Happy,Uplifted,Relaxed,Sleepy","Pine, Earthy, Flowery",3
4,Agent Orange,3.18,hybrid,"Depression, Pain, Stress, Nausea, Headache, He...","Relaxed, Euphoric, Happy, Energetic, Uplifted","Dizzy, Dry Mouth, Paranoid, Dry Eyes",Don’t let the name scare you! The only herbici...,"Happy,Uplifted,Relaxed,Energetic,Euphoric","Citrus, Orange, Sweet",4


In [24]:
df.shape

(1968, 10)

In [26]:
df.isnull().sum()

name                  0
Rating              497
type                  0
ailment               0
positive_effects      0
negative_effects      0
Description         497
Effects             497
flavors               0
strain_id             0
dtype: int64

In [27]:
df = df.replace({'nan': np.NaN})

In [28]:
df.isnull().sum()

name                  0
Rating              497
type                  0
ailment              54
positive_effects      0
negative_effects      0
Description         497
Effects             497
flavors               0
strain_id             0
dtype: int64

In [31]:
df = df.dropna()

In [32]:
df.shape

(1439, 10)

In [33]:
df.isnull().sum()

name                0
Rating              0
type                0
ailment             0
positive_effects    0
negative_effects    0
Description         0
Effects             0
flavors             0
strain_id           0
dtype: int64

In [39]:
df.to_csv('cannabis_final.csv')

In [40]:
weed = pd.read_csv('weed_data.csv')

In [41]:
weed.head()

Unnamed: 0.1,Unnamed: 0,name,rating,effects,description
0,0,$100 OG,2.9,"['Euphoric', ' Creative ', ' Happy ']",$100 OG is a hybrid cannabis strain that origi...
1,1,'98 Aloha White Widow,4.8,"['Relaxed', ' Happy ', ' Euphoric ']",The ‘98 Aloha White Widow is an especially pot...
2,2,1024,4.5,"['Happy', ' Relaxed ', ' Uplifted ']",1024 is a sativa-dominant hybrid bred in Spain...
3,3,10th Planet,5.0,[''],Ethos Genetics crossed Planet of the Grapes an...
4,4,12 Year OG,3.0,"['Happy', ' Relaxed ', ' Creative ']",The Bank Cannabis Genetics crossed some unknow...


In [42]:
weed.shape

(3449, 5)