# Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from category_encoders import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors

# Read in data

In [2]:
df = pd.read_csv("cannabis.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [3]:
df.drop([df.columns[0], df.columns[3]], axis=1, inplace=True)

In [4]:
df.dropna(inplace=True)

In [5]:
df.isnull().sum()

Strain         0
Type           0
Effects        0
Flavor         0
Description    0
dtype: int64

In [6]:
df.head(1)

Unnamed: 0,Strain,Type,Effects,Flavor,Description
0,100-Og,hybrid,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...


In [7]:
x = set()
x.update([1,2,3,4])
x.update([5,6,7,8])
x

{1, 2, 3, 4, 5, 6, 7, 8}

In [8]:
effects = set()

for ls in df.Effects.str.split(','):
    effects.update(ls)
effects

{'Aroused',
 'Creative',
 'Dry Mouth',
 'Energetic',
 'Euphoric',
 'Focused',
 'Giggly',
 'Happy',
 'Hungry',
 'None',
 'Relaxed',
 'Sleepy',
 'Talkative',
 'Tingly',
 'Uplifted'}

In [9]:
flavors = set()

for ls in df.Flavor.str.split(','):
    flavors.update(ls)
#flavors

In [10]:
new_df = pd.DataFrame(data=np.zeros((df.shape[0], len(effects))),columns=effects)
new_df.head()

Unnamed: 0,Focused,Giggly,Sleepy,Uplifted,Relaxed,Happy,Talkative,Creative,Hungry,Energetic,None,Aroused,Euphoric,Dry Mouth,Tingly
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
new_df.isnull().sum()

Focused      0
Giggly       0
Sleepy       0
Uplifted     0
Relaxed      0
Happy        0
Talkative    0
Creative     0
Hungry       0
Energetic    0
None         0
Aroused      0
Euphoric     0
Dry Mouth    0
Tingly       0
dtype: int64

In [12]:
for i in range(df.shape[0]):
    for effect in effects:
        if effect in df.Effects.iloc[i]:
            new_df[effect][i] = 1

In [13]:
new_df.head()

Unnamed: 0,Focused,Giggly,Sleepy,Uplifted,Relaxed,Happy,Talkative,Creative,Hungry,Energetic,None,Aroused,Euphoric,Dry Mouth,Tingly
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
new_df.isnull().sum()

Focused      0
Giggly       0
Sleepy       0
Uplifted     0
Relaxed      0
Happy        0
Talkative    0
Creative     0
Hungry       0
Energetic    0
None         0
Aroused      0
Euphoric     0
Dry Mouth    0
Tingly       0
dtype: int64

In [15]:
flavor_df = pd.DataFrame(data=np.zeros((df.shape[0],len(flavors))), columns=flavors)
flavor_df.head(1)

Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,Apple,Tobacco,Honey,Chestnut,Mango,Menthol,Lemon,Tree,Grape,Lavender
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
flavor_df.isnull().sum()

Pepper          0
Spicy/Herbal    0
Pine            0
Grapefruit      0
Apricot         0
Peach           0
Mint            0
Tea             0
Tar             0
Cheese          0
Vanilla         0
None            0
Minty           0
Diesel          0
Woody           0
Citrus          0
Sage            0
Ammonia         0
Fruit           0
Violet          0
Skunk           0
Butter          0
Flowery         0
Blueberry       0
Rose            0
Pineapple       0
Pear            0
Lime            0
Strawberry      0
Coffee          0
Berry           0
Sweet           0
Earthy          0
Nutty           0
Blue            0
Chemical        0
Pungent         0
Orange          0
Plum            0
Tropical        0
Apple           0
Tobacco         0
Honey           0
Chestnut        0
Mango           0
Menthol         0
Lemon           0
Tree            0
Grape           0
Lavender        0
dtype: int64

In [17]:
for i in range(df.shape[0]):
    for flavor in flavors:
        if flavor in df.Flavor.iloc[i]:
            flavor_df[flavor][i] = 1

In [18]:
flavor_df.head()

Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,Apple,Tobacco,Honey,Chestnut,Mango,Menthol,Lemon,Tree,Grape,Lavender
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
flavor_df.isnull().sum()

Pepper          0
Spicy/Herbal    0
Pine            0
Grapefruit      0
Apricot         0
Peach           0
Mint            0
Tea             0
Tar             0
Cheese          0
Vanilla         0
None            0
Minty           0
Diesel          0
Woody           0
Citrus          0
Sage            0
Ammonia         0
Fruit           0
Violet          0
Skunk           0
Butter          0
Flowery         0
Blueberry       0
Rose            0
Pineapple       0
Pear            0
Lime            0
Strawberry      0
Coffee          0
Berry           0
Sweet           0
Earthy          0
Nutty           0
Blue            0
Chemical        0
Pungent         0
Orange          0
Plum            0
Tropical        0
Apple           0
Tobacco         0
Honey           0
Chestnut        0
Mango           0
Menthol         0
Lemon           0
Tree            0
Grape           0
Lavender        0
dtype: int64

In [35]:
f_e_df = pd.concat([flavor_df, new_df], axis=1)
f_e_df.head()

Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,Happy,Talkative,Creative,Hungry,Energetic,None,Aroused,Euphoric,Dry Mouth,Tingly
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [36]:
f_e_df.isnull().sum()

Pepper          0
Spicy/Herbal    0
Pine            0
Grapefruit      0
Apricot         0
               ..
None            0
Aroused         0
Euphoric        0
Dry Mouth       0
Tingly          0
Length: 65, dtype: int64

In [37]:
f_e_df.shape

(2277, 65)

In [38]:
f_e_df = pd.concat([f_e_df, df.Type, df.Description, df.Strain], axis=1)
f_e_df.head()

Unnamed: 0,Pepper,Spicy/Herbal,Pine,Grapefruit,Apricot,Peach,Mint,Tea,Tar,Cheese,...,Hungry,Energetic,None,Aroused,Euphoric,Dry Mouth,Tingly,Type,Description,Strain
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,hybrid,$100 OG is a 50/50 hybrid strain that packs a ...,100-Og
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,hybrid,The ‘98 Aloha White Widow is an especially pot...,98-White-Widow
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,sativa,1024 is a sativa-dominant hybrid bred in Spain...,1024
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,hybrid,13 Dawgs is a hybrid of G13 and Chemdawg genet...,13-Dawgs
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,hybrid,"Also known as Kosher Tangie, 24k Gold is a 60%...",24K-Gold


In [39]:
f_e_df.isnull().sum()

Pepper          72
Spicy/Herbal    72
Pine            72
Grapefruit      72
Apricot         72
                ..
Dry Mouth       72
Tingly          72
Type            72
Description     72
Strain          72
Length: 68, dtype: int64

In [40]:
assert len(df.Description) == f_e_df.shape[0]

AssertionError: 

In [42]:
f_e_df.dropna(inplace=True)

In [43]:
f_e_df.to_csv('psuedo_ohe.csv', index=None)

In [44]:
flavors

{'Ammonia',
 'Apple',
 'Apricot',
 'Berry',
 'Blue',
 'Blueberry',
 'Butter',
 'Cheese',
 'Chemical',
 'Chestnut',
 'Citrus',
 'Coffee',
 'Diesel',
 'Earthy',
 'Flowery',
 'Fruit',
 'Grape',
 'Grapefruit',
 'Honey',
 'Lavender',
 'Lemon',
 'Lime',
 'Mango',
 'Menthol',
 'Mint',
 'Minty',
 'None',
 'Nutty',
 'Orange',
 'Peach',
 'Pear',
 'Pepper',
 'Pine',
 'Pineapple',
 'Plum',
 'Pungent',
 'Rose',
 'Sage',
 'Skunk',
 'Spicy/Herbal',
 'Strawberry',
 'Sweet',
 'Tar',
 'Tea',
 'Tobacco',
 'Tree',
 'Tropical',
 'Vanilla',
 'Violet',
 'Woody'}