## Preprocessing

In [2]:
# import statements
import pandas as pd
import numpy as np

### Dataset 1:

In [19]:
data = pd.read_csv('cannabis.csv') # load 1st dataset
data.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 6 columns):
Strain         2351 non-null object
Type           2351 non-null object
Rating         2351 non-null float64
Effects        2351 non-null object
Flavor         2305 non-null object
Description    2318 non-null object
dtypes: float64(1), object(5)
memory usage: 110.3+ KB


We must note that "Flavor" and "Description" contain missing values, so we will replace them with empty strings ("").

In [21]:
data[['Flavor', 'Description']] = data[['Flavor', 'Description']].fillna("")

Ensure there are no duplicates.

In [22]:
data = data[data.duplicated() == False].reset_index(drop=True)

Let's create dummy variables for "Effects" and "Flavor" to make analysis easier. We need to first find how many unique effects are present across all strains.

In [23]:
effects_per_row = data['Effects'].apply(lambda x: np.array(x.split(','))).values
effects = set(np.concatenate(effects_per_row))
effects, len(effects)

({'Aroused',
  'Creative',
  'Dry',
  'Energetic',
  'Euphoric',
  'Focused',
  'Giggly',
  'Happy',
  'Hungry',
  'Mouth',
  'None',
  'Relaxed',
  'Sleepy',
  'Talkative',
  'Tingly',
  'Uplifted'},
 16)

This suggests that we must create 16 columns for each unique effect.

In [24]:
effects_dummy_data = pd.DataFrame({str('E.' + effect):[int(effect in row) for row in data['Effects'].values] for effect in effects})
effects_dummy_data.head()

Unnamed: 0,E.Talkative,E.Giggly,E.Tingly,E.Hungry,E.Energetic,E.Euphoric,E.Creative,E.Sleepy,E.Relaxed,E.Aroused,E.Uplifted,E.Dry,E.Mouth,E.Focused,E.None,E.Happy
0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1
2,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,1
3,0,0,1,1,0,0,1,0,1,0,1,0,0,0,0,0
4,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1


Repeat same procedure for "flavor".

In [25]:
flavors_per_row = data['Flavor'][data['Flavor'] != ""].apply(lambda x: np.array(x.split(','))).values
flavors = set(np.concatenate(flavors_per_row))
flavors, len(flavors)

({'Ammonia',
  'Apple',
  'Apricot',
  'Berry',
  'Blue',
  'Blueberry',
  'Butter',
  'Cheese',
  'Chemical',
  'Chestnut',
  'Citrus',
  'Coffee',
  'Diesel',
  'Earthy',
  'Flowery',
  'Fruit',
  'Grape',
  'Grapefruit',
  'Honey',
  'Lavender',
  'Lemon',
  'Lime',
  'Mango',
  'Menthol',
  'Mint',
  'Minty',
  'None',
  'Nutty',
  'Orange',
  'Peach',
  'Pear',
  'Pepper',
  'Pine',
  'Pineapple',
  'Plum',
  'Pungent',
  'Rose',
  'Sage',
  'Skunk',
  'Spicy/Herbal',
  'Strawberry',
  'Sweet',
  'Tar',
  'Tea',
  'Tobacco',
  'Tree',
  'Tropical',
  'Vanilla',
  'Violet',
  'Woody'},
 50)

This suggests that we must create 50 columns for each unique flavor.

In [26]:
flavors_dummy_data = pd.DataFrame({str('F.' + flavor):[int(flavor in row) for row in data['Flavor'].values] for flavor in flavors})
flavors_dummy_data.head()

Unnamed: 0,F.Pine,F.Pungent,F.Tropical,F.Sweet,F.Pear,F.Pineapple,F.Chestnut,F.Peach,F.Honey,F.Lime,...,F.Chemical,F.Orange,F.Rose,F.Earthy,F.Coffee,F.Tea,F.Lemon,F.Nutty,F.Citrus,F.Butter
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


### Dataset 2:

In [27]:
data1 = pd.read_csv('cannabis1.csv') # load 2nd dataset
data1.head()

Unnamed: 0,Strain,Type,CBD,THC,Daytime/Nighttime,Plant Length
0,Liberty-Haze,hybrid,0.00902,0.251632,DT,4.326288
1,Blueberry-Jack,hybrid,0.009004,0.237295,DT,4.434753
2,Og-Cheese,hybrid,0.009002,0.232326,DT,4.816522
3,Blueberry,indica,0.009993,0.214759,DT,6.964816
4,Kaboom,sativa,0.000542,0.266705,DT,12.68064


In [28]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28420 entries, 0 to 28419
Data columns (total 6 columns):
Strain               28420 non-null object
Type                 28420 non-null object
CBD                  28420 non-null float64
THC                  28420 non-null float64
Daytime/Nighttime    28420 non-null object
Plant Length         28420 non-null float64
dtypes: float64(3), object(3)
memory usage: 1.3+ MB


Ensure there are no duplicates.

In [29]:
data1 = data1[data1.duplicated() == False]

Now, we will group by cannabis strains and aggegrate "CBD" by average, "THC" by average, "Daytime/Nighttime" by mode, and "Plant Length" by average.

In [30]:
data1 = data1.groupby('Strain').agg({'CBD':'mean', 'THC':'mean', 'Daytime/Nighttime':lambda x: pd.Series.mode(x)[0], 'Plant Length':'mean'})
data1 = data1.reset_index()
data1.head()

Unnamed: 0,Strain,CBD,THC,Daytime/Nighttime,Plant Length
0,100-Og,0.009001,0.250741,DT,4.925945
1,1024,0.000508,0.288601,DT,12.266653
2,13-Dawgs,0.009002,0.245688,DT,4.755673
3,24K-Gold,0.008996,0.249672,DT,4.827362
4,3-Bears-Og,0.013001,0.212543,DT,4.613317


### Merge Data Frames:

Lastly, we will merge all data frames and export to csv.

In [31]:
merged_data = pd.concat([data[['Strain', 'Type', 'Rating', 'Description']], effects_dummy_data, flavors_dummy_data], axis=1)
merged_data = pd.merge(merged_data, data1)
cols = ['Strain', 'Type', 'Rating', 'CBD', 'THC', 'Plant Length', 'Daytime/Nighttime'] + list(merged_data.columns[4:-4]) + ['Description']
merged_data = merged_data[cols]
merged_data.head()


Unnamed: 0,Strain,Type,Rating,CBD,THC,Plant Length,Daytime/Nighttime,E.Talkative,E.Giggly,E.Tingly,...,F.Orange,F.Rose,F.Earthy,F.Coffee,F.Tea,F.Lemon,F.Nutty,F.Citrus,F.Butter,Description
0,100-Og,hybrid,4.0,0.009001,0.250741,4.925945,DT,0,0,1,...,0,0,1,0,0,0,0,1,0,$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,0.008998,0.245937,4.86069,DT,0,0,0,...,0,0,0,0,0,0,0,0,0,The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,0.000508,0.288601,12.266653,DT,0,0,0,...,0,0,0,0,0,0,0,0,0,1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,0.009002,0.245688,4.755673,DT,0,0,1,...,0,0,0,0,0,0,0,1,0,13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,0.008996,0.249672,4.827362,DT,1,0,0,...,1,0,1,0,0,0,0,1,0,"Also known as Kosher Tangie, 24k Gold is a 60%..."


Only uncomment to overwrite file in directory.

In [34]:
#merged_data.to_csv('cannabis_dummy.csv', index=False)
#effects_dummy_data.to_csv('effects_dummy.csv', index=False)
#flavors_dummy_data.to_csv('flavors_dummy.csv', index=False)