In [1]:
import numpy as np
import pandas as pd

import spacy
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg

import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import joblib

[38;5;2mâœ” Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


# Model 1 - Type of Strain

In [2]:
dfnlp = pd.read_csv('merged.csv')
dfnlp['Effects'] = dfnlp['Effects'].str.replace(',', ' ')
df = dfnlp[['Effects' ,'Type']]
df.head()

Unnamed: 0,Effects,Type
0,Creative Energetic Tingly Euphoric Relaxed,hybrid
1,Uplifted Happy Relaxed Energetic Creative,sativa
2,Tingly Creative Hungry Relaxed Uplifted,hybrid
3,Happy Relaxed Euphoric Uplifted Talkative,hybrid
4,Relaxed Euphoric Happy Uplifted Hungry,hybrid


In [3]:
nlp = en_core_web_lg.load()

In [4]:
import re

def tokenize(text):
    """Parses a string into a list of semantic units (words)

    Args:
        text (str): The string that the function will tokenize.

    Returns:
        list: tokens parsed out by the mechanics of your choice
    """
    
    tokens = re.sub('[^a-zA-Z 0-9]', ' ', text)
    tokens = tokens.lower().split()
    
    return tokens

df['base_tokens'] = df['Effects'].apply(tokenize)
df['base_tokens'][0:3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0    [creative, energetic, tingly, euphoric, relaxed]
1     [uplifted, happy, relaxed, energetic, creative]
2       [tingly, creative, hungry, relaxed, uplifted]
Name: base_tokens, dtype: object

In [5]:
doc = nlp(df['base_tokens'].to_string())
nlp_vector = doc.vector

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['Effects'], 
                                                    df['Type'], 
                                                    test_size=0.2, 
                                                    stratify=df['Type'],
                                                    random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1412,) (354,) (1412,) (354,)


In [7]:
def get_tokens(document):
    doc = nlp(document)
    return [token.lemma_ for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [8]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), tokenizer=get_tokens)
rfc = RandomForestClassifier(random_state=42)

In [9]:
# This is an alternative model where the target's classes are balanced

rfc2 = RandomForestClassifier(class_weight="balanced", random_state=42)

In [10]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

X = get_word_vectors(X_train)
len(X) == len(X_train)

True

In [11]:
XT = get_word_vectors(X_test)
len(XT) == len(X_test)

True

In [12]:
rfc.fit(X, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [13]:
rfc2.fit(X, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [14]:
# Unbalanced classes

# Evaluate on test data
y_pred = rfc.predict(XT)
accuracy_score(y_test, y_pred)

0.576271186440678

In [15]:
# Let's use a confusion matrix to check how accurately the model classified 
# the test set

# The order of the classes are:
# hybrid - indica - sativa

confusion_matrix(y_test, y_pred, labels=["hybrid", "indica", "sativa"])

array([[74, 41, 29],
       [35, 90,  4],
       [32,  9, 40]])

In [16]:
# Balanced classes

# Evaluate on test data
y_pred2 = rfc2.predict(XT)
print(accuracy_score(y_test, y_pred2))
print()
confusion_matrix(y_test, y_pred2, labels=["hybrid", "indica", "sativa"])

# The class weights aren't unbalanced enough to yield a positive difference 
# with the "class_weight" parameter

0.5677966101694916



array([[69, 41, 34],
       [33, 89,  7],
       [28, 10, 43]])

In [17]:
df['Type'].value_counts(normalize=True)

# Our majority class baseline is 40% and our accuracy surpasses that so we're
# in good shape

hybrid    0.406569
indica    0.365798
sativa    0.227633
Name: Type, dtype: float64

In [18]:
# Let's test some predictions

sample_request = ['Happy']
custom = get_word_vectors(sample_request)
rfc.predict(custom)

array(['hybrid'], dtype=object)

In [19]:
sample_request2 = ['Uplifted,Happy,Relaxed,Energetic,Creative']
custom2 = get_word_vectors(sample_request2)
rfc.predict(custom2)

array(['hybrid'], dtype=object)

In [20]:
# If you want to try out your own sample_request, here are all the unique
# values for feelings that the model will take. Additionally, words that
# are similar to the words listed here (such as 'Calm' instead of 'Relaxed')
# should yield similar results

from collections import Counter
word_counts = Counter()
df['base_tokens'].apply(lambda x: word_counts.update(x))
word_counts.most_common(20)

[('happy', 1466),
 ('relaxed', 1352),
 ('euphoric', 1273),
 ('uplifted', 1146),
 ('sleepy', 612),
 ('creative', 537),
 ('energetic', 476),
 ('focused', 446),
 ('hungry', 392),
 ('tingly', 255),
 ('talkative', 247),
 ('giggly', 211),
 ('aroused', 139),
 ('none', 37),
 ('dry', 1),
 ('mouth', 1)]

In [21]:
# Here are some observations for each specific class:

In [22]:
df.loc[df['Type'] == 'hybrid'][0:3]

Unnamed: 0,Effects,Type,base_tokens
0,Creative Energetic Tingly Euphoric Relaxed,hybrid,"[creative, energetic, tingly, euphoric, relaxed]"
2,Tingly Creative Hungry Relaxed Uplifted,hybrid,"[tingly, creative, hungry, relaxed, uplifted]"
3,Happy Relaxed Euphoric Uplifted Talkative,hybrid,"[happy, relaxed, euphoric, uplifted, talkative]"


In [23]:
df.loc[df['Type'] == 'indica'][0:3]

Unnamed: 0,Effects,Type,base_tokens
5,Relaxed Happy Euphoric Uplifted Giggly,indica,"[relaxed, happy, euphoric, uplifted, giggly]"
7,Relaxed Tingly Happy Euphoric Uplifted,indica,"[relaxed, tingly, happy, euphoric, uplifted]"
9,Relaxed Focused Euphoric Uplifted Aroused,indica,"[relaxed, focused, euphoric, uplifted, aroused]"


In [24]:
df.loc[df['Type'] == 'sativa'][0:3]

Unnamed: 0,Effects,Type,base_tokens
1,Uplifted Happy Relaxed Energetic Creative,sativa,"[uplifted, happy, relaxed, energetic, creative]"
6,Uplifted Focused Happy Talkative Relaxed,sativa,"[uplifted, focused, happy, talkative, relaxed]"
24,Happy Uplifted Euphoric Energetic Relaxed,sativa,"[happy, uplifted, euphoric, energetic, relaxed]"


In [25]:
sample_request3 = ['hyper, drowsy, ecstatic']
custom3 = get_word_vectors(sample_request3)
rfc.predict(custom3)

array(['hybrid'], dtype=object)

In [26]:
sample_request4 = ['this is a stress test of the predictive model']
custom4 = get_word_vectors(sample_request4)
rfc.predict(custom4)

array(['sativa'], dtype=object)

In [27]:
# Pickling the model

# joblib.dump(rfc, 'rfc_lg.joblib')

# Model 2 - Strain Recommendation

In [28]:
dfx = dfnlp[['Effects' ,'Strain']]
dfx.head()

Unnamed: 0,Effects,Strain
0,Creative Energetic Tingly Euphoric Relaxed,100-Og
1,Uplifted Happy Relaxed Energetic Creative,1024
2,Tingly Creative Hungry Relaxed Uplifted,13-Dawgs
3,Happy Relaxed Euphoric Uplifted Talkative,24K-Gold
4,Relaxed Euphoric Happy Uplifted Hungry,3-Kings


In [29]:
dfx['base_tokens'] = dfx['Effects'].apply(tokenize)

doc = nlp(dfx['base_tokens'].to_string())
nlp_vector = doc.vector

X_train2, X_test2, y_train2, y_test2 = train_test_split(dfx['Effects'], 
                                                    dfx['Strain'], 
                                                    # test_size=0.2, 
                                                    test_size=0.000001, 
                                                    # stratify=df['Strain'],
                                                    random_state=42)

XT = get_word_vectors(X_train2)
len(XT) == len(X_train2)

rfc_strains = RandomForestClassifier(random_state=42)

rfc_strains.fit(XT, y_train2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [30]:
sample_requestx1 = ['Happy']
customx1 = get_word_vectors(sample_requestx1)
rfc_strains.predict(customx1)

array(['Double-Tap'], dtype=object)

In [31]:
sample_requestx2 = ['hyper, drowsy, ecstatic']
customx2 = get_word_vectors(sample_requestx2)
rfc_strains.predict(customx2)[0:3]

array(['Purple-Swish'], dtype=object)

In [32]:
# Pickling the model
# joblib.dump(rfc, 'rfc_lg_strain.joblib')

# Code for App Implementation

In [33]:
# import spacy
# import spacy.cli
# spacy.cli.download("en_core_web_lg")
# import en_core_web_lg
# from joblib import load
# 
# nlp = en_core_web_lg.load()
# 
# def get_word_vectors(docs):
#     return [nlp(doc).vector for doc in docs]
# 
# def cann_pred(user_input):
#   request = [f'{user_input}']
#   custom = get_word_vectors(request)
#   output = rfc_lg.predict(custom)[0]
#   # output2 = rfc_lg_strains.predict(custom)[0] # [Stretch goal - 2nd model]
# 
#   if output == 'hybrid':
#     prob = rfc_lg.predict_proba(custom)[0][0]
#   
#   elif output == 'indica':
#     prob = rfc_lg.predict_proba(custom)[0][1]
# 
#   else:
#     prob = rfc_lg.predict_proba(custom)[0][2]
# 
#   return(f"We're {prob*100:.0f}% confident you should try the {output} strain!")
# 
#   # This is a stretch goal
#   # return(f"We're {prob*100:.0f}% confident you should try the {output} strain and {output2} fits your criteria the most!")