In [1]:
from dotenv import load_dotenv
from elasticsearch import Elasticsearch, ConnectionError

es_client = Elasticsearch('http://localhost:9200')


# Load environment variables from the .env file
load_dotenv()

True

In [3]:
import os
import json

import numpy as np
import pandas as pd

current_file_dir = os.getcwd()
root_data_dir = os.path.join(os.path.dirname(current_file_dir), 'data')
app_files_dir = os.path.join(root_data_dir, 'pipelines-data')

print(os.listdir(app_files_dir))

def get_pytorch_model(models_dir, model_name='multi-qa-distilbert-cos-v1'):
  from sentence_transformers import SentenceTransformer

  model_path = os.path.join(models_dir, model_name)

  if not os.path.exists(model_path):
      print('huggingface model loading...')
      embedder = SentenceTransformer(model_name)
      embedder.save(model_path)
  else:
      print('pretrained model loading...')
      embedder = SentenceTransformer(model_name_or_path=model_path)
  print('model loadind done')

  return embedder

class VectorSearchEngine:
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [{'score': scores[i], 'doc': self.documents[i]} for i in idx]


models_dir = os.path.join(app_files_dir, 'models')

index_file_path = os.path.join(models_dir, 'embeds_index.json')
embeds_file_path = os.path.join(models_dir, 'embeds.npy')
with open(index_file_path, 'r') as f:
    index = json.load(f)
embeds = np.load(embeds_file_path)
print(embeds.shape, len(index))

search_engine = VectorSearchEngine(documents=index, embeddings=embeds)


['content_queries.json', 'golden_dataset.json', 'content_reviews_green_bro.json', 'models', 'api_db.csv', 'content_green_bro.csv', 'content_db_df.csv', 'ground_truth.json']
(291, 768) 291


In [4]:
models_dir = os.path.join(root_data_dir, 'models')

embedder = get_pytorch_model(models_dir)
print(embedder)

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



pretrained model loading...
model loadind done
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


Zincsearch

In [15]:
import os
import requests
import json

import yaml


ZINCSEARCH_URL = "http://localhost:4080"
USERNAME='admin'
PASSWORD='admin'

def create_index(index_config):
    index_name = 'greenbro-content'
    url = f"{ZINCSEARCH_URL}/api/index"
    headers = {"Content-Type": "application/json"}
    payload = {
        "name": index_name,
        "storage_type": "disk",  # or 'memory' if you want in-memory storage
        "body": index_config
    }
    response = requests.post(url, headers=headers, data=json.dumps(payload), auth=(USERNAME, PASSWORD))
    if response.status_code == 200:
        print(f"Index '{index_name}' created successfully.")
    else:
        print(f"Failed to create index: {response.status_code}, {response.text}")


current_file_dir = os.getcwd()
root_dir = os.path.dirname(current_file_dir)
config_path = os.path.join(root_dir, 'assets', 'data_config.yml')
config = {}
with open(config_path, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
index_config = config['elastic_index_settings']
index_config.pop('settings')
create_index(index_config)

Failed to create index: 400, {"error":"index [greenbro-content] already exists"}


In [25]:
import pandas as pd
from typing import List

def load_bulk_documents(documents):
    index_name = 'greenbro-content'
    url = f"{ZINCSEARCH_URL}/api/_bulk"
    headers = {"Content-Type": "application/x-ndjson"}
    
    bulk_data = ""
    for doc in documents:
        meta_data = {"index": {"_index": index_name}}
        bulk_data += json.dumps(meta_data) + "\n"
        bulk_data += json.dumps(doc) + "\n"

    response = requests.post(url, headers=headers, data=bulk_data, auth=(USERNAME, PASSWORD))
    if response.status_code == 200:
        print("Bulk documents loaded successfully.")
    else:
        print(f"Failed to load bulk documents: {response.status_code}, {response.text}")


def read_csv_as_dicts(root_dir) -> List:
    result_filename = os.path.join(root_dir, 'data', 'pipelines-data', config['content_file_name'])
    df = pd.read_csv(result_filename)
    df['category'] = 'flower'
    csv_entries = df.to_dict(orient='records')
    print('Num entries: %d' % len(csv_entries))
    return csv_entries

def search_documents(query, limit = 10):
    index_name = 'greenbro-content'
    url = f"{ZINCSEARCH_URL}/api/{index_name}/_search"
    headers = {"Content-Type": "application/json"}
    search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["relief", "positive_effects", "flavours"],
                            "type": "best_fields"
                        }
                    },
                    "filter": {  # TODO: add filter as parameter
                        "term": {
                            "category": "flower"
                        }
                    }
                }
            }
        }

    response = requests.post(url, headers=headers, data=json.dumps(search_query), auth=(USERNAME, PASSWORD))
    if response.status_code == 200:
        return response.json()['hits']['hits'][:limit]
    else:
        print(f"Search failed: {response.status_code}, {response.text}")

def pretty(search_results):
    result_docs = []

    include_fields = ['title', 'tags', 'relief', 'positive_effects', 'flavours']
    for hit in search_results:
        result_docs.append({k: v for k, v in  hit['_source'].items() if k in include_fields})
    return result_docs

#index_entries = read_csv_as_dicts(root_dir)
#load_bulk_documents(index_entries)
res = search_documents('Lemon, headache')
print(pretty(res))

[{'flavours': 'citrus', 'positive_effects': 'euphoria happy uplifting', 'relief': 'chronic pain depression insomnia stress', 'tags': 'Sativa Dominant|80% Sativa/20% Indica', 'title': 'King Tut'}, {'flavours': 'candy citrus diesel fruity sour sweet', 'positive_effects': 'aroused calming creative happy hungry relaxing', 'relief': 'cramps depression fatigue pms ptsd stress', 'tags': 'Indica Dominant|70% Indica/30% Sativa', 'title': 'Permanent Marker'}, {'flavours': 'citrus orange spicy sweet tangy', 'positive_effects': 'creative euphoria focus happy', 'relief': 'anxiety chronic pain depression fatigue loss of appetite migraines nausea pms ptsd stress', 'tags': 'Sativa Dominant|80% Sativa/20% Indica', 'title': 'Orange Crush'}, {'flavours': 'coffee herbal sweet woody', 'positive_effects': 'body high hungry relaxing tingly uplifting', 'relief': 'chronic pain depression fatigue nausea', 'tags': 'Hybrid,|50%/50%', 'title': 'Kush Mint'}, {'flavours': 'berry fruity herbal spicy', 'positive_effec

In [23]:
def reciprocal_rank_fusion(ranked_lists, k=60):
    """
    Implements Reciprocal Rank Fusion (RRF).
    
    Args:
        ranked_lists (list of lists): A list of ranked lists, where each sublist contains document IDs in rank order.
        k (int): The constant used in the RRF score calculation. Default is 60.
    
    Returns:
        dict: A dictionary where keys are document IDs and values are their RRF scores.
    """
    scores = {}

    # Iterate through each ranked list
    for ranked_list in ranked_lists:
        for rank, doc_id in enumerate(ranked_list):
            rrf_score = 1 / (k + rank + 1)
            if doc_id in scores:
                scores[doc_id] += rrf_score
            else:
                scores[doc_id] = rrf_score
    
    # Sort documents by their accumulated RRF score in descending order
    sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
    
    return sorted_scores

{'total': {'value': 291},
 'max_score': 1,
 'hits': [{'_index': 'greenbro-content',
   '_type': '_doc',
   '_id': '2bcCn0tkcon',
   '_score': 1,
   '@timestamp': '2024-09-01T15:10:14.619757056Z',
   '_source': {'@timestamp': '2024-09-01T15:10:14.619757056Z',
    'avg_rating': 4.5,
    'category': 'flower',
    'flavours': 'citrus orange spicy sweet tangy',
    'item_name': 'orange-crush',
    'num_ratings': 36,
    'positive_effects': 'creative euphoria focus happy',
    'relief': 'anxiety chronic pain depression fatigue loss of appetite migraines nausea pms ptsd stress',
    'tags': 'Sativa Dominant|80% Sativa/20% Indica',
    'title': 'Orange Crush'}},
  {'_index': 'greenbro-content',
   '_type': '_doc',
   '_id': '2bcCn0zhM7F',
   '_score': 1,
   '@timestamp': '2024-09-01T15:10:14.640516864Z',
   '_source': {'@timestamp': '2024-09-01T15:10:14.640516864Z',
    'avg_rating': 4.9,
    'category': 'flower',
    'flavours': 'candy citrus diesel fruity sour sweet',
    'item_name': 'perma

# Keyword search

In [244]:
def elastic_search(es_client, query, limit=3, filter=None, random=False):
    index_name = 'greenbro-content'
    if random:
        search_query = {
          "size": 10,
          "query": {
            "function_score": {
              "functions": [
                {
                  "random_score": {}
                }
              ]
            }
          }
        }
    else:
        search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["relief", "positive_effects", "flavours"],
                            "type": "best_fields"
                        }
                    },
                    "filter": {  # TODO: add filter as parameter
                        "term": {
                            "category": "flower"
                        }
                    }
                }
            }
        }
    response = es_client.search(index=index_name, body=search_query)
    
    return response['hits']['hits'][:limit]

def pretty(search_results):
    result_docs = []

    include_fields = ['title', 'tags', 'relief', 'positive_effects', 'flavours']
    for hit in search_results:
        result_docs.append({k: v for k, v in  hit['_source'].items() if k in include_fields})
    return result_docs

try:
    query = 'Lemon, headache'
    
    res = elastic_search(es_client, query, random=True)
    sample = res[0]['_source']
    # print(sample.keys())
    print()
    print(pretty(res))
except ConnectionError:
    print('Elastic unreachable')


[{'title': 'OG 18', 'tags': 'Indica Dominant|70% Indica/30% Sativa', 'relief': 'arthritis chronic pain depression fibromyalgia headaches inflammation insomnia migraines', 'positive_effects': 'euphoria happy', 'flavours': 'diesel pine sweet'}, {'title': 'Peanut Butter Breath', 'tags': 'Hybrid,|50%/50%', 'relief': 'chronic pain depression loss of appetite nausea stress', 'positive_effects': 'body high hungry relaxing uplifting', 'flavours': 'herbal pine woody'}, {'title': 'Purple Sunset', 'tags': 'Indica Dominant|60% Indica/40% Sativa', 'relief': 'add/adhd anxiety depression stress', 'positive_effects': 'calming creative hungry relaxing uplifting', 'flavours': 'citrus fruity orange sour spicy sweet'}]


# Hybrid search

In [None]:
import requests

def get_embed(query):
    url = "http://localhost:8000/embed"
    data = {"text": query}
    response = requests.post(url, json=data)
    if response.status_code == 200:
        res = np.array(json.loads(response.json()['embed']))
    else:
        print("Error:", response.status_code, response.text)
        raise RuntimeError
    return res

def elastic_hybrid_search(es_client, query, limit=3, filter=None, random=False):
    index_name = 'greenbro-content'
    vector_search_term = get_embed(query)
    knn_query = {
        "field": "text_vector",
        "query_vector": vector_search_term,
        "k": 5,
        "num_candidates": 100
    }
    
    response = es_client.search(
        index=index_name,
        query={
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["relief", "positive_effects", "flavours"],
                            "type": "best_fields"
                        }
                    },
                    "filter": {  # TODO: add filter as parameter
                        "term": {
                            "category": "flower"
                        }
                    }
                }
        },
        knn=knn_query,
        size=5
    )
    return response['hits']['hits'][:limit]


try:
    query = 'Lemon, headache'
    
    res = elastic_hybrid_search(es_client, query, random=True)
    sample = res[0]['_source']
    #print(sample.keys())
    #print()
    print(pretty(res))
except ConnectionError:
    print('Elastic unreachable')

In [251]:
[{'score': i['_score'], 'res': i['_source']['item_name']} for i in res]

[{'score': 0.7898849, 'res': 'lemon-tree'},
 {'score': 0.77863276, 'res': 'golden-lemons'},
 {'score': 0.7349596, 'res': 'lemonhead'}]

In [None]:


[{'score': scores[i], 'doc': self.documents[i]} for i in idx]

# Prepare data

In [211]:
import os
import pandas as pd

current_file_dir = os.getcwd()
root_data_dir = os.path.join(os.path.dirname(current_file_dir), 'data')
app_files_dir = os.path.join(root_data_dir, 'pipelines-data')

print(os.listdir(app_files_dir))

res_csv_path = os.path.join(app_files_dir, 'content_green_bro.csv')

content_db = pd.read_csv(res_csv_path)

['content_queries.json', 'golden_dataset.json', 'content_reviews_green_bro.json', 'models', 'api_db.csv', 'content_green_bro.csv', 'content_db_df.csv', 'ground_truth.json']


In [4]:
content_db.head()

Unnamed: 0,title,item_name,tags,relief,positive_effects,flavours,avg_rating,num_ratings
0,Casey Jones,casey-jones,Sativa Dominant|80% Sativa/20% Indica,add/adhd arthritis bipolar disorder chronic pa...,creative euphoria focus happy,citrus fruity lemon sweet,4.6,38
1,Gorilla Glue #4,gorilla-glue-4,Indica Dominant|60% Indica/40% Sativa,add/adhd bipolar disorder chronic pain depress...,body high euphoria happy uplifting,chemical chocolate coffee diesel pine sweet,4.6,339
2,Cherry Pie,cherry-pie,Indica Dominant|80% Indica/20% Sativa,add/adhd anxiety bipolar disorder chronic pain...,creative euphoria happy relaxing,berry cherry sweet,4.5,110
3,Amnesia Haze,amnesia-haze,Sativa Dominant|80% Sativa/20% Indica,add/adhd anxiety cancer chronic pain depressio...,creative euphoria giggly happy,citrus lemon sweet,4.5,55
4,Blueberry,blueberry,Indica Dominant|80% Indica/20% Sativa,add/adhd bipolar disorder depression insomnia ...,euphoria happy sleepy,berry blueberry sweet woody,4.5,95


Data for service

In [189]:
def add_url(postfix):
    url = os.path.join('https://www.allbud.com' + content_db_full.iloc[0]['link'])

content_db_full = pd.read_csv(os.path.join(app_files_dir, 'content_db_df.csv'))
content_db_full['url'] = content_db_full['link'].apply(lambda x: 'https://www.allbud.com' + x)
content_db = content_db_full[['url', 'item_name']].merge(content_db, on='item_name')

content_db.to_csv(os.path.join(app_files_dir, 'api_db.csv'), index=False)

In [192]:
def get_candidates(content_names_list):
    res = [
        {
            'title': row['title'], 'url': row['url'],
            'explanation': f"{row['tags']}: {row['positive_effects']}",
            'flavours': row['flavours']
        }
        for _, row in content_db[content_db['item_name'].isin(content_names_list)].iterrows()
    ]
    return res

content_names_list = ['blueberry', 'amnesia-haze', 'cherry-pie']

candidates = get_candidates(content_names_list)

candidates

[{'title': 'Cherry Pie',
  'url': 'https://www.allbud.com/marijuana-strains/indica-dominant-hybrid/cherry-pie',
  'explanation': 'Indica Dominant|80% Indica/20% Sativa: creative euphoria happy relaxing',
  'flavours': 'berry cherry sweet'},
 {'title': 'Amnesia Haze',
  'url': 'https://www.allbud.com/marijuana-strains/sativa-dominant-hybrid/amnesia-haze',
  'explanation': 'Sativa Dominant|80% Sativa/20% Indica: creative euphoria giggly happy',
  'flavours': 'citrus lemon sweet'},
 {'title': 'Blueberry',
  'url': 'https://www.allbud.com/marijuana-strains/indica-dominant-hybrid/blueberry',
  'explanation': 'Indica Dominant|80% Indica/20% Sativa: euphoria happy sleepy',
  'flavours': 'berry blueberry sweet woody'}]

In [204]:
os.listdir(app_files_dir)

['content_queries.json',
 'golden_dataset.json',
 'content_reviews_green_bro.json',
 'models',
 'api_db.csv',
 'content_green_bro.csv',
 'content_db_df.csv',
 'ground_truth.json']

In [217]:
models_dir = os.path.join(root_data_dir, 'pipelines-data', 'models')
index_file_path = os.path.join(models_dir, 'embeds_index.json')
embeds_file_path = os.path.join(models_dir, 'embeds.npy')
with open(index_file_path, 'r') as f:
    index = json.load(f)
embeds = np.load(embeds_file_path)
db = {}
for i, embed in enumerate(embeds):
    db[index[i]] = embed
print(db[index[-1]])

In [26]:
class VectorDB:
    def __init__(self, index, embeddings):
        self.db = {}
        for i, embed in enumerate(embeddings):
            self.db[index[i]] = embed

    def get_item_vector(self, item_name):
        return self.db[item_name]


def get_vector_db(root_dir):
    models_dir = os.path.join(root_data_dir, 'pipelines-data', 'models')
    index_file_path = os.path.join(models_dir, 'embeds_index.json')
    embeds_file_path = os.path.join(models_dir, 'embeds.npy')
    with open(index_file_path, 'r') as f:
        index = json.load(f)
    embeds = np.load(embeds_file_path)
    print(embeds.shape, len(index))
    vector_db = VectorDB(index=index, embeddings=embeds)
    return vector_db

vector_db = get_vector_db(current_file_dir)
# vector_db.get_item_vector('gorilla-glue-4')

NameError: name 'root_data_dir' is not defined

In [221]:
vector_db.db

{}

In [209]:
os.listdir('/Users/username/PycharmProjects/leaf-bro/jupyter_notebooks/data/pipelines-data/models')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/username/PycharmProjects/leaf-bro/jupyter_notebooks/data/pipelines-data/models'

In [201]:
import json
import datetime
import hashlib
import random

import numpy as np
import backoff
import openai
from openai import OpenAI


client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"]
)


@backoff.on_exception(backoff.expo, openai.APIError)
@backoff.on_exception(backoff.expo, openai.RateLimitError)
@backoff.on_exception(backoff.expo,openai.Timeout)
@backoff.on_exception(backoff.expo, RuntimeError)
def gpt_query(gpt_params, verbose: bool = False, avoid_fuckup: bool = False) -> dict:
    print('connecting OpenAI...')
    if verbose:
        print(gpt_params["messages"][1]["content"])
    response = client.chat.completions.create(
        **gpt_params
    )
    gpt_response = response.choices[0].message.content
    if avoid_fuckup:
        if '[' in gpt_response or '?' in gpt_response or '{' in gpt_response:
            raise RuntimeError
    res = {'recs': gpt_response}
    res.update({'prompt_tokens': response.usage.completion_tokens, 'prompt_tokens': response.usage.prompt_tokens, 'total_tokens': response.usage.total_tokens})
    seed_phrase = f'{str(datetime.datetime.now().timestamp())}{gpt_response}'
    generation_id = str(hashlib.md5(seed_phrase.encode('utf-8')).hexdigest())[:12]
    res.update({'id': generation_id})
    return res

def random_shuffle(input_list):
    """
    my_list = ['apple', 'banana', 'cherry', 'date', 'fig']
    random_shuffle(my_list)
    """
    res = random.shuffle(input_list)
    return ', '.join(input_list)

def promt_generation(candidates):
  # TODO: use jinja2
  flavours_list = ['sweet', 'citrus', 'fruity', 'spicy', 'berry', 'pine', 'herbal', 'sour', 'lemon', 'woody', 'grape', 'tropical', 'diesel', 'skunky', 'nutty', 'blueberry', 'vanilla', 'creamy', 'candy', 'cherry']
  reliefs_list = ['depression', 'chronic pain', 'stress', 'insomnia', 'anxiety', 'fatigue', 'nausea', 'migraines', 'muscle spasms', 'headaches', 'ptsd', 'loss of appetite', 'add/adhd', 'inflammation', 'arthritis', 'bipolar disorder', 'cramps', 'pms', 'fibromyalgia', 'gastrointestinal disorder']
  promt = f"""
      generate 5 search query based on strain description below. All queries should be 3-5 words long. Do not include strain name.
      Do not add word "strain".
      Base query on flavours (not connstrained with this list): {random_shuffle(flavours_list)}
      reliefs: {random_shuffle(reliefs_list)}
      effects (body and mind)
      {candidates}
      Queries:
  """
  return promt

def promt_generation_2(candidates, query):
  promt = f"""
      Below you can find items with description in format `title: description`
      {candidates}
      Rerank items and return reranked item ids base on user query. Return only reranked items, comma-separate
      Do not add any explanation, just result
      User query: {query}
      expected result: [title, title, title]
      reranked:
  """
  return promt

def generate(gpt_prompt, verbose=False):
    gpt_params = {
        'model': 'gpt-3.5-turbo',
        'max_tokens': 500,
        'temperature': 0.7,
        'top_p': 0.5,
        'frequency_penalty': 0.5,
    }
    # gpt_promt = promt_generation(gen_candadates(db, ids))
    if verbose:
        print(gpt_promt)
    messages = [
        {
          "role": "system",
          "content": "You are a helpful assistant for medicine shopping",
        },
        {
          "role": "user",
          "content": gpt_prompt,
        },
    ]
    gpt_params.update({'messages': messages})
    res = gpt_query(gpt_params, verbose=False)
    return res


def load_reviews(reviews_file_path):
    with open(reviews_file_path, 'r') as f:
        reviews_dict = json.load(f)
    return reviews_dict

def get_item_reviews(item_name, reviews_dict):
    reviews = [i['review'].replace('\n', ' ') for i in reviews_dict if i['item_name'] == item_name]
    return reviews

def aggregate_reviews(content_df, reviews_dict):
    res = {}
    for name in content_df['item_name'].unique():
      res[name] = {'reviews': ''.join(get_item_reviews(name, reviews_dict))}
    return res

def test_generation(reviews_agregated, random_strain):
    prompt = promt_generation(reviews_agregated[random_strain]['reviews'])
    generated_result = generate(prompt)
    print(generated_result['recs'])
    print()
    reviews_agregated[random_strain]['reviews']

def test_reranking(candidates, query):
    [f"{i['title']}: {i['explanation']}, {i['flavours']}" for i in candidates]
    prompt = promt_generation_2(candidates, query)
    generated_result = generate(prompt)
    print(generated_result['recs'])

# reviews_dict = load_reviews(os.path.join(app_files_dir, 'content_reviews_green_bro.json'))
# reviews_agregated = aggregate_reviews(content_db, reviews_dict)
# # print(reviews_agregated['fire-og']['reviews'])

# random_strain = np.random.choice(list(reviews_agregated.keys()))
# test_generation(reviews_agregated, random_strain)

test_reranking(candidates, 'avoid cherry')

connecting OpenAI...
Blueberry, Amnesia Haze


In [203]:
a = []
a[0]

IndexError: list index out of range

Search index

In [71]:
key = list(reviews_agregated.keys())[0]
print(key)
reviews_agregated[key]

casey-jones


{'reviews': " First off, an elevation of my mood, a little bit of rejuvenation to my body, and even my thoughts, I noticed quickly while smoking CJ. I was feeling a little sluggish before I smoked this strain. No more. The taste has some earthiness to it, but mostly that sweet citrus taste. If you're fond of sativas I would recommend trying CJ. Smoke this pot.  I got an eighth of Casey Jones and it kicks some serious backside. I felt very refreshed taking a puff of this light green, fuzzy bud and got notes of tangerine, citrus and even a bit of watermelon. I highly recommend Casey Jones as a daytime strain to inspire creativity and socialization.  I didn't have the pleasure of growing her, but I bought a half oz of 'b' buds and squished it. What a sweet strain. Sweet as in sugar sweet. It came out of the press as foam mostly it was so sugary. Then I burned it 1 time and it turned Into beautiful thick amber plastic. For a low-level thc strain, I was very satisfied. Ty  This strain is ph

Generate ground trurh

In [66]:
from IPython.display import clear_output

def generate_and_save_ground_truth(input_dict, output_filename):
    for i in input_dict:
      reviews = input_dict[i]['reviews']
      prompt = promt_generation(reviews)
      input_dict[i]['queries'] = generate(prompt)
    with open(output_filename, 'w') as f:
      json.dump(input_dict, f)
    print('Data generated')
    return input_dict

clear_output()
output_filename = os.path.join(app_files_dir,'content_queries.json')
user_queries_dict = generate_and_save_ground_truth(reviews_agregated, output_filename)
user_queries_dict[random_strain]

{'reviews': " My first time with the Chapo, was a multi-high experience the flavor, omg…. The throat on a few hits reminded me of the drainage you’d get from decent quality “coke”, very sweet head high, spurts of clear thought, energy, then my favorite “Couch Lock”, and the world for now ceases.  Smoked a pre-roll of el chapo OG and fucking hell my eyes got sooo heavy and got baked asf!  I have a chronic disease this indica strain is a cure all fo me and I’m a long time smoker!!  I really enjoy this strains ability to reduce pain.  El Chapo OG Kush (Face/Off OG x SFV OG x OG Kush) by Boss Status Genetics/Cali Buds.Average Cannabinoid Content: THC: 10-23%CBD: 0.00-0.50%Terpene Profile:Myrcene, Limonene, Pinene, beta-Caryophyllene Medicinal Benefits include:pain relief gastrointestinal issues nausea relief insomnia relief relaxation appetite stimulationdepression relief  Superb! The best strain I've had in 2019 except that Presidential Kush. Grade A!  El Chapo OG (AKA: Shorty) is an Indi

In [None]:

output_file_path = os.path.join(app_files_dir,'ground_truth.json')

In [68]:
#queries_dict = load_reviews(output_filename)
random_strain = np.random.choice(list(reviews_agregated.keys()))

print(queries_dict[random_strain]['queries']['recs'])

1. Fruity pain relief options
2. Energizing headache remedy
3. Relaxing stress relief choices
4. Uplifting mood support picks
5. Cerebral anxiety solutions


In [54]:
import re

def clean_text(text):
    cleaned_text = re.sub(r'^\d+\.\s*', '', text)
    return cleaned_text

def prepare_ground_truth(queries_dict, data_path):
    ground_truth = []
    
    for k in queries_dict:
        queries = queries_dict[k]['queries']['recs'].split('\n')
        ground_truth += [{'answer': k, 'query': clean_text(q)} for q in queries]
    print('Num generates: %d' % len(ground_truth))
    with open(data_path, 'w') as f:
      json.dump(ground_truth, f)
    print('data saved to %s' % data_path)
    return ground_truth
output_file_path = os.path.join(app_files_dir,'ground_truth.json')
ground_truth = prepare_ground_truth(queries_dict, output_file_path)

Num generates: 1455
data saved to /Users/username/PycharmProjects/leaf-bro/data/pipelines-data/ground_truth.json


In [56]:
import numpy as np

random_id = np.random.randint(0, len(ground_truth))
print(ground_truth[random_id])

queries_dict[ground_truth[random_id]['answer']]['reviews']

{'answer': 'gorilla-cookies', 'query': 'Citrus herbal pain relief'}


" Great high that is good for a night downtown that lasts for hours  This one hits me more like an Indica than a Sativa despite it being Sativa leaning. I get the heavy couch-locking feeling of the Gorilla Glue with the euphoric aspect (mentally and physically) from the GSC, along with that nice head buzz. While remaining clear-headed, I do find myself struggling to keep my eyes open after a good few dabs.  Awesome strain I was bouncing around getting things done Happy and care free Pretty dumb to be honest but didnt care  After consuming gorilla cookies thc oil in a vape device, numerous times, as a medical & recently recreational user, gorilla cookies is an extremely high sativa dominant smoke that in my case gives me plenty of nervous energy but no focus to get things done! Slight couch lock that's easily overcome for most, I personally didn't like the restless energy & feelings of anxiety gorilla cookies always produces. Your experience may differ!I didn't feel any other benefits s

# Project

find a strain and motivate user to buy it

In [76]:
def prepare_catalog(reviews_agregated):
    index = []
    corpus = []
    for key, content in reviews_agregated.items():
        index.append(key)
        corpus.append(content['reviews'])
    return index, corpus
index, corpus = prepare_catalog(reviews_agregated)
print(len(index), len(corpus))

291 291


(291, 768) 291


In [27]:
def get_pytorch_model(models_dir, model_name='multi-qa-distilbert-cos-v1'):
  from sentence_transformers import SentenceTransformer

  model_path = os.path.join(models_dir, model_name)

  if not os.path.exists(model_path):
      print('huggingface model loading...')
      embedder = SentenceTransformer(model_name)
      embedder.save(model_path)
  else:
      print('pretrained model loading...')
      embedder = SentenceTransformer(model_name_or_path=model_path)
  print('model loadind done')

  return embedder

embedder = get_pytorch_model(models_dir)
embedder

NameError: name 'models_dir' is not defined

NameError: name 'app_files_dir' is not defined

In [91]:
user_question = 'citrus relaxation'

v = embedder.encode(user_question)

search_engine.search(v, num_results=5)

[{'score': 0.4901401, 'doc': 'lemon-tree'},
 {'score': 0.47702503, 'doc': 'golden-lemons'},
 {'score': 0.47451043, 'doc': 'hardcore-og'},
 {'score': 0.45465428, 'doc': 'orange-crush'},
 {'score': 0.4432276, 'doc': 'skywalker-og'}]

In [129]:
random_numbers = np.random.choice(np.arange(0, len(ground_truth) - 1), size=100, replace=False)
res = []
for random_id in random_numbers:
    origin = ground_truth[random_id]['answer']
    query = ground_truth[random_id]['query']
    v = embedder.encode(query)
    res.append( 1 if sum(origin==i['doc'] for i in search_engine.search(v, num_results=30)) > 0 else 0)

print(sum(res), len(res))

39 100


In [131]:
[ground_truth[i] for i in random_numbers[np.where(res==1)]]


  [ground_truth[i] for i in random_numbers[np.where(res==1)]]


[]

In [166]:
def eval_hitrate(golden_dataset):
    res = []
    for entry in golden_dataset:
        v = embedder.encode(entry['query'])
        res.append( 1 if sum(entry['origin']==i['doc'] for i in search_engine.search(v, num_results=30)) > 0 else 0)
    print('Hit rate: %.3f, num entries: %d' % (sum(res)/len(res), len(res)))

def generate_golden_set():
    items_list = list(set([i['answer'] for i in ground_truth]))
    random.shuffle(items_list)
    random.shuffle(ground_truth)
    result = []
    test_queries = []
    for i in items_list[:100]:
        result.append(i)
        test_queries.append([j['query'] for j in ground_truth if j['answer']==i][0])
    golden_dataset = [{'origin': origin, 'query': query} for origin, query in zip(result, test_queries)]
    return golden_dataset

data_path = os.path.join(app_files_dir, 'golden_dataset.json')
golden_dataset = [{'origin': origin, 'query': query} for origin, query in zip(result, test_queries)]
with open(data_path, 'w') as f:
  json.dump(golden_dataset, f)

In [160]:
[i['answer'] for i in np.array(ground_truth)[np.where(np.array(res)==1)]]

['alaskan-thunder-fuck',
 'key-lime-pie',
 'la-confidential',
 'grape-pie',
 'monster-cookies',
 'white-og',
 'purple-punch',
 'island-sweet-skunk',
 'gobbstopper',
 'gelato-cake',
 'white-widow',
 'chocolate-diesel',
 'lavender-jones',
 'sour-berry',
 '101-headband',
 'super-boof',
 'sfv-og',
 'cherry-pie',
 '101-headband',
 'chocolate-diesel',
 'kosher-kush',
 'granddaddy-purple',
 'master-kush',
 'zoap',
 'papaya-cake',
 'tahoe-og',
 'sour-tangie',
 'lemon-skunk',
 'slurricane',
 'zkittlez-cake']

In [163]:
# queries_dict['skywalker-og']['reviews']