In [None]:
from semantic_search import SemanticSearch 
import pandas as pd
import tiktoken
import openai
import praw
import os
import re

# Use fully qualified option names: pandas resolves abbreviated keys by pattern
# matching, which can break if a similarly named option is introduced later.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)

In [None]:
# Shared SemanticSearch instance: later cells call .fit() on lists of text
# segments and then call the instance itself with the query.
searcher = SemanticSearch()

In [None]:
# Reddit API credentials come from the environment (each is None when unset).
REDDIT_CLIENT_ID = os.getenv('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.getenv('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = os.getenv('REDDIT_USER_AGENT')

# PRAW client used by the retrieval cells further down.
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT,
)

## Topic Retrieval

In [None]:
def generate_topics(query, model="gpt-3.5-turbo"):
    """Ask the chat model to turn ``query`` into a list of short search topics.

    Args:
        query: The user's natural-language question.
        model: Name of the OpenAI chat model to call.

    Returns:
        A list of non-empty topic strings, one per line of the model's reply,
        with leading list markers ("1.", "2)", "- ", "* ", "• ") and
        surrounding whitespace removed.
    """
    messages = [
        {"role": "user", "content": f"Take this query '{query}' and return a list of short topics to input in Search so it returns good results. Each topic must stand on its own with respect to the relation of the question."},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages
    )

    response_message = response["choices"][0]["message"]["content"]

    # The reply is free-form text: it may be numbered ("1.", "2)"), bulleted,
    # and may contain blank lines. Strip the markers and skip empty lines so
    # callers never receive an empty topic.
    list_marker = re.compile(r'^\s*(?:\d+[.)]\s*|[-*•]\s+)')
    topics = []
    for line in response_message.splitlines():
        topic = list_marker.sub('', line).strip()
        if topic:
            topics.append(topic)

    return topics

In [None]:
# The user question; it drives topic generation, retrieval and the final answer.
query = "Where are some nice places where I can work remotely in Malta?"

In [None]:
# Generate search topics for the question. Strip whitespace and drop blank
# entries (e.g. from empty lines in the model reply) so that no empty search
# string is sent to Reddit later on.
topics = [topic.strip() for topic in generate_topics(query) if topic.strip()]
print(topics)

## Relevant Subreddits Retrieval

In [None]:
# Search all of Reddit once per topic and collect one row per returned post.
post_rows = [
    [post.title, post.subreddit, post.selftext]
    for topic in topics
    for post in reddit.subreddit("all").search(topic, limit=200)
]

posts = pd.DataFrame(post_rows, columns=['title', 'subreddit', 'text'])

# A segment is the post title followed by the string form of its subreddit,
# so the subreddit is always the last whitespace-separated token. The post
# body ('text') is kept in the frame but is not part of the segment.
subreddit_names = posts['subreddit'].astype(str)
segments = (posts['title'] + ' ' + subreddit_names).tolist()

In [None]:
# Fit the searcher on the "title subreddit" segments built in the previous cell.
searcher.fit(segments, n_neighbors=5)

In [None]:
# TODO: Add distance check here
# Results are expected to be the fitted "title subreddit" segments, so the
# last whitespace-separated token of each one is taken as the subreddit.
subreddit_names = {result.split()[-1] for result in searcher(query)}

# Join the unique names with "+" into a single string.
subreddits = "+".join(subreddit_names)

print(f"Relevant subreddits: {subreddits}")

## Relevant Posts Retrieval

In [None]:
segments = []
segment_length = 100  # number of words per comment segment
seen_post_ids = set()

for topic in topics:
    for post in reddit.subreddit(subreddits).search(topic, limit=50):
        # The same post can match several topics; handle it only once so its
        # comments are not fetched again and no duplicate segments are added.
        if post.id in seen_post_ids:
            continue
        seen_post_ids.add(post.id)

        # Expand a limited number of "more comments" placeholders, then
        # flatten the comment tree.
        post.comments.replace_more(limit=3)
        bodies = [
            comment.body
            for comment in post.comments.list()
            # Skip placeholder bodies left behind by deleted/removed comments.
            if comment.body not in ("[deleted]", "[removed]")
        ]
        words = "\n".join(bodies).split()

        # Every segment starts with a "title id" line followed by up to
        # segment_length words of comment text. Posts without usable comment
        # text contribute no segments.
        header = post.title + " " + post.id + "\n"
        segments.extend(
            header + ' '.join(words[i:i + segment_length])
            for i in range(0, len(words), segment_length)
        )

In [None]:
# Fit the searcher again, now on the comment segments built in the previous cell.
searcher.fit(segments, n_neighbors=5)

## Answering the Query

In [None]:
def num_tokens(text, model):
    """Return the number of tokens ``text`` occupies for ``model``.

    Args:
        text: The string to tokenize.
        model: OpenAI model name used to select the tiktoken encoding.

    Returns:
        The token count as an int.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an
        # encoding; fall back to cl100k_base instead of failing.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

In [None]:
def form_query(query, model, token_budget):
    """Build the prompt for ``query`` from the segments the searcher returns.

    Segments are appended to a fixed introduction, in the order the searcher
    returned them, until adding the next one (together with the question)
    would exceed ``token_budget``; the question is appended last.

    Args:
        query: The user's question.
        model: Model name passed to ``num_tokens`` for token counting.
        token_budget: Maximum number of tokens allowed for the whole prompt.

    Returns:
        A ``(message, evidence)`` tuple: ``message`` is the full prompt and
        ``evidence`` lists the first line of every included segment,
        de-duplicated and in the order the segments were included.
    """
    relevant_segments = searcher(query)

    introduction = 'Use the below segments from multiple Reddit posts to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." Cite each sentence using the [postid] notation found at the start of each segment. Every sentence MUST have a citation!\n\n'
    question = f"\n\nQuestion: {query}"

    message = introduction
    evidence = []

    for segment in relevant_segments:
        # Stop at the first segment that would push the prompt over budget.
        if num_tokens(message + segment + question, model=model) > token_budget:
            break
        message += segment + "\n\n"
        evidence.append(segment.split("\n")[0])

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # evidence list is reproducible (a set would give an arbitrary order).
    evidence = list(dict.fromkeys(evidence))

    return message + question, evidence

In [None]:
def generate_answer(query, model, token_budget, temperature):
    """Answer ``query`` with the chat model using the prompt from ``form_query``.

    The prompt is built by ``form_query(query, model, token_budget)``, echoed
    to stdout, and sent to the model as a single user message.

    Args:
        query: The user's question.
        model: Name of the OpenAI chat model to call.
        token_budget: Token budget forwarded to ``form_query``.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        A ``(answer, evidence)`` tuple: the model's reply text and the
        evidence list returned by ``form_query``.
    """
    prompt, evidence = form_query(query, model, token_budget)

    # Echo the exact prompt so it can be inspected next to the answer.
    print(prompt)

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )

    return completion["choices"][0]["message"]["content"], evidence

In [None]:
# Answer the question with a 1000-token prompt budget at temperature 0.
answer, evidence = generate_answer(
    query,
    model="gpt-3.5-turbo",
    token_budget=1000,
    temperature=0,
)

In [None]:
# Display the question that was asked.
query

In [None]:
# Display the model's answer.
answer