In [94]:
import pandas as pd
import openai
import praw
import os
import re

pd.set_option('max_colwidth', 100)
pd.set_option('display.max_columns', None)

In [95]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.indexes.vectorstore import VectorstoreIndexCreator
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI

In [96]:
REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT')

reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)

## Topic Retrieval

In [97]:
def generate_topics(query, model="gpt-3.5-turbo"):

    messages = [
        {"role": "user", "content": f"Take this query '{query}' and return a list of 10 simple to understand topics (3 words or less) to input in Search so it returns good results."},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response["choices"][0]["message"]["content"]

    topics = re.sub(r'^\d+\.\s*', '', response_message, flags=re.MULTILINE).split("\n")

    return topics

In [108]:
query = "Are we in a recession right now?"

In [109]:
topics = generate_topics(query)
topics = [topic.strip() for topic in topics]
topics = [topic[1:-1] if (topic.startswith('"') and topic.endswith('"')) or (topic.startswith("'") and topic.endswith("'")) else topic for topic in topics]
print(topics)

['Current economic status', 'Recession indicators', 'Unemployment rates', 'GDP growth rate', 'Consumer spending trends', 'Stock market performance', 'Federal Reserve actions', 'Economic stimulus packages', 'Business closures impact', 'Housing market trends']


## Relevant Subreddit Retrieval

In [110]:
posts = []
comments = []

for topic in topics:
    for post in reddit.subreddit("all").search(
    topic, limit=5):
        posts.append([post.id, post.title, post.subreddit, post.selftext])
        post.comments.replace_more(limit=1)

        for comment in post.comments.list():
            posts.append([post.id, post.title, post.subreddit, comment.body])

posts = pd.DataFrame(posts,columns=['source', 'title', 'subreddit', 'text'])

In [111]:
posts["subreddit"] = posts["subreddit"].apply(lambda x: x.display_name)


## Answering Query with Langchain

In [112]:
text = posts["text"].tolist()
text = " ".join(text)

In [113]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(text)

embeddings = OpenAIEmbeddings()

Created a chunk of size 1635, which is longer than the specified 1000
Created a chunk of size 1298, which is longer than the specified 1000
Created a chunk of size 1109, which is longer than the specified 1000
Created a chunk of size 2072, which is longer than the specified 1000
Created a chunk of size 1498, which is longer than the specified 1000
Created a chunk of size 1419, which is longer than the specified 1000
Created a chunk of size 1127, which is longer than the specified 1000
Created a chunk of size 1576, which is longer than the specified 1000
Created a chunk of size 1314, which is longer than the specified 1000
Created a chunk of size 2563, which is longer than the specified 1000
Created a chunk of size 1287, which is longer than the specified 1000
Created a chunk of size 1649, which is longer than the specified 1000
Created a chunk of size 1616, which is longer than the specified 1000
Created a chunk of size 1573, which is longer than the specified 1000
Created a chunk of s

In [114]:
docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{"source": str(i)} for i in range(len(texts))]).as_retriever()

Using embedded DuckDB without persistence: data will be transient


In [115]:
docs = docsearch.get_relevant_documents(query)

In [116]:
docs

[Document(page_content="A recession is always just around the corner, meanwhile the S&P has been making higher lows since October 2022, unemployed in Canada is 5.0%, and GDP hasn’t been negative for two quarters. It is likely to happen. And economists will have to work fast to figure out a solution. You need demand to sustain supply. And without jobs there isn't demand, nor tax revenue.\n\nSo short term: pain is possible.\n\nMedium term: we might benefit from doing less work and getting paid the same.", metadata={'source': '39'}),
 Document(page_content="Don't listen to people who say GDP is the only thing that matters for recessions.  The NBER's definition -- which hasn't changed -- requires economic downturn across the economy in a broad sense.  We probably can't have a recession and very low unemployment at the same time.  So if we have a recession, we will have layoffs, too. [deleted] Most layoffs I've seen headlines for are in tech. While tech is a huge sector, it is important to 

In [117]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
chain.run(input_documents=docs, question=query)

' It is likely that we are in a recession right now.\nSOURCES: 39, 13, 14, 179'