In [83]:
from tqdm import tqdm
import pandas as pd
import openai
import praw
import os
import re

pd.set_option('max_colwidth', 100)
pd.set_option('display.max_columns', None)

In [84]:
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
from langchain.embeddings import TensorflowHubEmbeddings

In [85]:
REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT')

reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)

## Query

In [86]:
query = '''
I got laid off last week. How should I go about finding a new job?
'''

## Topic Retrieval

In [87]:
def generate_topics(query, model="gpt-3.5-turbo"):

    messages = [
        {"role": "user", "content": f"Take this query '{query}' and return a list of 10 simple to understand topics (4 words or less) to input in Search so it returns good results."},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response["choices"][0]["message"]["content"]

    topics = re.sub(r'^\d+\.\s*', '', response_message, flags=re.MULTILINE).split("\n")

    # Post-processing GPT output

    topics = [topic.strip() for topic in topics]

    topics = [topic[1:-1] if (topic.startswith('"') and topic.endswith('"')) or (topic.startswith("'") and topic.endswith("'")) else topic for topic in topics]

    topics = [re.sub(r'[^a-zA-Z0-9\s]', ' ', topic) for topic in topics]

    return topics

In [88]:
topics = generate_topics(query)

topics

['Job search tips',
 'Resume writing advice',
 'Networking strategies',
 'Interview preparation tips',
 'Online job boards',
 'Career counseling services',
 'Job fairs near me',
 'Freelance opportunities',
 'Remote work options',
 'Industry specific job listings']

## Relevant Comments Retrieval

In [89]:
def get_relevant_subreddits(topics):
    comments = []

    for topic in tqdm(topics):
        for post in reddit.subreddit("all").search(
        topic, limit=10):
            
            post.comment_limit = 20
            post.comment_sort = "top"

            # Top level comments only
            post.comments.replace_more(limit=0)

            for comment in post.comments:
                author = comment.author.name if comment.author else '[deleted]'
                comments.append([post.id, comment.id, post.subreddit.display_name, post.title, author, comment.body])

    comments = pd.DataFrame(comments,columns=['source', 'comment_id', 'subreddit', 'title', 'author', 'text'])

    # Drop empty texts or ["deleted"] texts
    comments = comments[comments['text'].str.len() > 0]
    comments = comments[comments['text'] != "[deleted]"]

    # Drop comments with None authors
    comments = comments[comments['author'] != "AutoModerator"]

    # Drop duplicate ids
    comments = comments.drop_duplicates(subset=['source'])

    return comments

In [90]:
comments = get_relevant_subreddits(topics)

100%|██████████| 10/10 [00:41<00:00,  4.13s/it]


## Answering Query with Langchain

In [91]:
def construct_retriever(comments, k=10):

    # Convert comments dataframe to a dictionary
    comments = comments.to_dict('records')

    # Convert comments["text"] to a list of strings
    texts = [comment["title"] + " " + comment["text"] for comment in comments]

    db = Chroma.from_texts(texts, TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder/4"), metadatas=[{"source": comment["source"], "comment_id": comment["comment_id"], "author": comment["author"]} for comment in comments])

    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    return retriever

In [92]:
retriever = construct_retriever(comments)



In [93]:
def construct_qa(retriever):
    qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, return_source_documents=True)
    return qa

In [95]:
chat_history = []
qa = construct_qa(retriever)

In [96]:
result = qa({"question": query, "chat_history": chat_history})

In [97]:
result["answer"]

" Start by updating your resume and LinkedIn profile to reflect your current skills and experience. Then, look for job postings on job boards like Indeed and LinkedIn, as well as industry-specific job listings. You can also reach out to people you know professionally, such as former colleagues or mentors, to see if they know of any job opportunities. Finally, consider using a staffing agency to help you find a job that's a great fit. Good luck!"

In [98]:
result["source_documents"]

[Document(page_content='Request: job search tips WAITING FOR THE RIGHT JOB WITH DECENT PAY > QUICK JOB', metadata={'source': '122mdcc', 'comment_id': 'jdv57nv', 'author': 'notenoughbeds'}),
 Document(page_content='Where to look for jobs? Online job boards? LinkedIn\n\nIndeed\n\nIf you need a... beginners job. Craigslist has a work section', metadata={'source': 'uj35l8', 'comment_id': 'i7gbuat', 'author': 'No-Statement-3019'}),
 Document(page_content='Job search tips in Canada You’re selling yourself way too cheap. Look for a senior position and then people will want to hire you more too', metadata={'source': '1156zsa', 'comment_id': 'j91mo9r', 'author': 'pxpxy'}),
 Document(page_content='Job search tips Just apply, covid has made super easy for RTs to get jobs', metadata={'source': '11uekx5', 'comment_id': 'jcnvfnx', 'author': 'Crass_Cameron'}),
 Document(page_content='Looking for freelance opportunities >\tWhere would be the best place to look?\n\nPeople you’ve worked with professiona