In [41]:
from tqdm import tqdm
import pandas as pd
import datetime
import openai
import praw
import os
import re

pd.set_option('max_colwidth', 100)
pd.set_option('display.max_columns', None)

In [30]:
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
from langchain.embeddings import TensorflowHubEmbeddings

In [31]:
REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT')

reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_CLIENT_SECRET, user_agent=REDDIT_USER_AGENT)

## Query

In [32]:
query = '''
I got laid off last week. How should I go about finding a new job?
'''

## Topic Retrieval

In [33]:
def generate_topics(query, model="gpt-3.5-turbo"):

    messages = [
        {"role": "user", "content": f"Take this query '{query}' and return a list of 10 simple to understand topics (4 words or less) to input in Search so it returns good results."},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )

    response_message = response["choices"][0]["message"]["content"]

    topics = re.sub(r'^\d+\.\s*', '', response_message, flags=re.MULTILINE).split("\n")

    # Post-processing GPT output

    topics = [topic.strip() for topic in topics]

    topics = [topic[1:-1] if (topic.startswith('"') and topic.endswith('"')) or (topic.startswith("'") and topic.endswith("'")) else topic for topic in topics]

    topics = [re.sub(r'[^a-zA-Z0-9\s]', ' ', topic) for topic in topics]

    return topics

In [36]:
topics = generate_topics(query)

topics

['Job search tips',
 'Resume writing advice',
 'Networking strategies',
 'Interview preparation tips',
 'Online job boards',
 'Career counseling services',
 'Job fairs near me',
 'Freelance opportunities',
 'Remote work options',
 'Industry specific job listings']

## Relevant Comments Retrieval

In [42]:
def get_relevant_subreddits(topics):
    comments = []

    for topic in tqdm(topics):
        for post in reddit.subreddit("all").search(
        topic, limit=10):
            
            post.comment_limit = 20
            post.comment_sort = "top"

            # Top level comments only
            post.comments.replace_more(limit=0)

            for comment in post.comments:
                author = comment.author.name if comment.author else '[deleted]'
                comments.append([post.id, comment.id, post.subreddit.display_name, post.title, author, comment.body, datetime.datetime.fromtimestamp(comment.created).strftime('%Y-%m')])

    comments = pd.DataFrame(comments,columns=['source', 'comment_id', 'subreddit', 'title', 'author', 'text', 'date'])

    # Drop empty texts or ["deleted"] texts
    comments = comments[comments['text'].str.len() > 0]
    comments = comments[comments['text'] != "[deleted]"]

    # Drop comments with None authors
    comments = comments[comments['author'] != "AutoModerator"]

    # Drop duplicate ids
    comments = comments.drop_duplicates(subset=['source'])

    return comments

In [43]:
comments = get_relevant_subreddits(topics)

100%|██████████| 10/10 [00:40<00:00,  4.05s/it]


In [44]:
comments

Unnamed: 0,source,comment_id,subreddit,title,author,text,date
0,106m8e3,j3hcif8,funny,"Job search tips, update your profile.",saltinstiens_monster,"""Assistant Emperor?""\n\n""Assistant *to the* Emperor.""",2023-01-08 17:15:27
9,10q4pf6,j6nz560,recruitinghell,“Job Search Tips From A Recruiter” thoughts?,[deleted],"1. I work in tech. 99% of the time, tech recruiters don't know anything about the skills the tea...",2023-01-31 19:00:15
19,134wnu5,jih9zzp,blackladies,Does anyone have any good job search tips?,Carolinablue87,I recommend keeping an up to date LinkedIn profile that indicates you're actively searching for ...,2023-05-01 22:52:38
23,12h0uya,jfmy0a1,physicaltherapy,Job search tips,tunaman4u2,"Indeed yes. Sell yourself and don’t take the first offer, be prepared to negotiate. Companies wi...",2023-04-10 02:18:46
28,11uekx5,jcnvfnx,respiratorytherapy,Job search tips,Crass_Cameron,"Just apply, covid has made super easy for RTs to get jobs",2023-03-18 06:02:52
...,...,...,...,...,...,...,...
251,12d4zd4,jf4w9av,Birmingham,Bartender moving from outta state. Does bham restaurant industry use any specific online job sou...,minorujco,"Croux app used to find some gigs in Birmingham before covid, but haven't used it since. I heard ...",2023-04-06 04:13:35
260,yn7bze,iv7ekcc,biotech,Best listings to find a job in biotech/pharma industry?,tomatotornado420,Linked/indeed. Early career process engineer.,2022-11-05 22:50:07
269,13bhzdm,jjbacyo,singapore,Industries must show support for local training for jobs to be added to shortage occupation list...,worldcitizensg,"Sorry.. Why ? If industry want more ""EP"" then more incentive not to train the locals so the job ...",2023-05-08 10:34:15
273,138m3nb,jiypppa,dataisbeautiful,"[OC] Analyzing 15,963 Job Listings to Uncover the Top Skills for Data Analysts (update)",restore_democracy,Excel and SQL? We haven’t come far in 30 years.,2023-05-05 16:02:36


## Answering Query with Langchain

In [15]:
def construct_retriever(comments, k=10):

    # Convert comments dataframe to a dictionary
    comments = comments.to_dict('records')

    # Convert comments["text"] to a list of strings
    texts = [comment["title"] + " " + comment["text"] for comment in comments]

    db = Chroma.from_texts(texts, TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder/4"), metadatas=[{"source": comment["source"], "comment_id": comment["comment_id"], "author": comment["author"]} for comment in comments])

    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    return retriever

In [16]:
retriever = construct_retriever(comments)



In [17]:
def construct_qa(retriever):
    qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, return_source_documents=True)
    return qa

In [18]:
chat_history = []
qa = construct_qa(retriever)

In [19]:
result = qa({"question": query, "chat_history": chat_history})

In [20]:
result["answer"]

"\nFinding a new job can be a daunting task, especially in the current climate. My best advice is to start by updating your resume and LinkedIn profile to make sure they are up to date and reflect your current skills and experience. You should also reach out to your network of contacts to let them know you are looking for a new job and ask if they know of any opportunities. Additionally, you should look into job search websites such as Indeed and Craigslist, as well as staffing agencies that specialize in your field. Finally, don't forget to take advantage of any career counseling services that may be available to you. Good luck!"

In [27]:
result["source_documents"]

[Document(page_content="Would a staffing agency be able to provide me with stable income, even if any given employer doesn't decide to hire me after the temp period? It's entirely possible to still go without a job while with a staffing agency. Just depends on what they have available.", metadata={'source': '12vixm8', 'comment_id': 'jhc0br3', 'author': 'whotiesyourshoes'}),
 Document(page_content='Request: job search tips WAITING FOR THE RIGHT JOB WITH DECENT PAY > QUICK JOB', metadata={'source': '122mdcc', 'comment_id': 'jdv57nv', 'author': 'notenoughbeds'}),
 Document(page_content='Where to look for jobs? Online job boards? LinkedIn\n\nIndeed\n\nIf you need a... beginners job. Craigslist has a work section', metadata={'source': 'uj35l8', 'comment_id': 'i7gbuat', 'author': 'No-Statement-3019'}),
 Document(page_content='Job search tips in Canada You’re selling yourself way too cheap. Look for a senior position and then people will want to hire you more too', metadata={'source': '1156zs

In [22]:
result["chat_history"]

[]