In [1]:
import os
from dotenv import load_dotenv

load_dotenv()  # Loads variables from .env into environment

True

In [2]:
client_id = os.getenv("REDDIT_CLIENT_ID")
client_secret = os.getenv("REDDIT_CLIENT_SECRET")
user_agent = os.getenv("REDDIT_USER_AGENT")
username = os.getenv("REDDIT_USERNAME")
password = os.getenv("REDDIT_PASSWORD")
api_key = os.getenv("OPENAI_API_KEY")

In [3]:
import praw

reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password
)

In [4]:
subreddit_name = "iitkgp"  # Replace with your target subreddit
subreddit = reddit.subreddit(subreddit_name)

# Get all flair templates for posts (link flairs)
flairs = []
for flair in subreddit.flair.link_templates:
    flairs.append(flair['text'])

print(flairs)

['Bakar :doge:', 'Funda :snoo_dealwithit:', 'Request :illuminati:', 'ModPost :snoo_facepalm:', 'Photography 📷', 'KGP News 📰', 'AskKGP 👀', 'ShitPost💩', 'Subject Reviews &#10002;&#65039;', 'Non-Vague JEE Questions']


In [5]:
subreddit = "iitkgp"  # Replace with your target subreddit
min_upvotes = 10  # Set your upvote threshold
target_flairs = ["Funda :snoo_dealwithit:", "KGP News 📰", "Subject Reviews &#10002;&#65039;", "AskKGP 👀"]  # desired flairs for reliable content
top_n = 10 #only top 10 comments will be chosen

posts_with_top_comments = []
for submission in reddit.subreddit(subreddit).hot(limit=500):
    if submission.is_self and submission.score >= min_upvotes:
        submission.comment_sort = "top"
        submission.comments.replace_more(limit=0)
        top_comments = []
        for i, comment in enumerate(submission.comments):
            if i >= top_n:
                break
            if hasattr(comment, "body"):
                top_comments.append(comment.body)
        posts_with_top_comments.append({
            "title": submission.title,
            "text": submission.selftext,
            "score": submission.score,
            "url": submission.url,
            "subreddit": subreddit,
            "top_comments": top_comments
        })


In [6]:
def format_post(post):
    comments_text = "\n".join(post["top_comments"])
    return (
        f"Title: {post['title']}\n\n"
        f"Body: {post['text']}\n\n"
        f"Top Comments:\n{comments_text}"
    )

In [7]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
import os

text_splitter = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-small"), min_chunk_size=512)

reddit_chunks = []
for post in posts_with_top_comments:
    combined_text = format_post(post)
    docs = text_splitter.create_documents([combined_text])
    for doc in docs:
        doc.metadata.update({
            "title": post["title"],
            "subreddit": post["subreddit"],
            "url": post["url"],
            "score": post["score"]
        })
        reddit_chunks.append(doc)

In [8]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter 

# List your markdown file paths
markdown_files = ["C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/Fresher's_guide.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/gymkhana_constitution.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/insti_brotherhood_fund.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/insti_email_address.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/insti_facilities.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/insti_formalities.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/1st_yr_PG_instruction/1st_yr_PG_instruction.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/1st_yr_RS_instruction/conversion_results/1st_yr_RS_instruction/1st_yr_RS_instruction.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/1st_yr_UG_instruction/conversion_results/1st_yr_UG_instruction/1st_yr_UG_instruction.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/AcademicCalendar2025-26/conversion_results/AcademicCalendar2025-26/AcademicCalendar2025-26.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/PaymentInstructions/conversion_results/PaymentInstructions/PaymentInstructions.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/rules_&_regulations/rules_&_regulations.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/clubs_societies.md",
                  "C:/Users/tripa/OneDrive/Desktop/iitkgp_docs/conversion_results/food_spots.markdown"]


# Define which headers to split on
headers_to_split_on = [
                       ("#", "Header 1"),
                       ("##", "Header 2"),
                       ("###", "Header 3"),
                    ]

# Initialize the splitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

#As we are loading data from multiple files, we need an all_chunks object containing data from each file
all_chunks = []

for file_path in markdown_files:
    with open(file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()


    # Split the markdown text into chunks (LangChain Document objects)
    chunks = markdown_splitter.split_text(markdown_text)
    # Further split each header chunk by character or token count
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=200)
    final_chunks = text_splitter.split_documents(chunks)
    for chunk in final_chunks:
        chunk.metadata["source_file"] = file_path
    all_chunks.extend(final_chunks)

In [9]:
len(all_chunks)

3949

In [10]:
len(reddit_chunks)

66

In [11]:
unique_contents = set()
unique_chunks = []

for doc in all_chunks:
    if doc.page_content not in unique_contents:
        unique_contents.add(doc.page_content)
        unique_chunks.append(doc)

In [12]:
unique_chunks.extend(reddit_chunks)

In [13]:
len(unique_chunks)

3647

In [14]:
from langchain_openai import OpenAIEmbeddings
import os

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Assuming `all_chunks` is your list of Document objects
# Embedding is handled automatically when adding to a vector store in LangChain

In [None]:
from langchain_chroma import Chroma

# Specify a directory for persistence (optional, but recommended)
persist_directory = "./chroma_db"

# Store your chunks in ChromaDB
vectorstore_2 = Chroma.from_documents(
    documents=unique_chunks,
    embedding=embeddings,
    persist_directory=persist_directory,  # Remove this line for in-memory only
    #collection_name="my_collection"       # Optional: name your collection
)

AttributeError: 'Chroma' object has no attribute 'persist'

In [16]:
query = "How to reach IIT KGP campus"
retrieved_docs = vectorstore_2.similarity_search(query, k=10)
for doc in retrieved_docs:
    print(doc.page_content)

#### **How to reach IIT KGP**
(placements, internships, FTs, competitions) from the students of the campus as well as alumni of the institute.  ===The Campus Tour=== A campus ride is being organized to acquaint freshers about the important places of the campus that a student needs to know. It begins from Technology Students' Gymkhana, IIT Kharagpur (TSG). In 2015, more than 1200 freshers in 28 separate buses were shown the magnificence of the KGP campus with engaging interactions of SWG team with freshers.  ===Information Brochure- A
| of IITKGP(if any)                                                                                                  |                                                                                                                                                                                                                                                                                                                   | correct information              

In [17]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0.1)

In [18]:
retriever = vectorstore_2.as_retriever(search_kwargs={"k": 10})  # Adjust k as needed

In [19]:
from langchain_core.prompts import PromptTemplate

prompt_template = """Use the following context to answer the question.
If you don't know the answer, say you don't know.
Context: {context}
Question: {question}
Answer:"""

QA_PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

In [20]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # "stuff" is the default; alternatives: "map_reduce", "refine", "map_rerank"
    chain_type_kwargs={"prompt": QA_PROMPT}
)

In [21]:
query = "how did CodeClub embarass IIT?"
result = qa_chain({"query": query})
print(result["result"])

  result = qa_chain({"query": query})


CodeClub embarrassed IIT Kharagpur by hosting a Codeforces contest that was poorly managed, leading to a ranklist filled with suspiciously high performances from typically low-rated participants. This raised allegations of cheating and unfair practices, as many participants who usually struggle with simpler problems were suddenly solving complex ones with ease. The situation was exacerbated by the absence of high-rated participants like Masters, Candidate Masters, or Grandmasters, which further questioned the contest's credibility. The event reflected poorly on the club's ability to maintain fairness and uphold the competitive programming standards expected from a prestigious institution like IIT Kharagpur.


In [22]:
result_original = llm.invoke(query)
print(result_original)

content="I'm not aware of any specific incident where CodeClub embarrassed IIT. It's possible that you might be referring to a local or specific event that hasn't been widely reported or documented in the sources available to me. CodeClub is generally known for promoting coding and computer science education among young students, while IITs (Indian Institutes of Technology) are prestigious engineering institutions in India. If you have more details or context about the incident, I might be able to provide more information or clarification." additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 94, 'prompt_tokens': 15, 'total_tokens': 109, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BpiiAklQCSfIfpBXRQe

In [23]:
query = "Who is the governor of TRAPs at IIT Kharagpur"
result_original = llm.invoke(query)
print(result_original)

content='As of my last update, the Technology Robotix Society (TRS) at IIT Kharagpur does not have a position specifically titled "governor of TRAPs." TRAPs, or Technology Robotix Society Annual Projects, are typically managed by the society\'s core team or coordinators. For the most current information, you may want to check the official website of the Technology Robotix Society or contact them directly through their official communication channels.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 92, 'prompt_tokens': 21, 'total_tokens': 113, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BpiiI03lpkLIpMiJtIGX4LrP5ylkg', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run-