In [46]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv

In [71]:
from IPython.display import Markdown

In [47]:
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): no API keys are passed explicitly anywhere in this notebook, so
# the Pinecone / Hugging Face credentials presumably come from here — confirm.
load_dotenv()

True

In [33]:
def load_pdf_files(data):
    """Load the PDF files of a directory through ``DirectoryLoader``.

    Args:
        data: Directory path handed to ``DirectoryLoader``; only entries
            matching the glob ``*.pdf`` are read, each via ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [34]:
# NOTE(review): absolute, machine-specific path — as written this cell only
# runs on the author's machine; consider a configurable data directory.
extracted_data = load_pdf_files('/Users/anshugangwar/Desktop/Anshu/upsc_sociology_book_chatbot/data/')

Ignoring wrong pointing object 99 0 (offset 0)
Ignoring wrong pointing object 100 0 (offset 0)
Ignoring wrong pointing object 101 0 (offset 0)
Ignoring wrong pointing object 102 0 (offset 0)
Ignoring wrong pointing object 103 0 (offset 0)
Ignoring wrong pointing object 104 0 (offset 0)
Ignoring wrong pointing object 105 0 (offset 0)
Ignoring wrong pointing object 107 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 141 0 (offset 0)
Ignoring wrong pointing object 142 0 (offset 0)
Ignoring wrong pointing object 143 0 (offset 0)
Ignoring wrong pointing object 144 0 (offset 0)
Ignoring wrong pointing object 145 0 (offset 0)
Ignoring wrong pointing object 146 0 (offset 0)
Ignoring wrong pointing object 148 0 (offset 0)
Ignoring wrong pointing object 175 0 (offset 0)
Ignoring wrong pointing object 176 0 (offset 0)
Ignoring wrong pointing object 177 0 (offset 0)
Ignoring wrong pointing object 178 0 (offset 0)
Ignoring wrong pointing object 179 0 (off

In [35]:
# Number of documents returned by the PDF loader.
len(extracted_data)

4153

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document])-> List[Document]:
    """Rebuild each document keeping only its text and its ``source`` metadata.

    Args:
        docs: Documents to reduce.

    Returns:
        A new list, in the same order, of ``Document`` objects whose metadata
        holds a single ``"source"`` key (``None`` when the input document had
        no such key). The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [37]:
# Drop all metadata except "source" before chunking.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [38]:
# Sanity check: filtering is one-to-one, so this should equal len(extracted_data).
len(minimal_docs)

4153

In [16]:
# split the documents

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            500, the value previously hard-coded here.
        chunk_overlap: Overlap between neighbouring chunks, measured the same
            way. Defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks returned by the splitter's ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(minimal_docs)

In [27]:
def sematic_text_split(minimal_docs, embeddings=None):
    """Split documents with ``SemanticChunker``.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``split_documents``.
        embeddings: Embedding model given to ``SemanticChunker``. When
            omitted, a default-configured ``HuggingFaceEmbeddings()`` is
            created, which is what this function previously always did.
            Passing an already-built model avoids constructing a new one on
            every call and lets the caller choose the model.

    Returns:
        The chunks returned by the chunker's ``split_documents``.
    """
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings()

    text_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type="standard_deviation",
        breakpoint_threshold_amount=1,
    )
    return text_splitter.split_documents(minimal_docs)

In [39]:
# Character-based chunking is the strategy actually used for the index below.
texts_chunk = text_split(minimal_docs)

In [40]:
# sem_text_split = sematic_text_split(minimal_docs)

In [41]:
# Number of chunks that will be embedded and uploaded.
len(texts_chunk)

28324

In [20]:
from langchain.embeddings import HuggingFaceEmbeddings

In [42]:

# NOTE(review): HuggingFaceEmbeddings is imported twice in this notebook (from
# langchain_huggingface in the first cell and from langchain.embeddings in the
# cell above); whichever import ran last is the class used here — confirm which
# one is intended and drop the other.
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    

In [52]:
# Smoke test: embed three short sentences to confirm the model loads and to
# inspect the vector width in the next cell.
vc = embedding.embed_documents([
    "Delhi is the capital of India.",
    "Beijing is the capital of China.",
    "Tokyo is the capital of Japan."
])

In [55]:
# Width of one embedding vector; the Pinecone index `dimension` must match this value.
len(vc[0])

384

In [None]:
# Display the embedding object's repr to inspect its configuration.
embedding

In [49]:
from pinecone import Pinecone, ServerlessSpec
# No api_key is passed. NOTE(review): presumably the client picks the key up
# from the environment populated by load_dotenv() — confirm the variable name.
pc = Pinecone()

In [50]:
index_name = "upsc-sociology-chatbot"
# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        # Must equal the embedding width: len(vc[0]) is 384 in the recorded output.
        dimension= 384,
        metric = 'cosine',
        spec= ServerlessSpec(cloud='aws', region='us-east-1') 
    )

In [51]:
# Handle to the index. NOTE(review): `index` is not referenced again in the
# cells visible here; the vector store below is built from `index_name` instead.
index = pc.Index(index_name)

In [57]:
from langchain_pinecone import PineconeVectorStore

In [58]:
# Embed every chunk with `embedding` and upload the vectors to the index.
# NOTE(review): re-running this cell presumably uploads all chunks again (the
# retrieved documents carry generated UUID ids), duplicating the index content
# — confirm before re-executing, or connect to the existing index instead.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [59]:
# Similarity-search retriever configured with k=3 (three chunks per query).
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs={"k":3})

In [60]:
# Sample query to sanity-check retrieval before wiring up the LLM.
retrieved_docs = retriever.invoke("Critically explain the salient features of ‘alienation’ as propounded by Karl Marx.")

In [61]:
# Inspect the retrieved chunks (ids, source path and text).
retrieved_docs

[Document(id='ad20adc3-68c3-4253-a2d3-f2c8309bf322', metadata={'source': '/Users/anshugangwar/Desktop/Anshu/upsc_sociology_book_chatbot/data/Sociology-Tusharanshu Part 1 (UPSCPDF.com).pdf'}, page_content='alienation’. \nIn his Economic and Philosophical Manuscripts (EPM)\n published in 1844, Marx analyses various \naspects of alienation. \n1) Firstly, the worker is alienated from the produc\nt of his labour. The product in which he expresses \nand realises himself does not belong to him. It is \nappropriated by the capitalists and sold on the \nmarket. With realisation of surplus-value capital g\nrows, and with capital the alien power which \ncontrols and dominates the life of the worker. The'),
 Document(id='b979cdc9-b191-49dc-af23-e58440431114', metadata={'source': '/Users/anshugangwar/Desktop/Anshu/upsc_sociology_book_chatbot/data/Sociology-Tusharanshu Part 1 (UPSCPDF.com).pdf'}, page_content='78 \n \nKARL MARX and ALIENATION \n \nAlienation literally means “separation from”.The co\

In [63]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Hosted text-generation endpoint for google/gemma-2-2b-it.
# NOTE(review): no token is passed here; presumably it is read from the
# environment populated by load_dotenv() — confirm.
llm1 = HuggingFaceEndpoint(
    repo_id="google/gemma-2-2b-it",
    task='text-generation'
)
# Wrap the endpoint so it can be used as a chat model in the chain below.
model = ChatHuggingFace(llm=llm1)

In [64]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [66]:
# System message template; {context} is the placeholder for the retrieved text.
# NOTE(review): the prompt text contains typos ("assisstant", "don't Know") and
# the source indentation of each continuation line is part of the string sent
# to the model — left unchanged here because it is runtime text.
system_prompt = (
    """You are an assisstant for question-answering tasks. Use the following pieces of retrieved context to 
    answer the question. If you don't know the answer, say that you don't Know. Use 200 words maximum and keep
    answer to the point.\n\n
    {context}""")

In [67]:
# Two-message chat template: the system message carries {context}, the human
# message carries the user's question as {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [68]:
# Answering step: the model is called with `prompt`, fed by the supplied documents.
question_answer_chain = create_stuff_documents_chain(model, prompt)
# Full pipeline: retrieve with `retriever`, then run the answering step.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [69]:
# Run the chain on a sample question; the reply text is read from the 'answer' key below.
response = rag_chain.invoke({"input": "Critically explain the salient features of alienation as propounded by Karl Marx."})

In [73]:
# Render the answer with Markdown formatting instead of showing the raw string.
Markdown(response['answer'])

Karl Marx's concept of alienation highlights the painful existence of workers within capitalist societies.  He viewed this alienation as an inherent aspect of modern capitalism, stemming from the disconnect between labor and the societal context within economic opportunity.

Here are the core features of Marx's concept of alienation:

* **Alienation from the Product of Labor:** Workers produce goods that are owned by someone else, depriving them of ownership over the fruits of their labor.
* **Alienation From the Process of Production:** The reality of production in question is an abstraction, meaning individuals lack a connection to what they produced. 
* **Alienation From the Laborer Himself:** He sees the worker to be estranged from their own capacity for valuation and conscientiousness when disconnecting from their inherent human capabilities.
* **Alienation from Others:**  The capitalist system compels individuals to work right away at the expense of healthy, meaningful interactions, displacing genuine connections inside of work and in life.


These cracks permeate the classical workings of the human experience, rendering individuals as mere ciphers of the machine in this relationship, and exists to justify a system of exploitation. However, he suggested that such alienation would be resolved in a communist society, through worker cooperatives and collective ownership of the means of production, ushering in a new era of authentic for humans to thrive. 

**Criticism:**  Marx' expression of alienation is, certainly, a powerful lens for understanding how capitalism reshapes the human spirit. Some criticize the notion that it is entirely deterministic. Others suggest that many aspects of a capitalist society do not typically create heightened anxieties and are, in fact, largely avoided. 


In [74]:
# Second sample question. Note this rebinds `response`, replacing the previous result.
response = rag_chain.invoke({"input": "Describe various characteristics of a social fact. How is rate of suicide a social fact according to Durkheim"})

In [75]:
# Render the answer with Markdown formatting instead of showing the raw string.
Markdown(response['answer'])

Emile Durkheim viewed social realities as encompassing established norms, beliefs, and behaviours that shape the phenomena of human interaction and society. Here's how they are characterized: 

**Characteristics of Social Facts:**

* **Consistently Occurring**: Social facts are not merely individual experiences but a collective phenomenon that unfolds across numerous individuals within a given society.
* **Irrational in Nature**: They do not imply personal desires or internal motivations; they represent facts understood through the statistical patterns of social aggregates. Social facts change gradually with societal structures and values, just like political systems, economic relations, or religious ideologies. 
* **External to Individuals**: Social facts impose upon individuals, influencing their actions and thoughts but it is the collective nature that brings them into existence.   

**Rate of Suicide and Social Fact Status:**

According to Durkheim, the rate of suicide is a social fact. This is because:

* **Statistical Pattern**: While motivations might vary, the  suicide rate is neither a simple individual matter nor a random occurrence. Statistics show a significant and recurring transmission of self-destructive behaviour that transcends individual psychology. 
* **Social Consequence**:  Durkheim points out that even though we have personal beliefs and perceptions regarding the morality of suicide, the increasing rates in certain societies act as a social intrusion. 
* **Integrative Potential**: Every society has a 'normal' (or often a 'high') rate of suicide, allowing social groups to adhere to binding norms and organizations within the social fact cornerstone.   


However it is important to note that Durkheim acknowledged that psychological factors likely contribute to suicidal behaviour; but a primary focus is the societal implications, illustrating the intricate balance between individual psychology, and the social context.  


In [76]:
import re

def clean_chatbot_output(text):
    """
    Cleans chatbot output by removing markdown emphasis and list markers.

    Hyphens that belong to the text itself (``question-answering``,
    ``1844-1850``, ``-5``) are preserved; a ``-``, ``*`` or ``•`` is treated
    as a list marker only when it opens a line and is followed by whitespace.

    Args:
        text (str): The raw text from the chatbot model.

    Returns:
        str: The cleaned, readable text.
    """
    # Drop list markers that open a line ("- item", "* item", "• item").
    # Requiring whitespace after the marker leaves "**bold**" and "-5" for the
    # later steps / untouched, and never touches a hyphen inside a word.
    text = re.sub(r'^[ \t]*[-*•][ \t]+', '', text, flags=re.MULTILINE)
    # Remove the remaining asterisks (bold/italic markers, inline "*" bullets)
    text = re.sub(r'\*', '', text)
    # Remove bullet glyphs that appear mid-line, with the whitespace after them
    text = re.sub(r'•\s*', '', text)
    # Collapse runs of blank lines into exactly one blank line between paragraphs
    text = re.sub(r'\n\s*\n', '\n\n', text.strip())
    # Remove any leading/trailing whitespace
    return text.strip()

# Example of how to use the function: a sample model reply that uses inline
# "* **Label:**" markdown, cleaned and printed as plain text.
raw_output = """
According to Durkheim, various characteristics define a social fact: * **Measurable:** Social facts, like births, deaths, and marriages, can be statistically measured. This means we can collect data to understand the trends and variations in them. * **Objective:** Socially relevant concepts, like suicide rates, exist independently of individual thoughts or desires. * **Universal:** They tend to exist and display consistent patterns across different societies, cultures, and periods of time. This means what's considered normal in one society might be different in another. * **Adaptive:** They are viewed as an organic organization that reflects how humans create and sustain their social life. Durkheim argues that suicide rates, even though it might be viewed as an "immoral" act, must be analyzed from a broader societal perspective. This is because suicide rates present, according to him, a form of collective anxiety and are reflective of of social integration and regulation. Essentially, a social fact is relatively autonomous of individual beliefs, and can inform us about larger social forces that shape human behavior.
"""

cleaned_output = clean_chatbot_output(raw_output)
print(cleaned_output)

According to Durkheim, various characteristics define a social fact:  Measurable: Social facts, like births, deaths, and marriages, can be statistically measured. This means we can collect data to understand the trends and variations in them.  Objective: Socially relevant concepts, like suicide rates, exist independently of individual thoughts or desires.  Universal: They tend to exist and display consistent patterns across different societies, cultures, and periods of time. This means what's considered normal in one society might be different in another.  Adaptive: They are viewed as an organic organization that reflects how humans create and sustain their social life. Durkheim argues that suicide rates, even though it might be viewed as an "immoral" act, must be analyzed from a broader societal perspective. This is because suicide rates present, according to him, a form of collective anxiety and are reflective of of social integration and regulation. Essentially, a social fact is rel