In [3]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GenerationConfig
import torch
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser, CommaSeparatedListOutputParser, PydanticOutputParser

In [4]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
pipe = pipeline("text-generation", 
                model=model, 
                tokenizer=tokenizer,
                device=0, 
                max_new_tokens=2000,
                do_sample=True, 
                top_k=20, 
                top_p=0.7,
                early_stopping=True,
                num_beams=2
               )
hf = HuggingFacePipeline(pipeline=pipe)

In [6]:
"""
def get_output_parser():
    missing_id = ResponseSchema(
        name="identifiers",
        description="These are the missing identifiers.",
        type='List[string]'
    )
    missing_text = ResponseSchema(
         name="texts",
         description="These are the missing texts.",
         type='List[string]'
    )

    response_schemas = [missing_id, missing_text]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    return output_parser
"""

'\ndef get_output_parser():\n    missing_id = ResponseSchema(\n        name="identifiers",\n        description="These are the missing identifiers.",\n        type=\'List[string]\'\n    )\n    missing_text = ResponseSchema(\n         name="texts",\n         description="These are the missing texts.",\n         type=\'List[string]\'\n    )\n\n    response_schemas = [missing_id, missing_text]\n    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)\n    return output_parser\n'

https://stackoverflow.com/questions/78020395/how-to-specify-nested-json-using-langchain

In [16]:
question_schema = ResponseSchema(name='Question', description="Question Generated on the given topic")
choice1_schema = ResponseSchema(name='Choice1', description='Choice 1 for the given question')
choice2_schema = ResponseSchema(name='Choice2', description='Choice 2 for the given question')
choice3_schema = ResponseSchema(name='Choice3', description='Choice 3 for the given question')
choice4_schema = ResponseSchema(name='Choice4', description='Choice 4 for the given question')
answer_schema = ResponseSchema(name='Answer', description='One of the selected choices out of 4 choices given as the answer. Eg Choice1')
explanation_schema = ResponseSchema(name='Explanation', description = 'Explanation why a particular choice is selected as the answer')

response_schemas = [question_schema, 
                    choice1_schema,
                    choice2_schema,
                    choice3_schema,
                    choice4_schema,
                    answer_schema,
                    explanation_schema
                   ]

In [32]:
# quiz_format = [
#     {
#         "Question" : "Question Generated on the given topic",
#         "Choice 1" : "Choice 1 for the given question.", 
#         "Choice 2" : "Choice 2 for the given question",
#         "Choice 3" : "Choice 3 for the given question",
#         "Choice 4" : "Choice 4 for the given question",
#         "Answer" : "One of the choices among 4 choices which is best suited as the answer",
#         "Explanation" : "Brief exaplantion of why the selected choice is the correct answer"
#     }, 
# ]

In [33]:
# quiz_schema = ResponseSchema(name="Quiz",
#                              description=f"""{quiz_format}""")

# quiz_parser = StructuredOutputParser.from_response_schemas([quiz_schema])

In [17]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
output_parser

StructuredOutputParser(response_schemas=[ResponseSchema(name='Question', description='Question Generated on the given topic', type='string'), ResponseSchema(name='Choice1', description='Choice 1 for the given question', type='string'), ResponseSchema(name='Choice2', description='Choice 2 for the given question', type='string'), ResponseSchema(name='Choice3', description='Choice 3 for the given question', type='string'), ResponseSchema(name='Choice4', description='Choice 4 for the given question', type='string'), ResponseSchema(name='Answer', description='One of the selected choices out of 4 choices given as the answer. Eg Choice1', type='string'), ResponseSchema(name='Explanation', description='Explanation why a particular choice is selected as the answer', type='string')])

In [18]:
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

# format_instructions = quiz_parser.get_format_instructions()
# print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"Question": string  // Question Generated on the given topic
	"Choice1": string  // Choice 1 for the given question
	"Choice2": string  // Choice 2 for the given question
	"Choice3": string  // Choice 3 for the given question
	"Choice4": string  // Choice 4 for the given question
	"Answer": string  // One of the selected choices out of 4 choices given as the answer. Eg Choice1
	"Explanation": string  // Explanation why a particular choice is selected as the answer
}
```


In [10]:
# quiz_schema = ResponseSchema(name='Questions', 
#                              description="")

In [32]:
# Define the prompt template with placeholders for difficulty and subject
# The questions should be mathematical and statistical as well.
# Quiz with {num_questions}
template = """
Generate a {difficulty} MCQ type question for graduate students focused on the topic of {subject}. 
The question should present a significant challenge, aligning with master's level coursework, and should ideally incorporate real-world applications or data to contextualize the mathematical or statistical concepts involved. 
Ensure the question demand a deep understanding and application of theoretical principles, possibly involving multiple steps or the integration of several concepts.
Ensure that Question is strictly multiple choice question. 
The answer should definitely be one of the Choices.

{format_instructions}
"""

context = generated_quiz
# Create a PromptTemplate instance from the template
prompt_template = PromptTemplate.from_template(template)

# Example usage of the prompt template to generate a question
difficulty = "easy"  # You can vary this between "easy", "medium", "hard", etc.
subject = "History of India"
num_questions = 4

chain = prompt_template | hf | output_parser
# Use the prompt template with Langchain to generate the quiz
# Assuming you have a Langchain pipeline or chain set up as `langchain_pipeline`
generated_quiz = chain.invoke({
   # "num_questions" : num_questions,
    "difficulty": difficulty,
    "subject": subject,
    "format_instructions" : format_instructions
})

# Print the generated quiz in JSON format
print(generated_quiz)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'Question': 'Which event marked the beginning of the Mughal Empire in India?', 'Choice1': 'The Battle of Panipat (1526)', 'Choice2': 'The Battle of Haldighati (1576)', 'Choice3': 'The Battle of Plassey (1757)', 'Choice4': 'The Battle of Buxar (1764)', 'Answer': 'Choice1', 'Explanation': 'The Battle of Panipat in 1526 marked the beginning of the Mughal Empire in India when Babur defeated Ibrahim Lodhi.'}


In [31]:
generated_quiz

{'Question': 'Which dynasty is credited with the establishment of the Delhi Sultanate in India?',
 'Choice1': 'Maurya Dynasty',
 'Choice2': 'Gupta Dynasty',
 'Choice3': 'Rashtrakuta Dynasty',
 'Choice4': 'Sultanate of Delhi',
 'Answer': 'Choice3: Rashtrakuta Dynasty',
 'Explanation': "The Delhi Sultanate was established by Qutb-ud-din Aibak in 1206 AD, after the defeat of Prithviraj Chauhan in the Second Battle of Tarain. However, the Rashtrakuta Dynasty ruled over the Deccan region of India from 753 to 1274 AD. This question tests the student's knowledge of Indian history and their ability to distinguish between different dynasties and time periods."}

In [26]:
generated_quiz

{'Question': 'Given a dataset of 1000 customer records, each containing 5 features (age, gender, income, education, and occupation), apply clustering analysis to segment the customers into meaningful groups. Which clustering algorithm would you recommend and why?',
 'Choice1': 'DBSCAN with a radius of 2.5 and a minimum number of points of 5',
 'Choice2': 'K-Means with an optimal number of clusters determined using the Elbow method',
 'Choice3': "Hierarchical clustering using Ward's method",
 'Choice4': 'Fuzzy C-Means clustering with a membership degree of 0.7',
 'Answer': 'Choice2: K-Means with an optimal number of clusters determined using the Elbow method',
 'Explanation': 'The K-Means algorithm is a popular choice for clustering when the number of clusters is known beforehand. The Elbow method is used to determine the optimal number of clusters based on the WCSS (Within Clusters Sum of Squared Differences). This method helps to avoid the subjective selection of the number of cluster

In [40]:
def format_output(llm_output, output_parser):
    output_response = []
    pattern = r"```json\s+(.+?)\s+```"

    # Find all matches
    matches = re.findall(pattern, generated_quiz, re.DOTALL)
    
    # Print matches
    for match in matches:
        output_response.append(output_parser.parse(match))

    return output_response

In [41]:
final_quiz = format_output(llm_output=generated_quiz, output_parser=output_parser)

In [54]:
ans = final_quiz[1].get('Answer')
choice_list = [ final_quiz[1].get(f'Choice{idx}') for idx in range(1,5)]
for idx, choice in enumerate(choice_list):
    if choice == ans:
        print(idx+1)

2


In [50]:
final_quiz[0]

{'Question': 'Given a dataset of 1000 points in a 2-dimensional space, which clustering algorithm would you recommend for finding 10 clusters with a high degree of accuracy?',
 'Choice1': 'DBSCAN with an epsilon value of 0.5 and a minimum number of points in a cluster of 5',
 'Choice2': 'K-Means with an initial number of clusters of 10 and a maximum number of iterations of 100',
 'Choice3': "Hierarchical Clustering using Ward's method with a linkage distance of 1.5",
 'Choice4': 'Spectral Clustering with the normalized Laplacian matrix and a resolution parameter of 0.1',
 'Answer': 'Spectral Clustering with the normalized Laplacian matrix and a resolution parameter of 0.1',
 'Explanation': 'Spectral Clustering is known to be effective in finding clusters of arbitrary shapes and sizes. The normalized Laplacian matrix helps to preserve the local structure of the data, and a suitable resolution parameter can be used to determine the number of clusters.'}

In [1]:
x = [{'Question': "Given a product with a monthly churn rate of 5% and a monthly acquisition rate of 10%, what is the product's net growth rate?", 'Choice1': '1.5%', 'Choice2': '3.5%', 'Choice3': '5%', 'Choice4': '10%', 'Answer': 'Choice2: 3.5%', 'Explanation': 'Net growth rate is calculated by subtracting the churn rate from the acquisition rate: 10% - 5% = 3.5%.'}, {'Question': "Assuming a product has a monthly revenue of $10,000 and a monthly COGS (Cost of Goods Sold) of $5,000, what is the product's monthly gross profit?", 'Choice1': '2,500', 'Choice2': '5,000', 'Choice3': '7,500', 'Choice4': '10,000', 'Answer': 'Choice3: 7,500', 'Explanation': 'Gross profit is calculated by subtracting COGS from revenue: $10,000 - $5,000 = $7,500.'}, {'Question': "Given a product with a monthly revenue of $20,000, a monthly COGS of $10,000, and a monthly operating expense of $5,000, what is the product's monthly EBITDA (Earnings Before Interest, Taxes, Depreciation, and Amortization)?", 'Choice1': '5,000', 'Choice2': '7,500', 'Choice3': '10,000', 'Choice4': '15,000', 'Answer': 'Choice2: 7,500', 'Explanation': 'EBITDA is calculated by subtracting COGS and operating expenses from revenue: $20,000 - $10,000 - $5,000 = $7,500.'}, {'Question': "Assuming a product's monthly revenue follows a normal distribution with a mean of $15,000 and a standard deviation of $3,000, what is the probability that the product's monthly revenue will be less than $12,000?", 'Choice1': '0.0135', 'Choice2': '0.05', 'Choice3': '0.1587', 'Choice4': '0.6827', 'Answer': 'Choice2: 0.05', 'Explanation': "To find the probability that the product's monthly revenue will be less than $12,000, we need to calculate the z-score and then use a standard normal distribution table or calculator. The z-score is calculated as (X - μ) / σ, where X is the revenue we're interested in, μ is the mean, and σ is the standard deviation. In this case, X = $12,000, μ = $15,000, and σ = $3,000. The z-score is (12,000 - 15,000) / 3,000 = -0.5. Using a standard normal distribution table or calculator, we find that the probability of a z-score of -0.5 is approximately 0.05, or 5%."}, {'Question': "Given a product with a monthly revenue of $25,000, a monthly COGS of $12,000, a monthly operating expense of $8,000, and a monthly interest expense of $3,000, what is the product's monthly net income?", 'Choice1': '1,500', 'Choice2': '3,500', 'Choice3': '5,000', 'Choice4': '7,500', 'Answer': 'Choice2: 3,500', 'Explanation': 'Net income is calculated by subtracting COGS, operating expenses, and interest expenses from revenue: $25,000 - $12,000 - $8,000 - $3,000 = $3,500.'}]

In [2]:
for i in x:
    print(i)

{'Question': "Given a product with a monthly churn rate of 5% and a monthly acquisition rate of 10%, what is the product's net growth rate?", 'Choice1': '1.5%', 'Choice2': '3.5%', 'Choice3': '5%', 'Choice4': '10%', 'Answer': 'Choice2: 3.5%', 'Explanation': 'Net growth rate is calculated by subtracting the churn rate from the acquisition rate: 10% - 5% = 3.5%.'}
{'Question': "Assuming a product has a monthly revenue of $10,000 and a monthly COGS (Cost of Goods Sold) of $5,000, what is the product's monthly gross profit?", 'Choice1': '2,500', 'Choice2': '5,000', 'Choice3': '7,500', 'Choice4': '10,000', 'Answer': 'Choice3: 7,500', 'Explanation': 'Gross profit is calculated by subtracting COGS from revenue: $10,000 - $5,000 = $7,500.'}
{'Question': "Given a product with a monthly revenue of $20,000, a monthly COGS of $10,000, and a monthly operating expense of $5,000, what is the product's monthly EBITDA (Earnings Before Interest, Taxes, Depreciation, and Amortization)?", 'Choice1': '5,000