In [1]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GenerationConfig
import torch
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [2]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
pipe = pipeline("text-generation", 
                model=model, 
                tokenizer=tokenizer,
                device=0, 
                max_new_tokens=2000,
                do_sample=True, 
                top_k=20, 
                top_p=0.7,
                early_stopping=True,
                num_beams=2
               )
hf = HuggingFacePipeline(pipeline=pipe)

In [17]:
# template = """
# Create five {difficulty} quiz questions suitable for graduate students, focusing on advanced topics in {subject}. 
# Each question should present a significant challenge, aligning with master's level coursework, and should ideally incorporate real-world applications or data to contextualize the mathematical or statistical concepts involved. 
# Ensure the questions demand a deep understanding and application of theoretical principles, possibly involving multiple steps or the integration of several concepts.

# For each question, provide four choices, making sure the correct answer is included among them. 
# Additionally, include a detailed explanation for each answer, elaborating on the mathematical or statistical reasoning, the steps involved in arriving at the solution, and any relevant theoretical concepts that underpin the solution. 
# The explanations should serve not only to justify the correct answer but also to enhance understanding of the subject matter.

# Output Format:

# Question: [Insert question here, ensuring it requires application of advanced concepts]
# Choice1: [Insert choice here]
# Choice2: [Insert choice here]
# Choice3: [Insert choice here]
# Choice4: [Insert choice here]
# Answer: [Specify the correct choice Only]
# Explanation: [Provide a detailed explanation that covers the reasoning, the solution process, and relevant theoretical background]
# """


"""
Output Format

Question : [Insert Question Here]
Choice1 : [Insert Choice1 Here]
Choice2 : [Insert Choice2 Here]
Choice3 : [Insert Choice3 Here]
Choice4 : [Insert Choice4 Here]
Answer  : [Correct choice out of the 4 given choices]
Explanation : [Explanation of the correct choice]
"""

'\nOutput Format\n\nQuestion : [Insert Question Here]\nChoice1 : [Insert Choice1 Here]\nChoice2 : [Insert Choice2 Here]\nChoice3 : [Insert Choice3 Here]\nChoice4 : [Insert Choice4 Here]\nAnswer  : [Correct choice out of the 4 given choices]\nExplanation : [Explanation of the correct choice]\n'

In [21]:
# Define the prompt template with placeholders for difficulty and subject
# The questions should be mathematical and statistical as well.
template = """
Generate {num_questions} {difficulty} quiz questions for graduate students focused on the topic of {subject}. 
Each question should present a significant challenge, aligning with master's level coursework, and should ideally incorporate real-world applications or data to contextualize the mathematical or statistical concepts involved. 
Ensure the questions demand a deep understanding and application of theoretical principles, possibly involving multiple steps or the integration of several concepts.

The answer should definitely be one of the Choices.

Here is the desired JSON structure for each question:

Output Format:

Question : [Insert Question Here]
Choice1 : [Insert Choice1 Here]
Choice2 : [Insert Choice2 Here]
Choice3 : [Insert Choice3 Here]
Choice4 : [Insert Choice4 Here]
Answer  : [Correct choice out of the 4 given choices]
Explanation : [Explanation of the correct choice]
"""

# Create a PromptTemplate instance from the template
prompt_template = PromptTemplate.from_template(template)

# Example usage of the prompt template to generate a question
difficulty = "medium"  # You can vary this between "easy", "medium", "hard", etc.
subject = "Clustering"
num_questions = 4

chain = prompt_template | hf
# Use the prompt template with Langchain to generate the quiz
# Assuming you have a Langchain pipeline or chain set up as `langchain_pipeline`
generated_quiz = chain.invoke({
    "num_questions" : num_questions,
    "difficulty": difficulty,
    "subject": subject
})

# Print the generated quiz in JSON format
print(generated_quiz)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Question 1:
Question: Given a dataset of customer transactions, how would you determine the optimal number of clusters using the Elbow Method? Explain the intuition behind this method and how it can be applied to different distance metrics.

Choice1: By calculating the sum of squared errors (SSE) for a range of k values and plotting it against k, the optimal number of clusters is the point where the rate of decrease in SSE starts to level off.
Choice2: By calculating the silhouette score for a range of k values and selecting the value of k with the highest average silhouette score.
Choice3: By calculating the Dunn Index for a range of k values and selecting the value of k with the highest index value.
Choice4: By calculating the Davies-Bouldin Index for a range of k values and selecting the value of k with the lowest index value.
Answer: Choice1
Explanation: The Elbow Method is based on the idea that as the number of clusters (k) increases, the rate of decrease in the sum of squared e

In [23]:
generated_quiz = chain.invoke({
    "num_questions" : num_questions,
    "difficulty": difficulty,
    "subject": subject
})

# Print the generated quiz in JSON format
print(generated_quiz)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Question 1:
Question: Given a dataset of customer transactions, which clustering algorithm would you recommend to identify groups of customers with similar purchasing behavior, and why?
Choice1: DBSCAN
Choice2: K-Means
Choice3: Hierarchical Clustering
Choice4: Spectral Clustering
Answer: Choice2: K-Means
Explanation: K-Means is a centroid-based clustering algorithm, which works well when the clusters are spherical and well-separated. In this case, we are looking for groups of customers with similar purchasing behavior, which can often be modeled as spherical clusters.

Question 2:
Question: Consider the following dataset of 1000 points in a 2-dimensional space. Each point is represented by a pair of features (x, y). Given that the true number of clusters is unknown, which clustering algorithm would you recommend, and why?
Choice1: DBSCAN
Choice2: K-Means
Choice3: Hierarchical Clustering
Choice4: Spectral Clustering
Answer: Choice1: DBSCAN
Explanation: DBSCAN (Density-Based Spatial Clu

In [25]:
generated_quiz = chain.invoke({
    "num_questions" : num_questions,
    "difficulty": difficulty,
    "subject": subject
})

# Print the generated quiz in JSON format
print(generated_quiz)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Question 1:
Question: Given a dataset of customer transactions, identify the optimal number of clusters using the elbow method. The dataset includes features such as age, gender, income, and transaction amount. Which of the following plots would best represent the elbow point?
Choice1: A plot of the sum of squared errors (SSE) against the number of clusters.
Choice2: A plot of the log-likelihood against the number of clusters.
Choice3: A plot of the silhouette score against the number of clusters.
Choice4: A plot of the within-cluster variance against the number of clusters.
Answer: Choice1
Explanation: The elbow method identifies the optimal number of clusters based on the point where the decrease in SSE starts leveling off.

Question 2:
Question: Consider a dataset of 1000 points in a 2-dimensional space. You want to apply DBSCAN clustering with a radius of 1.5 and a minimum number of points in a cluster of 5. Which of the following statements is true?
Choice1: All points within a d

In [29]:
for _ in generated_quiz.split('\n'):
    print(_.strip())


Question 1:
Question: Given a dataset of customer transactions, identify the optimal number of clusters using the elbow method. The dataset includes features such as age, gender, income, and transaction amount. Which of the following plots would best represent the elbow point?
Choice1: A plot of the sum of squared errors (SSE) against the number of clusters.
Choice2: A plot of the log-likelihood against the number of clusters.
Choice3: A plot of the silhouette score against the number of clusters.
Choice4: A plot of the within-cluster variance against the number of clusters.
Answer: Choice1
Explanation: The elbow method identifies the optimal number of clusters based on the point where the decrease in SSE starts leveling off.

Question 2:
Question: Consider a dataset of 1000 points in a 2-dimensional space. You want to apply DBSCAN clustering with a radius of 1.5 and a minimum number of points in a cluster of 5. Which of the following statements is true?
Choice1: All points within a d

In [30]:
import json
import re

class QuizQuestionJSONFormatter:
    def __init__(self, text_input):
        self.text_input = text_input

    def parse_questions(self):
        # Split the text input into chunks based on the question pattern
        question_blocks = re.split(r'Question \d+:', self.text_input)[1:]
        return question_blocks

    def extract_choices(self, block):
        # Extract choices from the block
        choices_pattern = r'Choice\d+: ([^\n]+)'
        choices = re.findall(choices_pattern, block)
        return choices

    def format_json(self):
        # Parse each question block and format it into JSON
        question_blocks = self.parse_questions()
        formatted_questions = []

        for block in question_blocks:
            # Extract question, answer, and explanation using regular expressions
            question = re.search(r'([^\n]+)', block).group(1).strip()
            choices = self.extract_choices(block)
            answer = re.search(r'Answer: (Choice\d+)', block).group(1).strip()
            explanation = re.search(r'Explanation: ([^\n]+)', block).group(1).strip()

            # Build the JSON object
            question_json = {
                "Question": question,
                "Choices": choices,
                "Answer": answer,
                "Explanation": explanation
            }
            formatted_questions.append(question_json)

        return json.dumps(formatted_questions, indent=4)

In [31]:
ac = QuizQuestionJSONFormatter(text_input=generated_quiz)

In [34]:
x = json.loads(ac.format_json())

In [37]:
x[1]

{'Question': 'Question: Consider a dataset of 1000 points in a 2-dimensional space. You want to apply DBSCAN clustering with a radius of 1.5 and a minimum number of points in a cluster of 5. Which of the following statements is true?',
 'Choices': ['All points within a distance of 1.5 units from a given point will be assigned to the same cluster.',
  'All points within a distance of 1.5 units from a given point and having at least 5 neighboring points will be assigned to the same cluster.',
  'All points within a distance of 1.5 units from a given point and having at least 5 neighboring points with the same label will be assigned to the same cluster.',
  'All points within a distance of 1.5 units from a given point and having at least 5 neighboring points with different labels will be assigned to the same cluster.'],
 'Answer': 'Choice2',
 'Explanation': 'DBSCAN assigns points to the same cluster if they are within a given radius (1.5 units in this case) and have a minimum number of ne