In [1]:
%pip install -q -q -q PyMuPDF
%pip install transformers -U

import fitz 
import re
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(document.page_count):
        page = document.load_page(page_num)
        text += page.get_text()
    return text

### Extract text from pdf
---

In [3]:
pdf_path = 'RAI SW - AI Labs - AI Engineer July 24.pdf'
text = extract_text_from_pdf(pdf_path)
print(text[:100])

Greetings from ResoluteAI.in!
Mandatory Tasks : Task 1, 2, 3 should be mandatorily completed.
Option


### Preprocess the text

In [4]:

def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text


sentences = re.split(r'[.!?;]', text)

sentences = [preprocess_text(sentence) for sentence in sentences]
print(len(sentences))



70


In [5]:

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model_encoder = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def embed(query):
    encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model_encoder(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

embeddings = embed(sentences)
print(embeddings.shape)

torch.Size([70, 384])


In [6]:
query = "How to increase chance of selection?"

query_embedding = embed([query])
print(query_embedding.shape)

torch.Size([1, 384])


In [7]:
def top_sentences(query, embeddings, sentences):
    query_embedding = embed([query])
    similarities = F.cosine_similarity(query_embedding, embeddings).flatten()
    cutoff_score = 0.2
    top_idx = similarities.argsort(descending=True)[:8]
    return [sentences[i] for i in top_idx]

top_sentences(query, embeddings, sentences)

[' Also share your algorithms train accuracy and mention the reason behind choosing your algorithm',
 ' Approach for all tasks and reading of instructions will be given higher preference over accuracy and code',
 ' Datewise number of picking and placing activity done',
 ' However if stuck at any one task please move to next and complete as much as possible before the deadline',
 ' Need to complete all first three tasks mentioned',
 ' Good Luck ',
 ' Please use Operations Management',
 ' 5']

In [11]:
# %pip install google-generativeai

gemini_api_key = '#########'

"""
Install the Google AI Python SDK

$ pip install google-generativeai

See the getting started guide for more information:
https://ai.google.dev/gemini-api/docs/get-started/python
"""

import os

import google.generativeai as genai

genai.configure(api_key=gemini_api_key)

# Create the model
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 64,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model_genai = genai.GenerativeModel(
  model_name="gemini-1.5-pro",
  generation_config=generation_config,
  system_instruction="You will be provided with context from a PDF document. Analyze the context and respond to queries based on the information given.",
)

chat_session = model_genai.start_chat(
  history=[]
)


In [12]:
query = 'How many tasks does user needs to do?'
context = top_sentences(query, embeddings, sentences)

message = {
    "role" : "user",
    "parts" : [
        f"Context: {context}",
        f"query : {query}"
    ]
}

response = chat_session.send_message(message)
print(response)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=glm.GenerateContentResponse({'candidates': [{'content': {'parts': [{'text': "The user needs to complete **5 tasks** in total. \n\nHere's the breakdown:\n\n* **Mandatory:** Tasks 1, 2, and 3 (3 tasks)\n* **Optional:** Any 2 tasks from tasks 4 to 11 (2 tasks) \n"}], 'role': 'model'}, 'finish_reason': 1, 'index': 0, 'safety_ratings': [{'category': 9, 'probability': 1, 'blocked': False}, {'category': 8, 'probability': 1, 'blocked': False}, {'category': 7, 'probability': 1, 'blocked': False}, {'category': 10, 'probability': 1, 'blocked': False}], 'token_count': 0, 'grounding_attributions': []}]}),
)


In [13]:
# %pip install markdown

import markdown

answer = response.candidates[0].content.parts[0].text
html_answer = markdown.markdown(answer)

print(html_answer)

<p>The user needs to complete <strong>5 tasks</strong> in total. </p>
<p>Here's the breakdown:</p>
<ul>
<li><strong>Mandatory:</strong> Tasks 1, 2, and 3 (3 tasks)</li>
<li><strong>Optional:</strong> Any 2 tasks from tasks 4 to 11 (2 tasks) </li>
</ul>
