In [None]:
from IPython.display import HTML, display

def set_css(*_event_args):
    """Inject a style rule that makes ``<pre>`` elements wrap long lines.

    Written to be used as an IPython event callback. Any positional arguments
    the event system passes (for example an execution-info object) are
    accepted and ignored, so the callback works whether it is invoked with or
    without arguments.
    """
    display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Run set_css on every 'pre_run_cell' event, i.e. before each cell executes, so the
# pre-wrap style is emitted again for each cell.
# NOTE(review): re-running this cell re-defines set_css and registers the new function
# object as well, so callbacks may accumulate until the kernel restarts — confirm.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
!pip install pymupdf

import fitz  # PyMuPDF
import google.generativeai as genai
import json

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3


In [None]:
def extract_text_and_images(pdf_path):
    """Collect the plain text and the image count of every page in a PDF.

    Args:
        pdf_path: Location of the document; passed unchanged to ``fitz.open``.

    Returns:
        A list with one dict per page, in page order. Each dict has the keys
        ``"page_num"`` (1-based page number), ``"content"`` (the result of
        ``page.get_text("text")``) and ``"num_images"`` (the number of entries
        returned by ``page.get_images(full=True)``).
    """
    # Opening the document as a context manager closes it (and releases the
    # underlying file) even when extraction fails part-way through; previously
    # the document was never closed.
    with fitz.open(pdf_path) as doc:
        return [
            {
                "page_num": page_num,
                "content": page.get_text("text"),
                "num_images": len(page.get_images(full=True)),
            }
            for page_num, page in enumerate(doc, start=1)
        ]

In [None]:
# Source PDF. NOTE(review): the path looks like a Google Drive mount inside Colab, but
# no cell visible here mounts Drive — confirm it is mounted before this cell runs.
pdf_path = "/content/drive/MyDrive/Literature Review_13th_Oct.docx.pdf"
# One dict per page with the keys page_num, content and num_images.
pdf_data = extract_text_and_images(pdf_path)
# Prints the entire list, including the full text of every page.
print(pdf_data)

[{'page_num': 1, 'content': 'Literature Review:\nAssumptions:\n1. Proposal is completed and ready to submitted by 19th Oct 2023\n2. That’s means you have done initial literature review\nFind good research paper:\n1. Put the topic or keywords that you are planning to research in semantics scholar\nhttps://www.semanticscholar.org/\n2. So it will list all the research that is relevant to what you are looking for\n3. Filter for last 5 years\n4. Check for the credibility of the research paper\na. Journal – check for the impact score – higher the impact score it is better\nThe following journals are normally good:\n.IEEE journals\n.Science Direct journals\n.ACM journals\n.Elsevier Journal\ni.Web of Science\xa0\nb. H Index the author is applicable for both journal and conference proceedings\n.Check authors h index – most of the time author is the student who\njointly publishes theirs work with supervisor (coauthor) if both have\npoor h index better not to consider\xa0\ni.If H index of the aut

In [None]:
# Show the extracted text of the first page (list index 0) with its line breaks rendered.
print(pdf_data[0]["content"])

Literature Review:
Assumptions:
1. Proposal is completed and ready to submitted by 19th Oct 2023
2. That’s means you have done initial literature review
Find good research paper:
1. Put the topic or keywords that you are planning to research in semantics scholar
https://www.semanticscholar.org/
2. So it will list all the research that is relevant to what you are looking for
3. Filter for last 5 years
4. Check for the credibility of the research paper
a. Journal – check for the impact score – higher the impact score it is better
The following journals are normally good:
.IEEE journals
.Science Direct journals
.ACM journals
.Elsevier Journal
i.Web of Science 
b. H Index the author is applicable for both journal and conference proceedings
.Check authors h index – most of the time author is the student who
jointly publishes theirs work with supervisor (coauthor) if both have
poor h index better not to consider 
i.If H index of the author is not good then check H Index of the
coauthor (most

In [None]:
from google import genai

def get_gemini_response(prompt, api_key=None, model="gemini-2.0-flash"):
    """Send a prompt to a Gemini model and return the text of the reply.

    Args:
        prompt: Content passed as ``contents`` to ``generate_content``.
        api_key: API key for the client. When omitted, the key is read from the
            ``GOOGLE_API_KEY`` environment variable, then ``GEMINI_API_KEY``.
        model: Model name passed to ``generate_content``.

    Returns:
        The ``text`` attribute of the response object.

    Raises:
        ValueError: If no API key is passed and neither environment variable
            is set to a non-empty value.
    """
    import os  # local import: keeps this cell runnable on its own

    # Never keep the key as a literal in the notebook; resolve it at call time.
    resolved_key = (
        api_key
        or os.environ.get("GOOGLE_API_KEY")
        or os.environ.get("GEMINI_API_KEY")
    )
    if not resolved_key:
        raise ValueError(
            "No Gemini API key available: pass api_key or set the "
            "GOOGLE_API_KEY / GEMINI_API_KEY environment variable."
        )

    client = genai.Client(api_key=resolved_key)
    response = client.models.generate_content(
        model=model,
        contents=prompt,
    )
    return response.text

In [None]:
# Build the summarisation prompt for the first page and query the model.
text = pdf_data[0]["content"]
# The key names in the instructions match the example exactly ("summary", "keywords"),
# and the example uses double quotes so that a reply mirroring it is valid JSON.
prompt = (
    "From the following text, generate a concise summary (under 80 words) relevant to "
    "the domain of the text, and also extract the 5 most relevant keywords, ensuring "
    "that at least 4 of the extracted keywords appear in the summary. "
    'Provide the output as a JSON object with the keys "summary" and "keywords". '
    'For example: {"summary": "extracted summary", "keywords": ["keyword1", "keyword2", ...]}. '
    "Return only the JSON object, with no additional characters around it.\n\n"
    f"Text:\n{text}"
)
response = get_gemini_response(prompt)
print(response)

```json
{
  "summary": "This guide outlines strategies for conducting a literature review. Key steps include using Semantic Scholar with relevant keywords, filtering for recent research, and assessing paper credibility. Credibility checks involve evaluating the journal's impact score (IEEE, Science Direct, ACM, Elsevier), author's H-index, citation count, and author affiliations. Selected papers should be assessed by reading the abstract, introduction, and conclusion to determine relevance before a full read.",
  "keywords": [
    "literature review",
    "research paper",
    "credibility",
    "H-index",
    "citations"
  ]
}
```


In [None]:
import json

def extract_json(text):
    """Return the first JSON object embedded in ``text``, or ``None``.

    The text may wrap the object in other content such as a Markdown code
    fence or explanatory sentences. Decoding is attempted at each ``{`` from
    left to right, and the first position that yields a valid JSON object wins.

    Args:
        text: String to search. Any non-string value (for example ``None`` from
            a reply that has no text) yields ``None`` instead of raising.

    Returns:
        The decoded object as a ``dict``, or ``None`` when no valid JSON object
        is found.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the object, so trailing text or
            # stray braces after it do not invalidate the result.
            parsed, _end = decoder.raw_decode(text, start)
            return parsed
        except json.JSONDecodeError:
            # This brace did not open a valid object; try the next one.
            start = text.find("{", start + 1)
    return None

In [None]:
def retry_function(max_retries=3, prompt_text=None):
    """Query the model until its reply contains a parseable JSON object.

    Args:
        max_retries: Maximum number of requests to make.
        prompt_text: Prompt to send. When omitted, the notebook-level ``prompt``
            variable is used.

    Returns:
        The first raw reply for which ``extract_json`` returns a value other
        than ``None``, or ``None`` if every attempt produced an invalid reply.
    """
    if prompt_text is None:
        prompt_text = prompt  # fall back to the prompt defined at notebook level

    for attempt in range(max_retries):
        response = get_gemini_response(prompt_text)
        # Validate the model's reply itself. The previous version checked the
        # notebook-level `text` (the PDF page content), so the outcome never
        # depended on what the model actually returned.
        if extract_json(response) is not None:
            return response
        print(f"Attempt {attempt + 1}: Invalid response, retrying...")
    return None

In [None]:
# Parse the raw model reply held in `response` into a dict; extract_json returns None
# when no valid JSON object can be found in it.
response_dictionary = extract_json(response)
print(response_dictionary)

{'summary': "This guide outlines strategies for conducting a literature review. Key steps include using Semantic Scholar with relevant keywords, filtering for recent research, and assessing paper credibility. Credibility checks involve evaluating the journal's impact score (IEEE, Science Direct, ACM, Elsevier), author's H-index, citation count, and author affiliations. Selected papers should be assessed by reading the abstract, introduction, and conclusion to determine relevance before a full read.", 'keywords': ['literature review', 'research paper', 'credibility', 'H-index', 'citations']}
