In [16]:
pip install pdfminer.six langchain-gigachat dotenv getpass

[31mERROR: Could not find a version that satisfies the requirement getpass (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for getpass[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [62]:
import logging
from pathlib import Path
from typing import Optional, Union

from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams

def extract_text_with_pdfminer(pdf_path: Union[str, Path]) -> Optional[str]:
    """
    Extract text from a PDF using pdfminer.six (slower but handles complex layouts better).

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The extracted text, or None if extraction fails for any reason.
        Failures are logged at ERROR level and never raised to the caller.
    """
    # Resolve the logger here rather than relying on a module-level `logger`
    # name, so the error path also works after Restart Kernel -> Run All.
    log = logging.getLogger(__name__)
    try:
        # NOTE(review): apart from detect_vertical=True these values appear to
        # match pdfminer.six's documented defaults -- confirm against the
        # installed version before trimming them.
        laparams = LAParams(
            line_overlap=0.5,
            char_margin=2.0,
            line_margin=0.5,
            word_margin=0.1,
            boxes_flow=0.5,
            detect_vertical=True,
        )
        return extract_text(pdf_path, laparams=laparams)
    except Exception as e:
        # Deliberate best-effort: any failure is reported and mapped to None.
        log.error("PDFMiner extraction failed: %s", e)
        return None

In [63]:
# Input path of the article (PDF) to analyse.
# NOTE(review): this is an absolute, machine-specific path; the cell must be
# edited before the notebook can run on another machine.
article = '/home/ilya/jupiter_projects/Breastfeeding Is Not a Risk Factor for Mother-to-Child Transmission of Hepatitis B Virus.pdf'

In [64]:
# Extract the article's text; the helper returns None when extraction fails.
article_text = extract_text_with_pdfminer(article)

In [65]:
# Display the extracted text to eyeball the extraction quality.
article_text

'Breastfeeding Is Not a Risk Factor for Mother-to-Child\nTransmission of Hepatitis B Virus\nXiangru Chen1., Jie Chen2., Jian Wen3, Chenyu Xu3, Shu Zhang2, Yi-Hua Zhou4,5*, Yali Hu2,5*\n\n1 Faculty of Nursing, Nanjing Drum Tower Hospital, Nanjing Medical University, Jiangsu, China, 2 Department of Obstetrics and Gynecology, Nanjing Drum Tower\nHospital, Nanjing Medical University, Jiangsu, China, 3 Department of Obstetrics and Gynecology, Zhenjiang Fourth People’s Hospital, Jiangsu, China, 4 Departments of\nExperimental Medicine and Infectious Diseases, Nanjing Drum Tower Hospital, Nanjing University Medical School, Jiangsu, China, 5 Jiangsu Key Laboratory for Molecular\nMedicine, Nanjing University Medical School, Jiangsu, China\n\nAbstract\n\nBackground: Many clinicians do not encourage breastfeeding in hepatitis B virus (HBV) carriers, since HBV DNA can be\ndetected in breast milk and breast lesions may increase exposure of infants to HBV. The aim of this study was to determine\nwhet

In [72]:
import os
import json
from dotenv import find_dotenv, load_dotenv
from langchain_gigachat.chat_models.gigachat import GigaChat
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import getpass

# Load variables from the .env file located by find_dotenv(), if one is found.
load_dotenv(find_dotenv())

# Prompt interactively only when the key is not already in the environment;
# getpass does not echo the typed value.
if os.environ.get("GIGACHAT_CREDENTIALS") is None:
    entered_key = getpass.getpass("Введите ключ авторизации GigaChat API: ")
    os.environ["GIGACHAT_CREDENTIALS"] = entered_key

# NOTE(review): verify_ssl_certs=False disables TLS certificate verification.
# Confirm this is intended; installing the provider's root certificate and
# enabling verification would be the safer setup.
client = GigaChat(
    credentials=os.environ["GIGACHAT_CREDENTIALS"],
    model="GigaChat-2-Max",
    verify_ssl_certs=False,
)

def _extract_json_object(raw_reply):
    """Return the outermost ``{...}`` span of ``raw_reply``, or the stripped text if none exists."""
    text = raw_reply.strip()
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        # Drops Markdown code fences or prose the model placed around the JSON.
        return text[start:end + 1]
    return text


def analyse_article(article_text):
    """
    Ask the GigaChat model for a structured editorial review of an article.

    Args:
        article_text: Full text of the article to review.

    Returns:
        A dict parsed from the model's JSON reply on success. On failure, a
        dict with an "error" key; when the reply could not be parsed as JSON
        the dict also carries the unparsed reply under "raw_output".
    """
    # The extractor signals failure with None; do not spend an LLM call on
    # missing or blank input.
    if article_text is None or not str(article_text).strip():
        return {"error": "No article text to analyse"}

    analysis_prompt = ChatPromptTemplate.from_messages([
        ("system", """
        You are a physics professor with cross-disciplinary expertise (biology to nanotechnology) 
        serving as editor for a prestigious scientific journal. Analyze articles with strict objectivity.
        
        Analysis Protocol:
        1. Generate 150-200 word summary
        2. Compare abstract with full text
        3. Check for ideological biases
        4. Detect logical fallacies
        5. Evaluate paradigm compliance
        
        Output in this EXACT structure:
        {{
            "summary": "Concise summary",
            "abstract_analysis": {{
                "match_score": "X%",
                "discrepancies": ["list of discrepancies"],
                "omissions": ["list of omissions"]
            }},
            "ideological_issues": {{
                "unsupported_claims": ["list of claims"],
                "speculative_statements": ["list of statements"]
            }},
            "logical_errors": {{
                "fallacies": ["list of fallacies"],
                "sections": ["section numbers"]
            }},
            "paradigm_compliance": {{
                "level": "High/Medium/Low",
                "anomalies": ["list of anomalies"]
            }},
            "recommendation": {{
                "decision": "Accept/Revise/Reject",
                "reasoning": "Technical justification"
            }}
        }}
        """),
        ("human", "Article text: {input_text}")
    ])

    output_parser = StrOutputParser()

    raw_reply = None
    try:
        chain = analysis_prompt | client | output_parser
        raw_reply = chain.invoke({"input_text": article_text})

        # Models often wrap JSON in ```json fences or add surrounding prose;
        # parse only the outermost object.
        return json.loads(_extract_json_object(raw_reply))

    except json.JSONDecodeError:
        # Keep the unparsed reply so the failure can be inspected.
        return {"error": "Failed to parse analysis results", "raw_output": raw_reply}
    except Exception as e:
        return {"error": str(e)}


In [70]:
# Run the LLM review; the result is a dict (it carries an "error" key on failure).
analysis_result = analyse_article(article_text)

In [71]:
# Pretty-print the review; ensure_ascii=False keeps non-ASCII characters
# readable instead of emitting \uXXXX escapes.
print(json.dumps(analysis_result, indent=2, ensure_ascii=False))

{
  "summary": "A comprehensive study investigates whether breastfeeding increases the risk of mother-to-child transmission of Hepatitis B virus (HBV). Involving 546 children of HBV-infected mothers, the research compares breastfed versus formula-fed infants, accounting for various factors like maternal HBeAg status, infant vaccination, and hepatitis B immune globulin (HBIG) usage. Results indicate no association between breastfeeding and increased HBV transmission risk. Furthermore, breastfeeding does not hinder the immune response to hepatitis B vaccines.",
  "abstract_analysis": {
    "match_score": "95%",
    "discrepancies": [
      "Minor differences in phrasing and emphasis."
    ],
    "omissions": []
  },
  "ideological_issues": {
    "unsupported_claims": [],
    "speculative_statements": []
  },
  "logical_errors": {
    "fallacies": [],
    "sections": []
  },
  "paradigm_compliance": {
    "level": "High",
    "anomalies": []
  },
  "recommendation": {
    "decision": "Acc