In [1]:
########################################### Step 1
import subprocess
import os
import re
import pandas as pd
from pdfminer.high_level import extract_text
from transformers import pipeline

# Results directory used by this notebook, and the readme path inside it.
output_folder = "output"

# Make sure the directory is present; an already existing one is left alone.
os.makedirs(output_folder, exist_ok=True)

output_file = os.path.join(output_folder, "readme.txt")

# Build the BART summarization pipeline a single time up front so the model
# is not reloaded for each document.
summarizer_fb = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
# Alternative summarizer, currently disabled:
#summarizer_google = pipeline("summarization", model="google/pegasus-cnn_dailymail")



pdf_folder = "./pdf_folder"
data = []

# Heading that introduces the claims section, followed by everything after it.
# Group 1 is the heading, group 2 is the remaining text.  Compiled once here
# instead of being rebuilt for every PDF.
CLAIMS_HEADING_RE = re.compile(
    r'('
    r'(?:I\s*/\s*)?We\s+Claim\s*:|'            # I/We Claim:  or  We Claim:
    r'I\s+Claim\s*:|'                         # I Claim:
    r'Claims\s*:|'                            # Claims:
    r'What\s+(?:is\s+)?claimed\s+is\s*:|'     # What is claimed is:
    r'What\s+we\s+claim\s+is\s*:'             # What we claim is:
    r')\s*(.*)',                              # capture everything after the heading
    re.IGNORECASE | re.DOTALL,
)

# Number of leading characters of the claims text handed to the summarizer.
SUMMARY_INPUT_CHARS = 512


def extract_claims_section(full_text):
    """Return the text that follows the first claims heading in *full_text*.

    The heading is matched case-insensitively (e.g. "We Claim:", "Claims:",
    "What is claimed is:").  Runs of whitespace in the returned text are
    collapsed to single spaces and the ends are stripped.

    Returns:
        The normalized claims text (possibly an empty string when nothing
        follows the heading), or ``None`` when no heading is present.
    """
    match = CLAIMS_HEADING_RE.search(full_text)
    if match is None:
        return None
    return re.sub(r'\s+', ' ', match.group(2)).strip()


# Collect the PDFs in a stable (sorted) order; os.listdir gives no ordering
# guarantee.  A missing folder is reported instead of raising.
if os.path.isdir(pdf_folder):
    pdf_names = sorted(
        name for name in os.listdir(pdf_folder)
        if name.lower().endswith(".pdf")
    )
else:
    print(f"⚠️ PDF folder not found: {pdf_folder}")
    pdf_names = []

for file_name in pdf_names:
    pdf_path = os.path.join(pdf_folder, file_name)
    print(f"📄 Processing {file_name}...")

    # Stays None when the claims are missing/empty or when processing fails.
    claims_text_fb = None
    try:
        # Extract text
        full_text = extract_text(pdf_path).replace("\n", " ")
        claims_text = extract_claims_section(full_text)

        if claims_text is None:
            print(f"⚠️ Claims section not found in {file_name}.")
        elif claims_text:
            # Summarize only if claims were found.
            # NOTE(review): max_length=512 is larger than the truncated input,
            # which makes transformers print a "max_length is set to 512"
            # warning.  The generation settings are left unchanged because the
            # downstream classifier presumably expects summaries produced
            # with them - confirm before tuning.
            # truncation=True only guards against inputs that tokenize to more
            # tokens than the model accepts.
            summary_fb = summarizer_fb(
                claims_text[:SUMMARY_INPUT_CHARS],
                max_length=512,
                min_length=10,
                do_sample=False,
                truncation=True,
            )
            claims_text_fb = summary_fb[0]['summary_text']
    except Exception as e:
        # Best effort per file: report the failure and keep going so one bad
        # PDF does not abort the whole run.
        print(f"❌ Error processing {file_name}: {e}")

    # Exactly one record per PDF, whether or not a summary was produced.
    data.append({
        'file_name': file_name,
        'claims_text_fb': claims_text_fb,
    })

# Create DataFrame from collected data.  The columns are given explicitly so
# they exist even when no PDF was processed.
df = pd.DataFrame(data, columns=['file_name', 'claims_text_fb'])

print("\n✅ Finished processing all PDFs.")

############################################# Step 2
# Optional: save the per-file summaries to CSV.
# The path is built from output_folder (the directory created at the top of
# the script) rather than repeating the folder name as a literal.
claims_csv_path = os.path.join(output_folder, "claims_summaries.csv")
df.to_csv(claims_csv_path, index=False)
print("Saved to claims_summaries.csv")
# Excel alternative: df.to_excel("claims_summaries.xlsx", index=False)

############################################# Step 3

from transformers import pipeline
# Load the text-classification pipeline from the local model directory
classifier = pipeline(
    "text-classification",
    model="./bert-claims-fg2",
    tokenizer="./bert-claims-fg2"
)

# Scores at or above this value are reported as 'Relevant', lower scores as
# 'Non-Relevant'.
RELEVANCE_THRESHOLD = 0.58

# Score every summary.  Rows without a summary (claims section not found, or
# the PDF failed to process) hold None and cannot be passed to the
# classifier, so they get a NaN score instead of crashing the run.
scores = []
for text in df['claims_text_fb']:
    if pd.isna(text):
        scores.append(float('nan'))
        continue
    # The classifier returns a list of dicts; the first entry is the prediction.
    # truncation=True guards against summaries longer than the model's
    # maximum input length.
    prediction = classifier(text, truncation=True)[0]
    scores.append(prediction['score'])

score_series = pd.Series(scores, index=df.index, dtype='float64')

# Apply threshold logic: the final label depends only on the score.
# NOTE(review): per the transformers docs the score is the confidence of
# whichever label the model predicted, so thresholding it alone ignores the
# predicted class - confirm this is the intended decision rule.
labels = pd.Series('Non-Relevant', index=df.index, dtype='object')
labels[score_series >= RELEVANCE_THRESHOLD] = 'Relevant'
# Unscored rows stay unlabeled rather than being reported as either class.
labels[score_series.isna()] = None

# Add predictions to the DataFrame
df['fb_prediction_label'] = labels
df['fb_prediction_score'] = score_series


print(df[['file_name', 'fb_prediction_label', 'fb_prediction_score']])



  from .autonotebook import tqdm as notebook_tqdm



📄 Processing IN202311060784(A)_Relavant.pdf...


Your max_length is set to 512, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


📄 Processing IN479454(B)_NonRelavant.pdf...


Your max_length is set to 512, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)



✅ Finished processing all PDFs.
Saved to claims_summaries.csv
                        file_name fb_prediction_label  fb_prediction_score
0  IN202311060784(A)_Relavant.pdf            Relevant             0.712562
1     IN479454(B)_NonRelavant.pdf        Non-Relevant             0.560761
