In [102]:
import PyPDF2
import re
import os

def extract_text_from_pdf(pdf_path):
    """
    Read a PDF from disk and return the text of all its pages.

    Pages for which the reader yields no text (empty string or None) are
    skipped; every kept page is followed by a blank line, and the final
    result has leading/trailing whitespace stripped.
    Args:
        pdf_path (str): Path to the PDF file
    Returns:
        str: Extracted text from the PDF
    """
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extract while the file is still open; the reader pulls pages from the handle.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(text + "\n\n" for text in page_texts if text).strip()

def clean_text(text):
    """
    Normalise raw text for word counting and prompting.

    Applies, in order: removal of digit runs, replacement of every character
    that is neither a word character nor whitespace with a space, collapsing
    of whitespace runs to a single space, trimming, and lower-casing.
    Underscores survive because the regex class \\w includes them.
    Args:
        text (str): Text to clean
    Returns:
        str: Cleaned text
    """
    # (pattern, replacement) pairs; the order matters and mirrors the steps above.
    substitutions = (
        (r'\d+', ''),
        (r'[^\w\s]', ' '),
        (r'\s+', ' '),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip().lower()

def count_words(text):
    """
    Return how many whitespace-separated tokens the text contains.
    Args:
        text (str): Text to count words in
    Returns:
        int: Number of words (0 for empty or whitespace-only text)
    """
    return len(text.split())

if __name__ == "__main__":
    # Sanity check on a single paper: extract its text, clean it, and report
    # how many words remain after cleaning.
    pdf_file_path = "AriesDataset/Papers/P002.pdf"
    full_text = extract_text_from_pdf(pdf_file_path)
    cleaned_text = clean_text(full_text)
    word_count = count_words(cleaned_text)
    print(f"Total words after cleaning: {word_count}")

Total words after cleaning: 10688


In [103]:
# Star-rating rubric sent to the model ahead of the paper text (used by check_gemini).
# NOTE(review): the closing instruction asks for a 50 word explanation while
# check_gemini's own request asks for 100 words - confirm which is intended.
prompt = """In assessing outputs, look for evidence of originality, significance and rigour and apply the
generic definitions of the starred quality levels as follows:
In assessing work as being 4* (quality that is world-leading in terms of originality, significance
and rigour), expect to see evidence of, or potential for, some of the following types of characteristics
across and possibly beyond its area/field:
a primary or essential point of reference;
of profound influence;
instrumental in developing new thinking, practices, paradigms, policies or audiences;
a major expansion of the range and the depth of research and its application;
outstandingly novel, innovative and/or creative.
In assessing work as being 3* (quality that is internationally excellent in terms of originality,
significance and rigour but which falls short of the highest standards of excellence), expect to see
evidence of, or potential for, some of the following types of characteristics across and possibly
beyond its area/field:
an important point of reference;
of considerable influence;
a catalyst for, or important contribution to, new thinking, practices, paradigms, policies or
audiences;
a significant expansion of the range and the depth of research and its application;
significantly novel or innovative or creative.
In assessing work as being 2* (quality that is recognised internationally in terms of originality,
significance and rigour), expect to see evidence of, or potential for, some of the following types of
characteristics across and possibly beyond its area/field:
a recognised point of reference;
of some influence;
an incremental and cumulative advance on thinking, practices, paradigms, policies or audiences;
a useful contribution to the range or depth of research and its application.
In assessing work as being 1* (quality that is recognised nationally in terms of originality,
significance and rigour), expect to see evidence of the following characteristics within its area/
field:
an identifiable contribution to understanding without advancing existing paradigms of enquiry
or practice;
of minor influence.
Now if the score is 1 or 2 say unpublishable and if the score is 3 or 4 say that it is publishable along with a 50 word explanation"""

In [104]:
## setting up API key
from google import genai
from mistralai import Mistral
import cohere
import os

# Provider credentials come from the environment; an unset variable yields None.
gemini_key = os.getenv("GEMINI_API_KEY")
mistral_key = os.getenv("MISTRAL_API_KEY")
cohere_key = os.getenv("COHERE_API_KEY")

In [105]:
# from google import genai

# client = genai.Client(api_key=gemini_key)

# response = client.models.generate_content(
#     model="gemini-2.0-flash",
#     contents=f'"{prompt}"\n\n{cleaned_text} Now tell me if the paper is publishable or not and give a 50 word explanation',
# )

# print(response.text)

# import os
# from mistralai import Mistral

# model = "mistral-large-latest"

# client = Mistral(api_key=mistral_key)

# chat_response = client.chat.complete(
#     model= model,
#     messages = [
#         {
#             "role": "user",
#             "content": "What is the best French cheese?",
#         },
#     ]
# )
# print(chat_response.choices[0].message.content)


# import cohere

# co = cohere.ClientV2(api_key=cohere_key)

# res = co.chat(
#     model="command-a-03-2025",
#     messages=[
#         {
#             "role": "user",
#             "content": "Write a title for a blog post about API design. Only output the title text.",
#         }
#     ],
# )

# print(res.message.content[0].text)


In [106]:
def check_gemini(pdf):
    """
    Ask Gemini whether the paper at `pdf` is publishable.

    The PDF text is extracted and cleaned, then sent together with the
    module-level rubric `prompt`.
    Args:
        pdf (str): Path to the PDF file
    Returns:
        tuple[str, str]: (first word of the reply, remainder of the reply with
        whitespace runs collapsed to single spaces). Returns ("", "") when the
        reply carries no text, instead of raising.
    """
    data = clean_text(extract_text_from_pdf(pdf))
    client = genai.Client(api_key=gemini_key)
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=f'"{prompt}"\n\n{data} give one word "Publishable" if you think this paper is publishable and "Unpublishable" if you think this paper is unpublishable and give a 100 word explanation why',
    )
    # The reply text can be None or blank (e.g. when no text part is returned);
    # indexing words[0] unconditionally would raise in that case.
    words = (response.text or "").split()
    if not words:
        return "", ""
    return words[0], ' '.join(words[1:])

def check_mistral(pdf):
    """
    Ask Mistral whether the paper at `pdf` is publishable.

    The PDF text is extracted and cleaned, then sent as a single user message.
    Args:
        pdf (str): Path to the PDF file
    Returns:
        tuple[str, str]: (first word of the reply, remainder of the reply with
        whitespace runs collapsed to single spaces). Returns ("", "") when the
        reply content is None or blank, instead of raising.
    """
    # NOTE(review): unlike check_gemini, the rubric `prompt` is not sent here -
    # confirm whether that is intentional.
    data = clean_text(extract_text_from_pdf(pdf))
    model = "mistral-large-latest"
    client = Mistral(api_key=mistral_key)

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f'{data}\n\nGive one word "Publishable" if you think this paper is publishable and "Unpublishable" if you think this paper is unpublishable, and give a 100-word explanation why.',
            },
        ]
    )
    # Guard against a missing/blank reply so words[0] cannot raise IndexError.
    words = (chat_response.choices[0].message.content or "").split()
    if not words:
        return "", ""
    return words[0], ' '.join(words[1:])

def check_cohere(pdf):
    """
    Ask Cohere whether the paper at `pdf` is publishable.

    The PDF text is extracted and cleaned, then sent as a single user message.
    Only the first content item of the reply is read.
    Args:
        pdf (str): Path to the PDF file
    Returns:
        tuple[str, str]: (first word of the reply, remainder of the reply with
        whitespace runs collapsed to single spaces). Returns ("", "") when the
        reply has no content items or the text is blank, instead of raising.
    """
    # NOTE(review): unlike check_gemini, the rubric `prompt` is not sent here -
    # confirm whether that is intentional.
    data = clean_text(extract_text_from_pdf(pdf))
    co = cohere.ClientV2(api_key=cohere_key)
    res = co.chat(
        model="command-a-03-2025",
        messages=[
            {
                "role": "user",
                "content": f'{data}\n\nGive one word "Publishable" if you think this paper is publishable and "Unpublishable" if you think this paper is unpublishable, and give a 100-word explanation why.',
            }
        ],
    )
    # Guard both the content list and the text so neither index can raise.
    parts = res.message.content or []
    chat = (parts[0].text or "") if parts else ""
    words = chat.split()
    if not words:
        return "", ""
    return words[0], ' '.join(words[1:])

def final_check(string,para1,para2,para3):
    """
    Ask Gemini to merge three model explanations into one justification.
    Args:
        string (str): Verdict label inserted into the request
        para1 (str): First explanation
        para2 (str): Second explanation
        para3 (str): Third explanation
    Returns:
        The `text` attribute of the Gemini response
    """
    request = f'Using the following points\n \n{para1}\n{para2}\n{para3}\n explain why the given the paper is {string} in 100 words'
    gemini = genai.Client(api_key=gemini_key)
    return gemini.models.generate_content(model="gemini-2.0-flash", contents=request).text


In [107]:
# Spot-check Gemini alone on one paper from the Non-Publishable reference folder.
check_gemini("AriesDataset/Reference/Non-Publishable/R002.pdf")

('Unpublishable',
 'This paper is unpublishable due to its lack of scientific rigor and coherence. While it uses complex terminology, the connections drawn between various concepts (photosynthesis, quantum mechanics, culinary arts, etc.) are arbitrary and nonsensical. The methodology and results sections describe experiments that are fantastical and lack any grounding in reality. The overall paper reads as a parody of scientific writing rather than a serious investigation, failing to provide any meaningful contribution to the field of photosynthesis or any other area of study.')

In [108]:
# Spot-check Mistral alone on one paper from the Non-Publishable reference folder.
check_mistral("AriesDataset/Reference/Non-Publishable/R002.pdf")

('Unpublishable.',
 'This paper is a surreal and humorous pastiche of scientific jargon and absurd hypotheses, mixing real scientific concepts with completely fabricated and nonsensical ideas. It lacks coherence, logical structure, and any semblance of serious academic inquiry. The inclusion of whimsical elements like "quokkas," "velociraptor shaped cookies," and "flumplenook theory" makes it clear that the paper is not intended to be a genuine contribution to scientific literature. It reads more like a piece of comedic fiction than a scholarly article.')

In [109]:
# Spot-check Cohere alone on one paper from the Non-Publishable reference folder.
check_cohere("AriesDataset/Reference/Non-Publishable/R002.pdf")

('Unpublishable.',
 "This paper is unpublishable due to its incoherent structure, nonsensical content, and lack of scientific rigor. It presents a convoluted narrative that jumps between unrelated topics, including photosynthesis, quantum mechanics, culinary arts, and interdimensional communication, without any logical connection or meaningful analysis. The text is filled with absurd claims, fabricated methodologies, and irrelevant references, making it impossible to discern any credible scientific contribution. The use of humor and absurdity, while creative, undermines the paper's credibility and does not align with the standards of academic publishing. It lacks a clear hypothesis, methodology, results, and conclusion, rendering it unsuitable for publication in any reputable scientific journal.")

In [None]:
import os

# Confusion-matrix tallies updated by run_ensemble(); reset whenever this cell runs.
False_Positives = False_Negatives = True_Positives = True_Negatives = 0

def _is_vote(first_word, letter):
    """Return True when the first alphabetic character of `first_word` equals `letter`, ignoring case."""
    # Skipping leading quotes/markdown lets '"Publishable"' or '**Unpublishable**'
    # count; an empty or letter-free word is simply not a vote.
    for ch in first_word:
        if ch.isalpha():
            return ch.upper() == letter
    return False


def _list_pdfs(folder):
    """Return the paths of the .pdf files directly inside `folder`, sorted by file name."""
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.endswith(".pdf")
    ]


def _majority_says(pdf, letter):
    """Query the three models for `pdf`; return True when at least two verdicts start with `letter`."""
    print(pdf)
    first_word_gem, rest_gem = check_gemini(pdf)
    first_word_mis, rest_mis = check_mistral(pdf)
    first_word_coh, rest_coh = check_cohere(pdf)
    votes = sum(
        _is_vote(word, letter)
        for word in (first_word_gem, first_word_mis, first_word_coh)
    )
    return votes >= 2


def run_ensemble():
    """
    Score the three-model majority vote against the labelled reference papers.

    Papers under Reference/Non-Publishable count as a true negative when at
    least two models answer "Unpublishable", otherwise as a false positive.
    Papers under Reference/Publishable count as a true positive when at least
    two models answer "Publishable", otherwise as a false negative.

    Results are added to the module-level counters False_Positives,
    False_Negatives, True_Positives and True_Negatives; the counters are not
    reset here, so repeated calls accumulate.
    """
    global False_Positives, False_Negatives, True_Positives, True_Negatives

    for pdf in _list_pdfs("AriesDataset/Reference/Non-Publishable"):
        if _majority_says(pdf, "U"):
            True_Negatives += 1
        else:
            False_Positives += 1

    for pdf in _list_pdfs("AriesDataset/Reference/Publishable"):
        if _majority_says(pdf, "P"):
            True_Positives += 1
        else:
            False_Negatives += 1

run_ensemble()

# Metrics: each ratio falls back to 0 when its denominator is 0.
predicted_positive = True_Positives + False_Positives
actual_positive = True_Positives + False_Negatives
precision = True_Positives / predicted_positive if predicted_positive > 0 else 0
recall = True_Positives / actual_positive if actual_positive > 0 else 0
precision_plus_recall = precision + recall
f1_score = 2 * precision * recall / precision_plus_recall if precision_plus_recall > 0 else 0

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")

AriesDataset/Reference/Non-Publishable/R001.pdf
AriesDataset/Reference/Non-Publishable/R003.pdf
AriesDataset/Reference/Non-Publishable/R002.pdf
AriesDataset/Reference/Non-Publishable/R005.pdf
AriesDataset/Reference/Non-Publishable/R004.pdf
AriesDataset/Reference/Publishable/R014.pdf
AriesDataset/Reference/Publishable/R015.pdf
AriesDataset/Reference/Publishable/R012.pdf
AriesDataset/Reference/Publishable/R006.pdf
AriesDataset/Reference/Publishable/R007.pdf
AriesDataset/Reference/Publishable/R013.pdf
AriesDataset/Reference/Publishable/R011.pdf
AriesDataset/Reference/Publishable/R010.pdf
AriesDataset/Reference/Publishable/R009.pdf
AriesDataset/Reference/Publishable/R008.pdf
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000


In [110]:
def predict(pdf):
    """
    Classify one paper by majority vote of the three models and explain the verdict.

    The paper is "Publishable" when at least two of the three verdicts are
    publishable, otherwise "Unpublishable". The three explanations are then
    passed to final_check together with the verdict.
    Args:
        pdf (str): Path to the PDF file
    Returns:
        tuple: (verdict string, value returned by final_check)
    """
    def _votes_publishable(first_word):
        # Decide on the first alphabetic character, ignoring case, so verdicts
        # wrapped in quotes/markdown still count; an empty or letter-free word
        # is not a vote (and no longer raises IndexError).
        for ch in first_word:
            if ch.isalpha():
                return ch.upper() == "P"
        return False

    print(pdf)
    first_word_gem, rest_gem = check_gemini(pdf)
    first_word_mis, rest_mis = check_mistral(pdf)
    first_word_coh, rest_coh = check_cohere(pdf)

    publishable_count = sum(
        _votes_publishable(word)
        for word in (first_word_gem, first_word_mis, first_word_coh)
    )
    string = "Publishable" if publishable_count >= 2 else "Unpublishable"

    para = final_check(string, rest_gem, rest_mis, rest_coh)

    return string, para

In [111]:
import pandas as pd

if __name__ == "__main__":
    folder_path = "AriesDataset/Papers"

    # Run the ensemble on every .pdf in the folder, collecting one record per paper.
    results = []
    for doc in os.listdir(folder_path):
        if not doc.endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, doc)
        status, paragraph = predict(pdf_path)
        results.append({"Filename": doc, "Status": status, "Paragraph": paragraph})

    # Build the frame once from the collected records and persist it.
    df = pd.DataFrame(results)
    df.to_csv("prediction_results.csv", index=False)
    print("Saved predictions to prediction_results.csv")

AriesDataset/Papers/P005.pdf
AriesDataset/Papers/P011.pdf
AriesDataset/Papers/P039.pdf
AriesDataset/Papers/P038.pdf
AriesDataset/Papers/P010.pdf
AriesDataset/Papers/P004.pdf
AriesDataset/Papers/P012.pdf
AriesDataset/Papers/P006.pdf
AriesDataset/Papers/P007.pdf
AriesDataset/Papers/P013.pdf
AriesDataset/Papers/P017.pdf
AriesDataset/Papers/P003.pdf
AriesDataset/Papers/P002.pdf
AriesDataset/Papers/P016.pdf
AriesDataset/Papers/P028.pdf
AriesDataset/Papers/P014.pdf
AriesDataset/Papers/P015.pdf
AriesDataset/Papers/P001.pdf
AriesDataset/Papers/P029.pdf
AriesDataset/Papers/P099.pdf
AriesDataset/Papers/P066.pdf
AriesDataset/Papers/P072.pdf
AriesDataset/Papers/P112.pdf
AriesDataset/Papers/P106.pdf
AriesDataset/Papers/P107.pdf
AriesDataset/Papers/P113.pdf
AriesDataset/Papers/P073.pdf
AriesDataset/Papers/P067.pdf
AriesDataset/Papers/P098.pdf
AriesDataset/Papers/P071.pdf
AriesDataset/Papers/P065.pdf
AriesDataset/Papers/P059.pdf
AriesDataset/Papers/P105.pdf
AriesDataset/Papers/P111.pdf
AriesDataset/P

In [118]:
import pandas as pd 
# Load the saved predictions from prediction_results.csv and preview the first rows.
data = pd.read_csv("prediction_results.csv")

data.head()

Unnamed: 0,Filename,Status,Paragraph
0,P005.pdf,Publishable,This paper is publishable due to its novel and...
1,P011.pdf,Publishable,This paper offers a novel and comprehensive so...
2,P039.pdf,Unpublishable,The paper is unpublishable due to its incohere...
3,P038.pdf,Unpublishable,The paper is unpublishable due to its blend of...
4,P010.pdf,Publishable,"This paper introduces MB-CAL, a novel reinforc..."


In [119]:
# replace header Filename with Paper ID
# Reshape the raw predictions into the submission layout and write results.csv.
# Every step is written so that re-running this cell leaves `data` unchanged
# (slicing off four characters and mapping the labels again would otherwise
# empty the IDs and turn the 0/1 column into NaN).

# Rename the columns in one pass; columns already renamed are left as they are.
data = data.rename(columns={
    'Filename': 'Paper ID',
    'Status': 'Publishable',
    'Paragraph': 'Explanation',
})

# Drop the ".pdf" extension from Paper ID only when it is actually present.
data['Paper ID'] = data['Paper ID'].str.replace(r'\.pdf$', '', regex=True)

# Map Publishable to 1 and Unpublishable to 0; values that are already 1/0 pass through.
data['Publishable'] = data['Publishable'].map({'Publishable': 1, 'Unpublishable': 0, 1: 1, 0: 0})
data.to_csv("results.csv", index=False)