In [1]:
# Load pysqlite3 and re-register it in sys.modules under the name 'sqlite3',
# so any later `import sqlite3` resolves to pysqlite3. This must run before
# the imports below.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# Extend the import search path with the project's src directory.
# NOTE(review): machine-specific absolute path — presumably what makes the
# `summarisation` package importable in a later cell; confirm and consider
# making it configurable.
sys.path.append('/mnt/d/Projects/papersurvey_tool/src/')

import json
import os

import pandas as pd
from bert_score import BERTScorer
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from rouge import Rouge

In [60]:
# Reference text for summary evaluation.
# NOTE(review): reads like the paper's abstract (PDF ligatures such as "ﬁ" are
# still present) — confirm the source.
ref_summary = " While deep learning has enabled tremendous progress on text and image datasets, its superiority on tabular data is not clear.\nWe contribute extensive benchmarks of standard and novel deep learning methods as well as tree-based models such as XGBoost and Random Forests, across a large number of datasets and hyperparameter combinations.\nWe deﬁne a standard set of 45 datasets from varied domains with clear characteristics of tabular data and a benchmarking methodology accounting for both ﬁtting models and ﬁnding good hyperparameters.\nResults show that treebased models remain state-of-the-art on medium-sized data (∼10K samples) even without accounting for their superior speed.\nTo understand this gap, we conduct an empirical investigation into the differing inductive biases of tree-based models and Neural Networks (NNs).\nThis leads to a series of challenges which should guide researchers aiming to build tabular-speciﬁc NNs: 1. be robust to uninformative features, 2. preserve the orientation of the data, and 3. be able to easily learn irregular functions.\nTo stimulate research on tabular architectures, we contribute a standard benchmark and raw data for baselines: every point of a 20 000 compute hours hyperparameter search for each learner."
# Candidate summary to be compared against ref_summary.
# NOTE(review): presumably consumed by the ROUGE / BERTScore imports above;
# the comparison itself is not visible in this part of the notebook.
eval_summary = "Tree-based models, such as XGBoost and Random Forests, consistently outperform deep learning models on medium-sized tabular datasets, even without considering their speed advantage. The inductive biases of tree-based models, such as their ability to handle irregular patterns and uninformative features, contribute to their superior performance. The lack of established benchmarks for tabular data and the challenges related to regularization techniques hinder the performance of deep learning models. Therefore, developing tabular-specific neural networks is necessary to address these challenges."

In [2]:
# Project-local import.
# NOTE(review): presumably resolvable only because the first cell appends the
# project's src directory to sys.path — confirm.
from summarisation.summariser import PaperSummariser
# Relative path to the PDF being summarised (resolved against the kernel's
# working directory).
file_path = "../example_paper1.pdf"
autosum = PaperSummariser()
# The result supports lookup by the key 'summary' (used by later cells).
final_summary = autosum.summarise(file_path)
# Join the summariser's text chunks into a single newline-separated string.
# NOTE(review): presumably text_chunks is populated by summarise() — confirm.
full_doc = "\n".join(autosum.text_chunks)


In [4]:
# Show the generated summary text; print() renders its newlines, whereas a
# bare expression would display the escaped repr.
print(final_summary['summary'])

Summary: The research concludes that tree-based learning models, such as XGBoost and Random Forests, consistently outperform neural networks on medium-sized tabular data, with superiority found in both their predictive performance and processing speed. 

Findings: 
- Empirical investigation and extensive benchmarking reveal that tree-based models remain the state of the art, outperforming neural networks in terms of robustness and accuracy, even with a high number of uninformative features.
- Despite attempts to improve deep learning algorithms, they still struggle to handle irregular target function patterns and numerous uninformative features.

Methods: Utilizing a standard set of 45 diverse datasets with clear tabular data characteristics, the research carried out extensive benchmarking. This process accounted for the fitting of models and involved careful optimization of hyperparameters.


In [8]:
def get_questions(text, n=5, model="gpt-4", client=None):
    """Ask a chat model for closed-ended (yes/no) questions about ``text``.

    Args:
        text: Passage the questions should be about; interpolated into the
            prompt with ``str.format``.
        n: Number of questions requested in the prompt.
        model: Chat-completion model name. Defaults to ``"gpt-4"``, the
            value that was previously hard-coded.
        client: Optional object exposing ``chat.completions.create``. When
            omitted, a fresh ``OpenAI()`` client is constructed, as before.

    Returns:
        The message content of the first choice in the response, unparsed.
        The prompt asks for JSON, but the reply is not validated here.
    """

    closed_end_questions_template = """
    For the given text below, please generate {n} closed-ended question that can be answered by 'yes' or 'no'. 
    These questions should be related to the key facts of the text.
    Only return the questions in a JSON format.

    Text: {text}

    """
    prompt = closed_end_questions_template.format(n=n, text=text)

    # Allow callers to inject a client (e.g. a pre-configured or stub one);
    # fall back to the default client so existing calls keep working.
    if client is None:
        client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content


In [14]:
def get_answers(text, questions, model="gpt-4", client=None):
    """Ask a chat model to answer yes/no questions about ``text`` with quotes.

    Args:
        text: Passage the answers must be based on; interpolated into the
            prompt with ``str.format``.
        questions: The questions as one string (the prompt tells the model
            they are separated by a blank-line delimiter).
        model: Chat-completion model name. Defaults to ``"gpt-4"``, the
            value that was previously hard-coded.
        client: Optional object exposing ``chat.completions.create``. When
            omitted, a fresh ``OpenAI()`` client is constructed, as before.

    Returns:
        The message content of the first choice in the response, unparsed.
        The prompt asks for a list of JSON objects (question, answer,
        quotes), but the reply is not validated here.
    """

    # The separator is written with escaped backslashes so the prompt shows
    # the literal two-character sequences; unescaped, the triple-quoted
    # string turned them into real newlines and split the sentence.
    # NOTE(review): the prompt typos ('qusetion', 'reletively') are left
    # untouched to keep the prompt wording stable.
    closed_end_answers_template = """
    You are given several questions separated by '\\n\\n' and a text. 
    Answer each question in 'yes', 'no', or 'idk'.
    For each qusetion, find one or two quotes from the text that are most relevant to answering the question, then print them in numbered order. 
    Quotes should be reletively short. 

    If there are no relevant quotes, print 'no quotes found'.

    Text: {text}

    Questions: {questions}

    For each question, the response should be in JSON with the question, the answer and the quotes included. 
    The final response should be a list of JSON objects.

    """

    prompt = closed_end_answers_template.format(text=text, questions=questions)
    # Allow callers to inject a client (e.g. a pre-configured or stub one);
    # fall back to the default client so existing calls keep working.
    if client is None:
        client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content


In [9]:
# Request five yes/no questions about the full document text and show the raw
# reply (an unparsed string at this point).
questions = get_questions(text=full_doc, n=5)
print(questions)
    

{"questions": [
    "Is the superiority of deep learning on tabular data clear?",
    "Can tree-based models remain state-of-the-art on medium-sized data even without accounting for their superior speed?",
    "Does deep learning struggle to learn irregular patterns of target functions?",
    "Are deep learning models improved by the addition of an embedding layer?",
    "Are tree-based models superior for all random search budgets?"]
}


In [16]:
# Parse the model's reply as JSON instead of eval(): eval() would execute any
# expression the model returned and fails on JSON-only literals such as
# true / false / null. The reply is expected to carry a 'questions' list.
questions_list = json.loads(questions)['questions']

In [17]:
# Join the questions with blank lines, the delimiter the get_answers prompt
# describes.
questions_str = "\n\n".join(questions_list)

In [18]:
# Answer the generated questions using the full document text.
qa_pairs = get_answers(text=full_doc, questions=questions_str)

In [19]:
# Show the raw (unparsed) answer string returned by get_answers.
print(qa_pairs)

[
{
"question": "Is the superiority of deep learning on tabular data clear?", 
"answer": "no", 
"quotes": ["1. While deep learning has enabled tremendous progress on text and image datasets, its superiority on tabular data is not clear.", "2. We contribute extensive benchmarks of standard and novel deep learning methods as well as tree-based models such as XGBoost and Random Forests, across a large number of datasets and hyperparameter combinations."]
},

{
"question": "Can tree-based models remain state-of-the-art on medium-sized data even without accounting for their superior speed?", 
"answer": "yes", 
"quotes": ["1. Results show that treebased models remain state-of-the-art on medium-sized data (∼10K samples) even without accounting for their superior speed."]
},

{
"question": "Does deep learning struggle to learn irregular patterns of target functions?", 
"answer": "yes", 
"quotes": ["1. Neural networks struggle to learn irregular patterns of the target function, and their rotati

In [20]:
# Answer the same questions using only the generated summary.
# final_summary is indexed by 'summary' earlier in the notebook, so pass that
# string; passing the whole object would interpolate its str() into the prompt.
# NOTE: this rebinds qa_pairs, replacing the full-document answers.
qa_pairs = get_answers(text=final_summary['summary'], questions=questions_str)

In [21]:
# Show the raw (unparsed) answer string for the summary-based run.
print(qa_pairs)

[{
"question": "Is the superiority of deep learning on tabular data clear?",
"answer": "no",
"quotes": ["1. The research concludes that tree-based learning models, such as XGBoost and Random Forests, consistently outperform neural networks on medium-sized tabular data, with superiority found in both their predictive performance and processing speed."]
},
{
"question": "Can tree-based models remain state-of-the-art on medium-sized data even without accounting for their superior speed?",
"answer": "yes",
"quotes": ["1. Tree-based models remain the state of the art, outperforming neural networks in terms of robustness and accuracy, even with a high number of uninformative features."]
},
{
"question": "Does deep learning struggle to learn irregular patterns of target functions?",
"answer": "yes",
"quotes": ["1. Despite attempts to improve deep learning algorithms, they still struggle to handle irregular target function patterns and numerous uninformative features."]
},
{
"question": "Are d