In [1]:
import pandas as pd

papers = pd.read_csv("papers.csv")


papers.head(5)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [2]:
paper_texts = papers["paper_text"]

In [None]:
example = paper_texts.iloc[0]
example

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

## Try simularity search

In [28]:
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model (small but effective)
model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_extract_sentences_with_context(text, query='future work', prv=1, nxt=1, top_k=5, threshold=0.5):
    # Split text into sentences using punctuation (.!?)
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Embed sentences and query
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarities
    #cosine_scores = util.cos_sim(query_embedding, sentence_embeddings)[0]
    cosine_scores = util.cos_sim(query_embedding, sentence_embeddings)[0].cpu().numpy()


    # Get sentences with scores above threshold, sorted by score
    top_results = np.argwhere(cosine_scores > threshold).flatten()
    sorted_results = sorted(top_results, key=lambda idx: cosine_scores[idx], reverse=True)[:top_k]

    results = []
    for idx in sorted_results:
        start_idx = max(idx - prv, 0)
        end_idx = min(idx + nxt + 1, len(sentences))

        context = {
            'similarity_score': float(cosine_scores[idx]),
            'previous_sentences': ' '.join(sentences[start_idx:idx]).strip(),
            'matched_sentence': sentences[idx].strip(),
            'next_sentences': ' '.join(sentences[idx + 1:end_idx]).strip()
        }

        results.append(context)

    return results


In [39]:
results = semantic_extract_sentences_with_context(example, query='future work', prv=2, nxt=2, threshold=0.4)
from pprint import pprint
pprint(results)


[{'matched_sentence': 'It is one of future subjects.',
  'next_sentences': '771\n'
                    '\n'
                    'OBSTACLE AVOIDING MOVEMENT\n'
                    'Various systems of camera type autonomous mobile robot '
                    'are reported flourishingly6-1O. The system made up by the '
                    'authors (Fig.',
  'previous_sentences': 'Hence, it is clever to\n'
                        'stop the learning when the recognition rate attains '
                        'some upper limit. To improve further\n'
                        'the recognition rate, we must consider the spelling '
                        'of words.',
  'similarity_score': 0.511627197265625},
 {'matched_sentence': 'Develop.',
  'next_sentences': '31-1 (1987), pp. 91-95.',
  'previous_sentences': 'M., "Feature Analysis for Symbol Recognition by '
                        'Elastic Matching," IBM J. Res.',
  'similarity_score': 0.4147437810897827}]


In [40]:
def join_context_sentences(results):
    paragraphs = []
    for res in results:
        # Combine previous, matched, and next sentences into one paragraph
        combined = ' '.join([
            res['previous_sentences'].replace('\n', ' ').strip(),
            res['matched_sentence'].replace('\n', ' ').strip(),
            res['next_sentences'].replace('\n', ' ').strip()
        ]).strip()
        
        # Remove extra whitespace and line breaks
        combined = re.sub(r'\s+', ' ', combined)
        paragraphs.append(combined)
        
    return paragraphs

paragraphs = join_context_sentences(results)

pprint(paragraphs)

['Hence, it is clever to stop the learning when the recognition rate attains '
 'some upper limit. To improve further the recognition rate, we must consider '
 'the spelling of words. It is one of future subjects. 771 OBSTACLE AVOIDING '
 'MOVEMENT Various systems of camera type autonomous mobile robot are reported '
 'flourishingly6-1O. The system made up by the authors (Fig.',
 'M., "Feature Analysis for Symbol Recognition by Elastic Matching," IBM J. '
 'Res. Develop. 31-1 (1987), pp. 91-95.']


In [31]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import os

In [45]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
from transformers import logging

logging.set_verbosity_error()

def find_answer_text(question, text):
    model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    input_ids = tokenizer.encode(question, text)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Locate the first [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1  # Tokens in segment A (question)
    num_seg_b = len(input_ids) - num_seg_a  # Tokens in segment B (text)
    segment_ids = [0] * num_seg_a + [1] * num_seg_b  # Segment identifiers

    assert len(segment_ids) == len(input_ids)

    # Model inference
    output = model(
        torch.tensor([input_ids]),
        token_type_ids=torch.tensor([segment_ids])
    )

    # Extract answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)

    if answer_end >= answer_start:
        answer = " ".join(tokens[answer_start:answer_end + 1])
    else:
        return "I am unable to find the answer to this question. Can you please ask another question?"

    formatted_answer = answer.capitalize()
    #print(f"\nQuestion:\n{question.capitalize()}")
    #print(f"\nAnswer:\n{formatted_answer}.")

    return formatted_answer

In [46]:
question = 'What is the future work the author is suggesting?'

answers = []
for prg in paragraphs:
    answers.append(find_answer_text(question,prg))

print(answers)


['Spelling of words', 'Feature analysis for symbol recognition by elastic matching']


# Full Pipeline

In [47]:
#paper_texts = papers["paper_text"]

question = 'What is the future work the author is suggesting?'

all_answers = []

for text in paper_texts:
    # Step 1: Semantic similarity extraction
    results = semantic_extract_sentences_with_context(
        text, query='future work', prv=2, nxt=2, threshold=0.4
    )

    # Step 2: Join context sentences into paragraphs
    paragraphs = join_context_sentences(results)

    # Step 3: Extract answer text using your QA function
    paper_answers = []
    for prg in paragraphs:
        answer = find_answer_text(question, prg)
        if answer:  # only append non-empty answers
            print(answer)
            paper_answers.append(answer.strip())

    # Aggregate answers per paper
    all_answers.append(paper_answers)

# Now, all_answers[i] contains future work statements from paper i.


Spelling of words
Feature analysis for symbol recognition by elastic matching
Future directions
It is focused on a challenging , practically important vision task
Error correlation and committee improvement
We consider them very encouraging for future work . because the method not only predict ##s the secondary structure , but also which strands actually binds to form , b - sheets , even a modest result may be an important step on the way to full 3d predictions
I am unable to find the answer to this question. Can you please ask another question?
All auto ##mata in this class are represent ##able ; perhaps there are other sub ##class ##es ( not identified by the theorem ) which rc ##c cannot represent
An extensive analysis
Predicting the future
Investigate important applications , such as recognition problems for speech and images
Predicting the future
Compare aspects of the network model with human performance
Applications where the rules change more gradually
Temporal difference algor

KeyboardInterrupt: 