In [14]:
import os
import re
import json
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
from transformers import pipeline
from config import data_path

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification
    )

In [9]:
# Get all MMDs

def grep_from_folder(folder_path, pattern):
    """Recursively list files under ``folder_path`` whose name matches ``pattern``.

    Args:
        folder_path: Directory to walk (all sub-directories are visited).
        pattern: Regular expression applied with ``re.search`` to each bare
            file name (not to the full path).

    Returns:
        A list of paths, each built by joining the containing directory with
        the matching file name, in ``os.walk`` traversal order.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
        if re.search(pattern, filename)
    ]


# The pattern is a regular expression, so the dot is escaped and the match is
# anchored to the end of the name; a bare ".mmd" would also accept names such
# as "summd.txt" (any character followed by "mmd", anywhere in the name).
txt_files = grep_from_folder(data_path, r"\.mmd$")
# print(txt_files)

In [10]:
def get_citation_sentences(text, ref) -> list:
    """Return the sentences of ``text`` that contain ``ref``.

    Args:
        text: Full document text; it is split into sentences with NLTK's
            ``sent_tokenize``.
        ref: Citation marker to look for; matched as a plain substring.

    Returns:
        The matching sentences, in document order.
    """
    hits = []
    for candidate in sent_tokenize(text):
        if ref in candidate:
            hits.append(candidate)
    return hits


def get_citation_sentences_re(text, ref) -> list:
    """Return the regex-split sentences of ``text`` that contain ``ref``.

    The text is cut at every run of whitespace that directly follows a
    period. Nothing is checked about what comes after the whitespace, so
    abbreviations ending in a period also produce a split.

    Args:
        text: Full document text.
        ref: Citation marker to look for; matched as a plain substring.

    Returns:
        The matching chunks, in document order.
    """
    chunks = re.split(r"(?<=\.)\s+", text)
    return list(filter(lambda chunk: ref in chunk, chunks))


def get_citation_ref_number(line) -> str:
    """Return the first numeric citation marker in ``line``, or None.

    A marker is an opening bracket, one or more digits and a closing
    bracket (for example ``[12]``); the brackets are part of the returned
    string.
    """
    found = re.search(r"\[\d+\]", line)
    return found.group() if found else None


def format_citation_ref(ref) -> str:
    """Normalise a reference string produced by get_citation_ref_str.

    An opening bracket or parenthesis preceded by a space becomes ", " and
    every closing bracket or parenthesis is dropped, e.g.
    "Touvron et al. (2023a)" -> "Touvron et al., 2023a".
    """
    # Applied strictly in this order, one pass per pair.
    substitutions = ((" [", ", "), ("]", ""), (" (", ", "), (")", ""))
    cleaned = ref
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned


def get_citation_ref_str(line) -> str:
    """Return an author-year style citation key found in ``line``, or None.

    The key is the first span that starts with an uppercase ASCII letter,
    continues over characters other than ")" and ends with a closing ")"
    or "]". The span is then normalised with format_citation_ref.

    Args:
        line: One line of the reference list.

    Returns:
        The formatted reference string, or None when no such span exists.
    """
    # The closing class is "[)\]]". The earlier "[\)|\]]" also accepted a
    # literal "|", because "|" is not alternation inside a character class,
    # so a line like "Alpha | beta" produced a bogus reference.
    pattern = r"[A-Z][^)]+[)\]]"

    match = re.search(pattern, line)
    if match is None:
        return None
    return format_citation_ref(match.group())


def get_citation_ref(
    text, target_phrase="Llama: Open and efficient foundation language models"
) -> list:
    """Find how the paper titled ``target_phrase`` is cited in ``text``.

    Every line of ``text`` containing ``target_phrase`` is inspected in
    order. For each one a numeric marker (get_citation_ref_number) is tried
    first, then an author-year string (get_citation_ref_str); the first
    non-empty result is returned.

    Returns:
        A ``str`` with the citation marker when one was resolved. Otherwise
        the list of lines that contained ``target_phrase`` (empty when the
        phrase does not occur). Callers tell the two cases apart by type.
    """
    candidates = []
    for entry in text.split("\n"):
        if target_phrase in entry:
            candidates.append(entry)

    for entry in candidates:
        # ``or`` keeps the original precedence: the string form is only
        # attempted when no numeric marker was found on this line.
        marker = get_citation_ref_number(entry) or get_citation_ref_str(entry)
        if marker:
            return marker
    return candidates

# Get citation sentences

In [13]:
sentences = []

# Loop through the files and collect citation sentences
for path in txt_files[:10]:  # Limiting to 10 files for now
    # Separate names for the path and the handle; the loop variable used to be
    # rebound to the open file object.
    with open(path, "r") as handle:
        content = handle.read()
    matching_lines = get_citation_ref(content)

    # get_citation_ref returns a str only when a citation marker was resolved;
    # otherwise it returns a list of raw lines. Skip such files instead of
    # silently reusing the previous file's `ref` (or hitting a NameError when
    # it happens on the first file).
    if not isinstance(matching_lines, str):
        continue
    ref = matching_lines

    for sentence in get_citation_sentences(content, ref):
        # Lines starting with "*" are skipped; startswith() is also safe for
        # an empty string, unlike sentence[0].
        if not sentence.startswith("*"):
            print(f"{ref} : {sentence}")
            sentences.append(sentence)

    # if type(matching_lines) == str:
    #     print(matching_lines)
    # else:
    #     # Print matching lines
    #     for line in matching_lines:
    #         print(line)

Touvron et al., 2023a : We mainly consider foundation models including LLAMA-65B (Touvron et al., 2023a), text-davinvi-003 (Brown et al., 2020), ChatGPT and GPT-4 (OpenAI, 2023).
Touvron et al., 2023a : We compare our SKiC with zero/few-shot standard prompting (4-shot) (Brown et al., 2020), CoT (Wei et al., 2022b) and Least-to-Most prompting (LtM) (Zhou et al., 2022) on different large language models, including LLAMA-65B (Touvron et al., 2023a), text-davinvi-003 (Brown et al., 2020; Ouyang et al., 2022), and ChatGPT.
Touvron et al., 2023 : However, a large number of predictive tasks fail to naively fit into the existing supervised data distillation framework, _e.g._, image-generation (Ramesh et al., 2022; Rombach et al., 2022), language modeling (Brown et al., 2020; Devlin et al., 2019; Touvron et al., 2023), representation learning (Chen et al., 2020; Grill et al., 2020), _etc_.
Touvron et al., 2023 : ### Extending to LLaMA-Based Models

To further investigate the roles played by siz

In [18]:
# TODO: Improve citation sentence extraction and matching to the reference (foundation model) paper

# Get citation intent with MultiCite

Paper: https://arxiv.org/abs/2107.00414

In [15]:
def get_classifier(model_checkpoint):
    """Build a text-classification pipeline from ``model_checkpoint``.

    The tokenizer and the sequence-classification model are both loaded
    from the same checkpoint identifier and wrapped in a ``transformers``
    ``pipeline``.

    Args:
        model_checkpoint: Identifier passed to ``from_pretrained``.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still loaded before the model.
    return pipeline(
        'text-classification',
        tokenizer=AutoTokenizer.from_pretrained(model_checkpoint),
        model=AutoModelForSequenceClassification.from_pretrained(model_checkpoint),
    )
# Instantiate the MultiCite classifier used by the cells below.
multicite = get_classifier(model_checkpoint='allenai/multicite-multilabel-scibert')

In [16]:
# Classify every collected citation sentence; the whole list is passed to the
# pipeline in a single call.
predictions = multicite(sentences)

In [17]:
# Bare expression so the notebook renders the prediction list as cell output.
predictions

[{'label': 'uses', 'score': 0.987091064453125},
 {'label': 'uses', 'score': 0.9294595122337341},
 {'label': 'background', 'score': 0.8526414632797241},
 {'label': 'uses', 'score': 0.9757737517356873}]