# Arxiv reference parser

Imports

In [1]:
#imports to load model
from random import randrange
from transformers import AutoTokenizer, set_seed, pipeline,BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer, GenerationConfig,LlamaConfig,LlamaModel,AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig 
import torch
import bitsandbytes as bnb
import os
import json

#imports for langchain functionalities
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.tools import BaseTool
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
import transformers

from langchain.chains import ConversationChain, ConversationalRetrievalChain, SequentialChain
from langchain.memory import ConversationBufferMemory, ReadOnlySharedMemory
from langchain.agents import ZeroShotAgent, AgentExecutor
from langchain.prompts import PromptTemplate

## Loading llama-2 7b

BitsAndBytes configuration

In [2]:
def create_bnb_config():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return bnb_config

Load model

In [3]:
def load_model(model_name, bnb_config):
    n_gpus = torch.cuda.device_count()
    max_memory = f'{40960}MB' #TODO Change if necessary

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto", 
        max_memory = {i: max_memory for i in range(n_gpus)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

Model ids and bnb config

In [4]:
model_id_normal = 'meta-llama/Llama-2-7b'
model_id_normal_hf = 'meta-llama/Llama-2-7b-hf'
model_id_chat = 'meta-llama/Llama-2-7b-chat'
model_id_chat_hf = 'meta-llama/Llama-2-7b-chat-hf'

bnb_config = create_bnb_config()

In [5]:
model, tokenizer = load_model(model_id_chat_hf, bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



### Testing inference on the LLM

In [10]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

In [6]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    #stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.01,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=1024,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [7]:
llm = HuggingFacePipeline(pipeline=generate_text)

In [20]:
llm.predict('what is the capital of Spain?')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'\n Unterscheidung zwischen "Spain" und "Spanish"\n\nThe capital of Spain is Madrid.\n\nIt\'s important to note that "Spain" refers to the country as a whole, while "Spanish" can refer to either the language or the people from Spain. So, for example:\n\n* "Spain is a beautiful country with a rich culture." (Here, "Spain" refers to the country.)\n* "I love speaking Spanish with my friends." (Here, "Spanish" refers to the language.)\n* "My grandparents are from Spain, so I have Spanish ancestry." (Here, "Spanish" refers to the people from Spain.)\n\nSo, to answer your question, the capital of Spain is Madrid.'

## Creating chains and prompts

### Json format

In [9]:
!pip install jsonformer

Collecting jsonformer
  Downloading jsonformer-0.12.0-py3-none-any.whl (6.6 kB)
Collecting termcolor<3.0.0,>=2.3.0
  Downloading termcolor-2.3.0-py3-none-any.whl (6.9 kB)
Installing collected packages: termcolor, jsonformer
Successfully installed jsonformer-0.12.0 termcolor-2.3.0


In [8]:
from jsonformer.format import highlight_values
from jsonformer.main import Jsonformer

In [36]:
test = {
    "type": "object",
    "properties": {
        "references": {
            "type": "object",
            "properties": {
                "ref_id": {"type": "string"},
                "title": {"type": "string"},
                "author": {"type": "string"},
                "year": {"type": "string"},
            }
        },
    }
}

In [32]:
builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=test,
    prompt='''
Can you parse all the title, author and year of publications of these references?

Allen, T. D., & Rush, M. C. (2001). The influence of ratee gender 
on ratings of organizational citizenship behavior. Journal of 63
HWI, Servant Leadership, OCB and CWB in Italy
Applied Social Psychology, 31 (12), 2561-2587.  https://doi.
org/10.1111/j.1559-1816.2001.tb00191.x  
Andreassen, C. S., Nielsen, M. B., Pallesen, S., & Gjerstad, J. (2019). The 
relationship between psychosocial work variables and workaholism: 
Findings from a nationally representative survey. International Journal 
of Stress Management, 26 (1), 1-10. https://doi.org/10.1037/str0000073   
Aziz, S., Pittman, C., & Wuensch, K. (2020). Workaholism and organizational 
citizenship behaviors: Exploring gender role beliefs.  International 
Journal of Workplace Health Management, 13 (4), 413-425. https://doi.
org/10.1108/IJWHM-06-2019-0089  
Bakker, A. B., & Bal, P. M. (2010). Weekly work engagement and performance: A 
study among starting teachers. Journal of Occupational and Organizational 
Psychology, 83 (1), 189-206. https://doi.org/10.1348/096317909X402596  
Balducci, C., Cecchin, M., Fraccaroli, F., & Schaufeli, W. B. (2012). Exploring the 
relationship between workaholism and workplace aggressive behaviour: 
The role of job-related emotion.  Personality and Individual Differences, 
53(5), 629-634. https://doi.org/10.1016/j.paid.2012.05.004  
Barbaranelli, C., Fida, R., & Gulandri, M. (2013). Assessing counterproductive 
work behavior: A study on the dimensionality of CWB-checklist . TMP-
Testing, Psychometrics, Methodology in Applied Psychology, 20 (3), 235-
248.  https://doi.org/10.4473/TPM20.3.3  
Beauregard, T.  A. (2012). Perfectionism, self-efficacy and OCB: The 
moderating role of gender. Personnel Review , 41(5), 590-608.  https://
doi.org/10.1108/00483481211249120  
Bentler, P. M., & Wu, E. J. (2005). EQS 6.1 for Windows: Structural equations 
program manual . Multivariate Software.
Birkeland, I. K., & Buch, R. (2015). The dualistic model of passion for work: 
Discriminative and predictive validity with work engagement and 
workaholism. Motivation and Emotion, 39 (3), 392-408. https://doi.
org/10.1007/s11031-014-9462-x  
Borman, W. C., & Motowidlo, S. J. (1993). Expanding the criterion domain to 
include elements of contextual performance. In N. Schmitt & W. C. Borman 
(Eds.),  Personnel selection in organizations (pp. 71–98). Jossey-Bass.
Bowling, N. A., & Eschleman, K. J. (2010). Employee personality as a moderator 
of the relationships between work stressors and counterproductive work 
behavior. Journal of Occupational Health Psychology, 15 (1), 91-103. 
https://doi.org/10.1037/a0017326
Bruk-Lee, V., & Spector, P. (2006). The social stressors-counterproductive work 
behaviors link: Are conflicts with supervisors and coworkers the same? 
Journal of Occupational Health Psychology, 11 (2), 145-156. https://doi.
org/10.1037/1076- 8998.11.2.145  
Byrne, B. M. (2010). Structural equation modeling with AMOS: Basic concepts, 
applications, and programming  (2nd ed.). Routledge.
Chappell, D., & Di Martino, V. (2006).  Violence at work (3rd ed.). International 
Labour Organization.
Choi, Y. (2013). The differences between work engagement and workaholism, 
and organizational outcomes: An integrative model. Social Behavior 
and Personality, 41 (10), 1655-1666. https://doi.org/10.2224/
sbp.2013.41.10.1655
Dalal, R. S. (2005). A meta-analysis of the relationship between organizational 
citizenship behavior and counterproductive work behavior. Journal of 
Applied Psychology, 90 (6), 1241-1255. https://doi.org/10.1037/0021-
9010.90.6.1241  
Eagly, A. H. (1987). Sex differences in social behavior: A social role 
interpretation . Erlbaum.
Eagly, A., Karau, S. J., & Makajhani, M. G. (1995). Gender and the effectiveness 
of leaders: A meta-analysis. Psychological Bulletin, 117 (1), 125-145.  
https://doi.org/10.1037/0033-2909.117.1.125  
Ehrhart, M. G. (2004). Leadership and procedural justice climate as 
antecedents of unit-level organizational citizenship behavior.  Personnel 
Psychology, 57 (1), 61-94.  https://doi.org/10.1111/j.1744-6570.2004.
tb02484.x  
''')

In [33]:
output = builder()

In [34]:
output

{'ref': {'title': 'Allen, T. D., & Rush',
  'author': 'Allen, T. D., & Rush',
  'year': '2001'}}

### Guidance

In [35]:
!pip install guidance

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting guidance
  Downloading guidance-0.0.64-py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 5.7 MB/s eta 0:00:01
Collecting pyparsing>=3.0.0
  Using cached pyparsing-3.1.1-py3-none-any.whl (103 kB)
Collecting tiktoken>=0.3
  Downloading tiktoken-0.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 12.8 MB/s eta 0:00:01
[?25hCollecting openai>=0.27.8
  Downloading openai-1.2.4-py3-none-any.whl (220 kB)
[K     |████████████████████████████████| 220 kB 129.9 MB/s eta 0:00:01
[?25hCollecting diskcache
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 2.9 

In [36]:
import guidance

start to install package: redis
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
successfully installed package: redis
start to install package: redis-om
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
successfully installed package: redis-om


In [39]:
guidance.llm = guidance.llms.Transformers(model,tokenizer=tokenizer)

In [41]:
# define the prompt
program = guidance("""Given a text containing many references, re-organise each of them in a json format that contains their reference id, paper title, authors and year. 
----
```json
{
    "ref_id": "{{reference id}}"
    "title": "{{paper title}}",
    "authors": "{{authors}}",
    "year": "{{year}}",
```""")



In [42]:
examples = [
    {'input': "Balducci, C., Cecchin, M., Fraccaroli, F., & Schaufeli, W. B. (2012). Exploring the relationship between workaholism and workplace aggressive behaviour: The role of job-related emotion.  Personality and Individual Differences, 53(5), 629-634. https://doi.org/10.1016/j.paid.2012.05.004, Aziz, S., Pittman, C., & Wuensch, K. (2020). Workaholism and organizational citizenship behaviors: Exploring gender role beliefs.  International Journal of Workplace Health Management, 13 (4), 413-425. https://doi.org/10.1108/IJWHM-06-2019-0089  Bakker, A. B., & Bal, P. M. (2010). Weekly work engagement and performance: A study among starting teachers. Journal of Occupational and Organizational Psychology, 83 (1), 189-206. https://doi.org/10.1348/096317909X402596  Balducci, C., Cecchin, M., Fraccaroli, F., & Schaufeli, W. B. (2012). Exploring the relationship between workaholism and workplace aggressive behaviour: The role of job-related emotion.  Personality and Individual Differences, 53(5), 629-634. https://doi.org/10.1016/j.paid.2012.05.004  Barbaranelli, C., Fida, R., & Gulandri, M. (2013). Assessing counterproductive work behavior: A study on the dimensionality of CWB-checklist . TMP-Testing, Psychometrics, Methodology in Applied Psychology, 20 (3), 235-248.  https://doi.org/10.4473/TPM20.3.3"},
     {
         "ref_id": 'id1',
         "title": "Exploring the relationship between workaholism and workplace aggressive behaviour: The role of job-related emotion.  Personality and Individual Differences",
         "authors": "Balducci, C., Cecchin, M., Fraccaroli, F., & Schaufeli, W. B.",
         "year": "2012"
     },
     {
         "ref_id": 'id2',
         "title": "Workaholism and organizational citizenship behaviors: Exploring gender role beliefs.",
         "authors": "Aziz, S., Pittman, C., & Wuensch, K.",
         "year": "2020"
     },
     {
         'more references ...'
     }
]

In [43]:
# execute the prompt
out = program(input='''Giannini, M., & Loscalzo, Y. (2016). Workaholism: Health risk and prevention 
in the organizations. In A. di Fabio (Ed.), Neuroticism: Characteristics, 
impact on job performance and health outcomes (pp. 49-60). Nova 
Science Publishers.
Graham, J. W. (1991). Servant-leadership in organizations: Inspirational 
and moral. The Leadership Quarterly, 2 (2), 105-119.  https://doi.
org/10.1016/1048-9843(91)90025-WGreenleaf, R. K. (1977).  Servant-leadership: A journey into the nature of 
legitimate power and greatness . Paulist Press.
Gruys, M. L., & Sackett, P. R. (2003). Investigating the dimensionality of 
counterproductive work behavior.  International Journal of Selection 
and Assessment, 11 (1), 30-41. https://doi.org/10.1111/1468-2389.00224
Heymans, M. W., & Eekhout, I. (2019). Applied missing data analysis with 
SPSS and ®Studio. https://bookdown.org/mwheymans/bookmi
Hofstede, G. (1980). Culture’s consequences: International differences in 
work-related values.  SAGE.
Hofstede, G. (1991). Cultures and organization: Software of the mind. 
McGraw-Hill.
Hu, L. T., & Bentler, P. M. (1999). Cut-off criteria for fit indexes in covariance 
structure analysis: Conventional criteria versus new alternatives. 
Structural Equation Modeling: A Multidisciplinary Journal, 6 (1), 1-55. 
https://doi.org/10.1080/10705519909540118
James, L. R., Mulaik, S. A., & Brett, J. M. (1982). Conditions for confirmatory 
analysis and causal inference. SAGE.''', examples=examples)

In [47]:
out["authors"]

KeyError: 'authors'

## Prompts template

In [26]:
template = """<s>[INST] <<SYS>>\n\n
You are a customer support representative for a startup called EscherCloudAI.\n
{context}
For context, EscherCloudAI does not offer refunds.\n
Answer the following customer question:\n\n
The current conversation is here: 
{chat_history}
<</SYS> 
Human: {question}
Chatbot: 
[/INST]
"""
prompt2 = PromptTemplate(
    input_variables=["chat_history", "question","context"],template=template)
memory = ConversationBufferMemory(memory_key="chat_history")

In [17]:
template = """You are a master PDF reader and when given a set of references you
    always extract the most important information of the papers. For example, when
    you were given the following references:

    Lei Jimmy Ba, Jamie Ryan Kiros, and Geoffrey E.
    Hinton. 2016. Layer normalization. CoRR ,
    abs/1607.06450.
    Eyal Ben-David, Nadav Oved, and Roi Reichart.
    2021. PADA: A prompt-based autoregressive ap-
    proach for adaptation to unseen domains. CoRR ,
    abs/2102.12206.
    Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie
    Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind
    Neelakantan, Pranav Shyam, Girish Sastry, Amanda
    Askell, Sandhini Agarwal, Ariel Herbert-V oss,
    Gretchen Krueger, Tom Henighan, Rewon Child,
    Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu,
    Clemens Winter, Christopher Hesse, Mark Chen,
    Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin
    Chess, Jack Clark, Christopher Berner, Sam Mc-
    Candlish, Alec Radford, Ilya Sutskever, and Dario
    Amodei. 2020. Language models are few-shot learn-
    ers. In Advances in Neural Information Processing
    Systems 33: Annual Conference on Neural Informa-
    tion Processing Systems 2020, NeurIPS 2020, De-
    cember 6-12, 2020, virtual .

    You extract the following:

    Layer normalization | Lei Jimmy Ba, Jamie Ryan Kiros, Geoffrey E. Hinton | 2016
    PADA: A prompt-based autoregressive approach for adaptation to unseen domains | Eyal Ben-David, Nadav Oved, Roi Reichart
    Language models are few-shot learners | Tom B. Brown, et al. | 2020

    Here is the chat history: {chat_history}
    In the References below there are many papers. Extract their titles, authors, and years.

    References: {input}

    Extracted:
    """

prompt2 = PromptTemplate(
    input_variables=["chat_history","input"],template=template)
memory = ConversationBufferMemory(memory_key="chat_history")

chain2 = ConversationChain(
    prompt=prompt2,
    llm=llm,
    memory=memory,
    verbose=True
)

In [94]:
from string import Template

template = """
AI should always respond with references in a structured JSON format.
It should not pretend to be human and should only include paper's title, authors, and year.
For multiple references, create a list under 'references' with each reference as an item.
AI should respond with 'I don't know mate!' if it doesn't know the answer.
The template should use the current conversation context and the user's input.

Always answer by saying first 'Here are the references :)'. Also, always answer in json format like this:

human: Can you format these references: "Balducci, C., Cecchin, M., Fraccaroli, F., & Schaufeli, W. B. (2012). Exploring the relationship between workaholism and workplace aggressive behaviour: The role of job-related emotion.  Personality and Individual Differences, 53(5), 629-634. https://doi.org/10.1016/j.paid.2012.05.004"?
AI: ```json
{{
  "message": "Here are the references :)",
  "references": [
    {{
      "ref_id": "ref1",
      "title": "Title of the first paper",
      "authors": ["Author1", "Author2"],
      "year": 2012
    }},
    {{
      "ref_id": "ref2",
      "title": "Title of the second paper",
      "authors": ["Author3", "Author4"],
      "year": 2013
    }}
  ]
}}```

Only use the paper's title, author and year, the rest of the information is irrelevant. If there are more than one reference then you need to create n reference where n is the number of reference refn+1 
Never forget, AI does not ask questions or pretend to be human, AI or anything else than AI. AI simply answer the input as truthfully as possible. If AI doesn't know the answer he says: I don't know mate!
The current conversation:
{chat_history}
Human: {input}
AI:"""

#template = Template(template)
#processed_string = template.substitute()

prompt = PromptTemplate(
    input_variables=["chat_history", "input"],template=template)
memory = ConversationBufferMemory(memory_key="chat_history")

#better example
#more concrete exmaple of context and what it does. 
#

In [None]:
template = """
AI should always respond with references in a structured JSON format.
It should not pretend to be human and should only include paper's title, authors, and year.
For multiple references, create a list under 'references' with each reference as an item.
AI should respond with 'I don't know mate!' if it doesn't know the answer.
The template should use the current conversation context and the user's input.

Always answer by saying first 'Here are the references :)'. Also, always answer in json format like this:

human: Can you format these references: "Balducci, C., Cecchin, M., Fraccaroli, F., & Schaufeli, W. B. (2012). Exploring the relationship between workaholism and workplace aggressive behaviour: The role of job-related emotion.  Personality and Individual Differences, 53(5), 629-634. https://doi.org/10.1016/j.paid.2012.05.004"?
AI: ```json
{{
  "message": "Here are the references :)",
  "references": [
    {{
      "ref_id": "ref1",
      "title": "Title of the first paper",
      "authors": ["Author1", "Author2"],
      "year": 2012
    }},
    {{
      "ref_id": "ref2",
      "title": "Title of the second paper",
      "authors": ["Author3", "Author4"],
      "year": 2013
    }}
  ]
}}```

Only use the paper's title, author and year, the rest of the information is irrelevant. If there are more than one reference then you need to create n reference where n is the number of reference refn+1 
Never forget, AI does not ask questions or pretend to be human, AI or anything else than AI. AI simply answer the input as truthfully as possible. If AI doesn't know the answer he says: I don't know mate!
The current conversation:
{chat_history}
Human: {input}
AI:"""


In [30]:
template2 = '''
Input:
List of references with details (title, authors, year, etc.):

"Example Title 1", "Author A, Author B",2021, "Example Title 2", "Author C, Author D", 2020, "Example Title 3", "Author E, Author F", 2019
...

Task:
Format the above list of references into a JSON structure with a unique ID for each reference, including the title, authors, and year of publication.

Expected Output:

{{
    "references": [
        {{
            "id": "ref1",
            "title": "Example Title 1",
            "authors": ["Author A", "Author B"],
            "year": 2021
        }},
        {{
            "id": "ref2",
            "title": "Example Title 2",
            "authors": ["Author C", "Author D"],
            "year": 2020
        }},
        {{
            "id": "ref3",
            "title": "Example Title 3",
            "authors": ["Author E", "Author F"],
            "year": 2019
        }},
        ...
    ]
}}

The current conversation is as follow: {chat_history}
Human: {input}
AI:

'''

prompt = PromptTemplate(
    input_variables=["chat_history", "input"],template=template2)
memory = ConversationBufferMemory(memory_key="chat_history")


In [31]:
chain3 = ConversationChain(
    prompt=prompt,
    llm=llm,
    memory=memory,
    verbose=True
)

In [32]:
answer = chain3.run('Can you format this reference? : "Allen, T. D., & Rush, M. C. (2001). The influence of ratee gender on ratings of organizational citizenship behavior. Journal of 63 HWI, Servant Leadership, OCB and CWB in Italy Applied Social Psychology, 31 (12), 2561-2587.  https://doi.org/10.1111/j.1559-1816.2001.tb00191.x  Andreassen, C. S., Nielsen, M. B., Pallesen, S., & Gjerstad, J. (2019). The relationship between psychosocial work variables and workaholism: Findings from a nationally representative survey. International Journal of Stress Management, 26 (1), 1-10. https://doi.org/10.1037/str0000073   Aziz, S., Pittman, C., & Wuensch, K. (2020). Workaholism and organizational citizenship behaviors: Exploring gender role beliefs.  International Journal of Workplace Health Management, 13 (4), 413-425. https://doi.org/10.1108/IJWHM-06-2019-0089  Bakker, A. B., & Bal, P. M. (2010). Weekly work engagement and performance: A study among starting teachers. Journal of Occupational and Organizational Psychology, 83 (1), 189-206. https://doi.org/10.1348/096317909X402596  Balducci, C., Cecchin, M., Fraccaroli, F., & Schaufeli, W. B. (2012). Exploring the relationship between workaholism and workplace aggressive behaviour: The role of job-related emotion.  Personality and Individual Differences, 53(5), 629-634. https://doi.org/10.1016/j.paid.2012.05.004  Barbaranelli, C., Fida, R., & Gulandri, M. (2013). Assessing counterproductive work behavior: A study on the dimensionality of CWB-checklist . TMP-Testing, Psychometrics, Methodology in Applied Psychology, 20 (3), 235-248.  https://doi.org/10.4473/TPM20.3.3    "')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Input:
List of references with details (title, authors, year, etc.):

"Example Title 1", "Author A, Author B",2021, "Example Title 2", "Author C, Author D", 2020, "Example Title 3", "Author E, Author F", 2019
...

Task:
Format the above list of references into a JSON structure with a unique ID for each reference, including the title, authors, and year of publication.

Expected Output:

{
    "references": [
        {
            "id": "ref1",
            "title": "Example Title 1",
            "authors": ["Author A", "Author B"],
            "year": 2021
        },
        {
            "id": "ref2",
            "title": "Example Title 2",
            "authors": ["Author C", "Author D"],
            "year": 2020
        },
        {
            "id": "ref3",
            "title": "Example Title 3",
            "authors": ["Author E", "Author F"],
            "year": 2019
        },
        ...
   

In [33]:
answer

'Sure! I can help you format the list of references into a JSON structure. Here\'s an example output based on the information provided:\n\n{\n    "references": [\n        {\n            "id": "ref1",\n            "title": "The influence of ratee gender on ratings of organizational citizenship behavior",\n            "authors": ["Allen, T. D.", "Rush, M. C."],\n            "year": 2001\n        },\n        {\n            "id": "ref2",\n            "title": "The relationship between psychosocial work variables and workaholism: Findings from a nationally representative survey",\n            "authors": ["Andreassen, C. S.", "Nielsen, M. B.", "Pallesen, S.", "Gjerstad, J."],\n            "year": 2019\n        },\n        {\n            "id": "ref3",\n            "title": "Workaholism and organizational citizenship behaviors: Exploring gender role beliefs",\n            "authors": ["Aziz, S.", "Pittman, C.", "Wuensch, K."],\n            "year": 2020\n        },\n        // Add more reference

In [37]:
builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=test,
    prompt=answer)

In [38]:
out = builder()

In [39]:
out

{'references': {'ref_id': 'ref1',
  'title': 'The influence of ratee gender on ratings of organiz',
  'author': 'Allen, T. D.',
  'year': '2001'}}

In [34]:
data = json.loads(answer)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [98]:
builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=test,
    prompt=answer)

In [99]:
out = builder()

In [100]:
out

{'references': {'ref_id': 'ref1',
  'title': 'Title of the first paper',
  'author': 'Allen, Rush',
  'year': '2001'}}

In [89]:
highlight_values(output)

{
  ref: {
    title: [32m"Allen, T. D., & Rush"[0m,
    author: [32m"Allen, T. D., & Rush"[0m,
    year: [32m"2001"[0m
  }
}


In [96]:
parsed_data = []
for line in answer.split('\n'):
    if line.startswith('ref1'):
        parts = line.split(',')
        ref_id = parts[0].split(':')[0].strip()
        title = parts[0].split(':')[1].strip()
        authors = parts[1].split(':')[1].replace('[', '').replace(']', '').strip().split(' ')
        year = int(parts[2].split(':')[1].strip())
        
        parsed_data.append({
            "ref_id": ref_id,
            "title": title,
            "authors": authors,
            "year": year
        })

# Convert to JSON
json_output = json.dumps({"references": parsed_data}, indent=4)

print(json_output)

{
    "references": []
}


In [89]:
answer['references']

TypeError: string indices must be integers

In [92]:
for i in answer:
    print('test')

ValueError: not enough values to unpack (expected 2, got 1)

In [75]:
answer.replace("  ", "")

' ```json\n{\n"message": "Here are the references :)",\n"references": [\n{\n"ref_id": "ref1",\n"title": "Title of the first paper",\n"authors": ["Allen", "Rush"],\n"year": 2001\n},\n{\n"ref_id": "ref2",\n"title": "Title of the second paper",\n"authors": ["Aziz", "Pittman", "Wuensch"],\n"year": 2020\n},\n{\n"ref_id": "ref3",\n"title": "Title of the third paper",\n"authors": ["Balducci", "Cecchin", "Fraccaroli", "Schaufeli"],\n"year": 2012\n}\n]\n}\n```'

In [78]:
import json
from pathlib import Path
from pprint import pprint

In [91]:
data = json.loads(answer)

# Now you can iterate over the 'references' array
for ref in data['references']:
    print('test')

JSONDecodeError: Expecting value: line 1 column 2 (char 1)

In [77]:
json_data = json.dumps(answer, indent=4)

# Write the JSON data to a file
with open('data.json', 'w') as file:
    file.write(json_data)

In [90]:
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(answer, f, ensure_ascii=False, indent=4)

In [82]:
data = json.loads(Path('./output.json').read_text())

In [83]:
data

' ```json\n{\n  "message": "Here are the references :)",\n  "references": [\n    {\n      "ref_id": "ref1",\n      "title": "Title of the first paper",\n      "authors": ["Allen", "Rush"],\n      "year": 2001\n    },\n    {\n      "ref_id": "ref2",\n      "title": "Title of the second paper",\n      "authors": ["Aziz", "Pittman", "Wuensch"],\n      "year": 2020\n    },\n    {\n      "ref_id": "ref3",\n      "title": "Title of the third paper",\n      "authors": ["Balducci", "Cecchin", "Fraccaroli", "Schaufeli"],\n      "year": 2012\n    }\n  ]\n}\n```'

In [58]:
answer

' ```json\n{\n  "message": "Here are the references :)",\n  "references": [\n    {\n      "ref_id": "ref1",\n      "title": "Title of the first paper",\n      "authors": ["Allen", "Rush"],\n      "year": 2001\n    },\n    {\n      "ref_id": "ref2",\n      "title": "Title of the second paper",\n      "authors": ["Aziz", "Pittman", "Wuensch"],\n      "year": 2020\n    },\n    {\n      "ref_id": "ref3",\n      "title": "Title of the third paper",\n      "authors": ["Balducci", "Cecchin", "Fraccaroli", "Schaufeli"],\n      "year": 2012\n    }\n  ]\n}\n```'

In [59]:
data = json.loads(answer)

JSONDecodeError: Expecting value: line 1 column 2 (char 1)

## Parsing only the reference page:

In [20]:
import PyPDF2

def extract_references(pdf_path):
    pdf_file = open(pdf_path, 'rb')
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    references = ""
    capture = False

    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        text = page.extract_text()
        
        # Check if text extraction is possible
        if not text:
            print(f"No text found on page {page_num}")
            continue

        if 'References' in text or 'Bibliography' in text or 'references' in text:
            # Here you can split the text and start capturing the references section
            # This assumes that 'References' or 'Bibliography' is a unique heading
            # You may need to refine this logic depending on the actual PDF layout
            parts = text.split('References', 1)
            if len(parts) > 1:
                capture = True
                references += parts[1]
            else:
                parts = text.split('Bibliography', 1)
                if len(parts) > 1:
                    capture = True
                    references += parts[1]
        elif capture:
            # Keep capturing until we decide we're done (which is tricky to determine automatically)
            references += text

    pdf_file.close()
    return references

# Path to your PDF
pdf_path = "1576_5962_jwop_39_2_0055.pdf"
references_section = extract_references(pdf_path)

# Save or print the extracted references section
print(references_section)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

Allen, T. D., & Rush, M. C. (2001). The influence of ratee gender 
on ratings of organizational citizenship behavior. Journal of 63
HWI, Servant Leadership, OCB and CWB in Italy
Applied Social Psychology, 31 (12), 2561-2587.  https://doi.
org/10.1111/j.1559-1816.2001.tb00191.x  
Andreassen, C. S., Nielsen, M. B., Pallesen, S., & Gjerstad, J. (2019). The 
relationship between psychosocial work variables and workaholism: 
Findings from a nationally representative survey. International Journal 
of Stress Management, 26 (1), 1-10. https://doi.org/10.1037/str0000073   
Aziz, S., Pittman, C., & Wuensch, K. (2020). Workaholism and organizational 
citizenship behaviors: Exploring gender role beliefs.  International

In [24]:
prompt = """
You need to extract the reference from the text and re-structure it in this format: 
{ref1:{"name":"X", "author":"Y",....... }}- ...

Re organize the following from:
"""