In [1]:
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import numpy as np
import json
import seaborn as sns
from tqdm import tqdm
from os.path import basename
from scipy.stats import gaussian_kde
from utils import *
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List
import instructor
from enum import Enum
import jsonlines

# Build the instructor-patched OpenAI client used for structured responses.
# No api_key is passed: the OpenAI client then falls back to the
# OPENAI_API_KEY environment variable, so no secret is stored in the notebook.
client = instructor.from_openai(OpenAI())

In [3]:
# Load the annotated citation sentences and preview the first rows.
df = pd.read_csv(
    "data/task_9_citation_classification_mcllm - Sheet6.csv"
)
df.head()

Unnamed: 0,sentence,mcllm,modelKey,paperId,modelId_x,mc,modelId_y,mc_reduced,urop_sentence,urop,urop_assignee
0,"in recent years, many large-scale pre-trained ...",background,417_gpt-3_175b_(davinci),d8d578d4ece329f17b025946587b1751721b9144,6b85b63579a916f705a8e10a49bd8d849d91b1fc,background,6b85b63579a916f705a8e10a49bd8d849d91b1fc,context,"in recent years, many large-scale pre-trained ...",Background,Selinna
1,one of the known problems with contrastive tra...,background,1013_wave2vec_2.0_large,7f0c7c324675179f0e32c160d99c7066c7ab30ae,49a049dc85e2380dde80501a984878341dd8efdf,background,49a049dc85e2380dde80501a984878341dd8efdf,context,one of the known problems with contrastive tra...,Background,Selinna
2,further work in compound scaling yielded model...,background,377_efficientnet-l2,970cb7b5b25da0f1f8b000add10960680fe8cd2e,4f2eda8077dc7a69bb2b4e0a1a086cf054adb3f9,background,4f2eda8077dc7a69bb2b4e0a1a086cf054adb3f9,context,further work in compound scaling yielded model...,Future work,Selinna
3,"due to the high computing cost, conducting a t...",background,417_gpt-3_175b_(davinci),b6ec1e8f18185b4b3d46201359a440404575460c,6b85b63579a916f705a8e10a49bd8d849d91b1fc,background,6b85b63579a916f705a8e10a49bd8d849d91b1fc,context,"due to the high computing cost, conducting a t...",Background,Selinna
4,## 1 introduction\n\n\nthe multilingual bert ...,background,1064_bert-large,1234fcc1577a32b829d2886fdf68375b9d4525e9,df2b0e26d0599ce3e70df8a9da02e51594e0e992,background,df2b0e26d0599ce3e70df8a9da02e51594e0e992,context,## 1 introduction\n\nthe multilingual bert mo...,Background,Denis


In [5]:
class CitationIntent(str, Enum):
    # Closed set of citation-intent labels. Mixing in `str` makes every member
    # compare equal to its plain string value.
    # Documented with comments rather than a docstring so that the class's
    # __doc__ is left exactly as it was.
    background = 'background'
    motivation = 'motivation'
    uses = 'uses'
    extends = 'extends'
    differences = 'differences'
    similarities = 'similarities'
    future_work = 'future_work'

class CitationSentence(BaseModel):
    # Response schema with a single required field whose value must be one of
    # the CitationIntent members.
    citation_intent: CitationIntent = Field(
        ...,
        title="Citation intent in the citation sentence.",
    )

In [7]:
# System prompt sent with every classification request; it defines the seven
# citation-intent labels the model is asked to choose from.
# NOTE(review): "reseach" in the first sentence is a typo for "research". The
# string is left untouched here because it is passed verbatim to the model and
# editing it could change the classifications.
system_msg = '''You are an assistant tasked with classifying citation sentences from scientific papers. The model is denoted with <cite></cite>. Each sentence should be categorized based on how a model is used in the reseach:

Classify as 'background' if the sentence provides context or foundational information.
Classify as 'motivation' if the sentence explains the reasons or needs for choosing or developing the model.
Classify as 'extends' if the sentence indicates improvements or developments on the model.
Classify as 'uses' if the sentence describes the application or operational use of the model.
Classify as 'differences' if the sentence uses the model for difference comparisons or to highlight outcomes.
Classify as 'similarities' if the sentence uses the model for similarity comparisons or to highlight outcomes.
Classify as 'future_work' if the sentence discusses prospective studies or subsequent steps.'''

In [8]:
# Classify every citation sentence with the LLM.
# `results` receives exactly one entry per row of `df`, in row order, so it can
# be assigned back to the frame as a column afterwards.
results = []
# `total` lets tqdm render a real progress bar; a bare iterrows() generator
# has no length.
for index, row in tqdm(df.iterrows(), total=len(df)):
    sentence = row['sentence']

    # Missing CSV cells are loaded by pandas as NaN rather than None, so test
    # with pd.isna, and do it before the prompt is built: concatenating a str
    # with NaN/None raises TypeError.
    if pd.isna(sentence):
        # Placeholder keeps `results` aligned with the rows of `df`.
        results.append(None)
        continue

    user_msg = (
        "Extract the citation intent from the sentence: "
        + sentence
    )

    # instructor parses the reply into a CitationSentence instance.
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=CitationSentence,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
    )
    results.append(chat_completion.citation_intent.name)

3it [00:02,  1.50it/s]

201it [01:55,  1.74it/s]


In [9]:
# Attach the model's labels as a new column; `results` must hold one entry per row.
df["gpt"] = results

In [10]:
# Preview the first five rows, now including the `gpt` column.
df.head(n=5)

Unnamed: 0,sentence,mcllm,modelKey,paperId,modelId_x,mc,modelId_y,mc_reduced,urop_sentence,urop,urop_assignee,gpt
0,"in recent years, many large-scale pre-trained ...",background,417_gpt-3_175b_(davinci),d8d578d4ece329f17b025946587b1751721b9144,6b85b63579a916f705a8e10a49bd8d849d91b1fc,background,6b85b63579a916f705a8e10a49bd8d849d91b1fc,context,"in recent years, many large-scale pre-trained ...",Background,Selinna,background
1,one of the known problems with contrastive tra...,background,1013_wave2vec_2.0_large,7f0c7c324675179f0e32c160d99c7066c7ab30ae,49a049dc85e2380dde80501a984878341dd8efdf,background,49a049dc85e2380dde80501a984878341dd8efdf,context,one of the known problems with contrastive tra...,Background,Selinna,motivation
2,further work in compound scaling yielded model...,background,377_efficientnet-l2,970cb7b5b25da0f1f8b000add10960680fe8cd2e,4f2eda8077dc7a69bb2b4e0a1a086cf054adb3f9,background,4f2eda8077dc7a69bb2b4e0a1a086cf054adb3f9,context,further work in compound scaling yielded model...,Future work,Selinna,future_work
3,"due to the high computing cost, conducting a t...",background,417_gpt-3_175b_(davinci),b6ec1e8f18185b4b3d46201359a440404575460c,6b85b63579a916f705a8e10a49bd8d849d91b1fc,background,6b85b63579a916f705a8e10a49bd8d849d91b1fc,context,"due to the high computing cost, conducting a t...",Background,Selinna,motivation
4,## 1 introduction\n\n\nthe multilingual bert ...,background,1064_bert-large,1234fcc1577a32b829d2886fdf68375b9d4525e9,df2b0e26d0599ce3e70df8a9da02e51594e0e992,background,df2b0e26d0599ce3e70df8a9da02e51594e0e992,context,## 1 introduction\n\nthe multilingual bert mo...,Background,Denis,background


In [11]:
# Persist the frame (with the `gpt` column) as CSV, omitting the index.
df.to_csv(
    "data/citation_intent_mcll_mc_urop_gpt_200.csv",
    index=False,
)