In [178]:
import sys
import pandas as pd
import numpy as np
import os
from unidecode import unidecode

### Questions:
- what is the author id of this professor?
- what is the paperId of this paper?
- what are the papers of this author?
- what papers were published in this in this venue (conference)?
- What is the H- index of Professor X?
- What is the author citation count of Professor X? (authorCitationCount)
- What is the most cited paper from this faculty member?
- What is the most cited paper from this faculty member and its URL?
- Who are the authors of the most cited paper?
- who is the first author of this given paper (user gives title)?
- How many papers has this faculty member published in open access journals?
- What are the journals that this faculty member has published in?
- What are the journals that this faculty member has published in, and how many papers in each journal?
- What are the fields of study of this faculty member? (fieldsOfStudy)
- Which venue was this paper published in?


In [179]:
base_dir = '../data/paper_jsons'

In [180]:
json_files = [pos_json for pos_json in os.listdir(base_dir) if pos_json.endswith('.json')]
json_files.sort()
print(json_files)
print(len(json_files))

['A. Hauptmann_145788702.json', 'A. Lavie_1784914.json', 'A. Rudnicky_3156164.json', 'A. Waibel_1724972.json', 'A. Waibel_2064429921.json', 'Alexander Hauptmann_7661726.json', 'Alexander I. Rudnicky_1783635.json', 'B. MacWhinney_2414040.json', 'B. Raj_1681921.json', 'C. Rose_35959897.json', 'Chenyan Xiong_144628574.json', 'Chenyan Xiong_2139787803.json', 'Daniel Fried_47070750.json', 'Daphne Ippolito_7975935.json', 'David R. Mortensen_3407646.json', 'E. Xing_143977260.json', 'Emma Strubell_2268272.json', 'Eric Nyberg_144287919.json', 'Fernando Diaz_145472333.json', 'Graham Neubig_1700325.json', 'Jamie Callan_144987107.json', 'Jeffrey P. Bigham_1744846.json', 'Justine Cassell_145431806.json', 'Lei Li_143900005.json', 'Lori S. Levin_1686960.json', 'Louis-Philippe Morency_49933077.json', 'Lu Jiang_39978626.json', 'M. Ganapathiraju_32747279.json', 'Maarten Sap_2729164.json', 'Malihe Alikhani_2715920.json', 'Matthew R. Gormley_1762110.json', 'Matthias Grabmair_2869551.json', 'Mona T. Diab_1

In [181]:
# Class to get questions:
class FacultyPublicationAnalysis:
    def __init__(self, json_path):
        self.df = pd.read_json(json_path)
        self.json_path = json_path
        self.df = self.df
        self.prof_name = self.df['profName'].iloc[0]  
        self.results = []

    def add_result(self, question, answer, document_name="", notes=""):
        self.results.append({
            "Question": question,
            "Answer": answer,
            "Document":  self.json_path,
            "Notes": notes
        })

    def get_authorId(self):
        author_id = set(self.df['authorId'])
        if len(author_id) >1:
            author_id = ', '.join([str(x) for x in author_id])
        else:
            author_id = author_id.pop()
        self.add_result(f"What is the author ID of {self.prof_name}?", str(author_id))

    def get_hIndex(self):
        h_index = set(self.df['authorHIndex'])
        if len(h_index) >1:
            h_index = ', '.join([str(x) for x in h_index])
        else:
            h_index = h_index.pop()
        self.add_result(f"What is the H-index of {self.prof_name}?", str(h_index))
        
    def get_paperId(self, title):
        paper_id = self.df[self.df['title'] == title]['paperId'].iloc[0]
        self.add_result(f"What is the paper ID of '{title}'?", str(paper_id), title)

    def get_OpenAccessCount(self):
        # How many papers has this faculty member published in open access journals?
        open_access_papers = self.df[self.df['isOpenAccess'] == True].shape[0]
        self.add_result(f"How many papers has {self.prof_name} published in open access journals?", str(open_access_papers))

    def get_authors_papers(self):
        papers = self.df['title'].tolist()
        self.add_result(f"What are the papers written by {self.prof_name}?", ', '.join(papers))

    def get_author_PaperCount(self):
        citation_count = self.df['authorPaperCount'].max()
        self.add_result(f"What is the author paper count of {self.prof_name}?", str(citation_count))
    
    def get_author_CitationCount(self):
        citation_count = self.df['authorCitationCount'].max()
        self.add_result(f"What is the author citation count of {self.prof_name}?", str(citation_count))

    def get_journals(self):
        journals = self.df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).dropna().unique()
        journals_string = ', '.join(journals)
        self.add_result(f"What journals has {self.prof_name} published in?", journals_string)

    def get_journal_count(self):
        journals = self.df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).dropna()
        journal_counts = journals.value_counts().to_dict()
        if len(journal_counts) > 1:
            journal_counts = ', '.join([f"{count} in {journal}" for journal, count in journal_counts.items()])
        else:
            journal_counts = 'No journal data available.'
        
        self.add_result(f"What are the journals and how many papers has {self.prof_name} published in each?", str(journal_counts))

    def get_venues(self):
        journals_filtered = [journal for journal in self.df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).unique() if journal is not None]
        journals_string = ', '.join(journals_filtered)
        venues = self.df['venue'].unique()
        self.add_result(f"What venues has {self.prof_name} published in?", ', '.join(venues))
    
    
    def get_fieldsOfStudy(self):
        fields = self.df['fieldsOfStudy'].explode().dropna().unique()
        self.add_result(f"What are the fields of study of {self.prof_name}?", ', '.join(fields))

    def get_most_cited_paper(self):
        most_cited = self.df.loc[self.df['citationCount'].idxmax()]
        self.add_result(f"What is the most cited paper from {self.prof_name}?", most_cited['title'])
        url = self.get_pdfurl(most_cited['title'])
        self.add_result(f"What is the url of the most cited paper from {self.prof_name}?", url)
        authors = self.get_paper_authors(most_cited['title'])
        self.add_result(f"Who are the authors of the most cited paper from {self.prof_name}?", authors)
        tldr = self.get_paper_tldr(most_cited['title'])
        self.add_result(f"TLDR/Summary of the most cited paper from {self.prof_name}?", tldr)
        abstract = self.get_paper_abstract(most_cited['title'])
        self.add_result(f"Abstract of the most cited paper from {self.prof_name}?", abstract)
        return most_cited['title']
    
    def get_pdfurl(self, title): # asked ChatGpt because was getting none errr before and my handling was not working
        pdf_data = self.df[self.df['title'] == title]['openAccessPdf']
        if not pdf_data.empty and pd.notna(pdf_data.iloc[0]):
            try:
                url = pdf_data.iloc[0].get('url', 'openAccessPdf not available')
            except AttributeError:
                url = 'openAccessPdf data format unexpected'
        else:
            url = 'openAccessPdf not available'
        return url


    def get_paper_journal(self, title):
        journal = self.df[self.df['title'] == title]['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).iloc[0]
        volume =  self.df[self.df['title'] == title]['journal'].apply(lambda x: x.get('volume') if isinstance(x, dict) else None).iloc[0]
        pages =  self.df[self.df['title'] == title]['journal'].apply(lambda x: x.get('pages') if isinstance(x, dict) else None).iloc[0]
        
        journal_edited = None
        
        if journal is None:
            journal = self.get_paper_venue(title)
        
        if (volume is not None) and (pages is None):
            journal_edited = f"{journal}, volume: {volume}"
        if (pages is not None) and (volume is None):
            journal_edited = f'{journal}, pages: {pages}'
        if (volume is not None) and (pages is not None):
            journal_edited = f"{journal}, volume: {volume}, pages: {pages}"
        
        if journal_edited is None:
            journal_edited = journal
        else:
            journal = f'{journal_edited}; {journal}'
        
        return journal

    def get_paper_venue(self, title):
        venue = self.df[self.df['title'] == title]['venue'].values[0]
        return venue
    
    def get_paper_citations(self, title):
        citations = self.df[self.df['title'] == title]['citationCount'].values[0]
        return citations
    
    def get_paperId(self, title):
        paper_id = self.df[self.df['title'] == title]['paperId'].values[0]
        return paper_id
    
    def get_paper_authors(self, title, return_list = False):
        authors = self.df[self.df['title'] == title]['authors'].iloc[0]
        author_names = [unidecode(author['name']) for author in authors] 
        if return_list is False: # so we can get first authors
            author_names = ', '.join(author_names)
        return author_names
   
    def get_papers_from_venue(self, venue_name):
        papers = self.df[self.df['venue'] == venue_name]['title'].tolist()
        question = f"What papers were published in the venue {venue_name}?"
        return papers
        # self.add_result(f"What papers were published in the venue {venue_name}?", ', '.join(papers), notes=venue_name)
    
    def get_paper_tldr(self, title):
        tldr = self.df[self.df['title'] == title]['tldr'].tolist()[0]
        return tldr
    
    def get_paper_abstract(self, title):
        abstract = self.df[self.df['title'] == title]['abstract'].tolist()[0]
        return abstract
    
    
    def export_to_df(self):
        return pd.DataFrame(self.results)

    def export_to_csv(self, filename='results.csv'):
        pd.DataFrame(self.results).to_csv(filename, index=False)

In [182]:
samples_per_json = 3
combined_json_df = pd.DataFrame()
combined_results_df = pd.DataFrame()

for json_file in json_files:
    json_file_path = f'{base_dir}/{json_file}'
    analysis = FacultyPublicationAnalysis(json_file_path)
    analysis.get_authorId() # What is the author ID of the faculty member?
    analysis.get_authors_papers() # What are the papers of the faculty member?
    analysis.get_hIndex() # What is the H-index of the faculty member?
    analysis.get_author_CitationCount() # What is the author citation count of the faculty member?
    analysis.get_author_PaperCount() # What is the author paper count of the faculty member?
    analysis.get_journals() # What journals has the faculty member published in?
    analysis.get_journal_count() # What are the journals and how many papers has the faculty member published in each?
    analysis.get_fieldsOfStudy() # What are the fields of study of the faculty member?
    analysis.get_OpenAccessCount() # How many papers has this faculty member published in open access journals?
    analysis.get_venues() # What venues has the faculty member published in?
    most_cited_title = analysis.get_most_cited_paper() # What is the most cited paper from the faculty member?

    analysis_df = analysis.df
    combined_json_df = pd.concat([combined_json_df, analysis_df], ignore_index=True)
    
    result_df = analysis.export_to_df()
    
    samples = samples_per_json
    randomly_sampled_papers = analysis_df['title'].sample(min(len(analysis_df), samples), ignore_index=True).values
    
    
    new_questions = []
    for paper in randomly_sampled_papers:
        journal = analysis.get_paper_journal(paper)
        question_1 = f'What journal was the paper "{paper}" published in?'
        
        venue = analysis.get_paper_venue(paper)
        question_2 = f'What venue was the paper "{paper}" published in?'
        
        citations = analysis.get_paper_citations(paper)
        question_3 = f'How many citations does the paper "{paper}" have?'
        
        authors = analysis.get_paper_authors(paper)
        question4 = f'Who are the authors of the paper "{paper}"?'
        
        author = authors.split(',')[0]
        question5 = f'Who is the first author of the paper "{paper}"?'
        
        paper_id = analysis.get_paperId(paper)
        question6 = f'What is the paper ID of the paper "{paper}"?'
        question7 = f'What paper has the paper ID {paper_id}?'
        
        tldr = analysis.get_paper_tldr(paper)
        question8 = f"What is the summary/TLDR of the paper '{paper}'?"
        
        abstract = analysis.get_paper_abstract(paper)
        # question9 = f"What is the abstract of the paper '{paper}'?"
        
        new_data = [
        {"Question": question_1, "Answer": journal, "Document": json_file_path, "Notes": ""},
        {"Question": question_2, "Answer": venue, "Document": json_file_path, "Notes": ""},
        {"Question": question_3, "Answer": citations, "Document": json_file_path, "Notes": ""},
        {"Question": question4, "Answer": authors, "Document": json_file_path, "Notes": ""},
        {"Question": question5, "Answer": author, "Document": json_file_path, "Notes": ""},
        {"Question": question6, "Answer": paper_id, "Document": json_file_path, "Notes": ""},
        {"Question": question7, "Answer": paper, "Document": json_file_path, "Notes": ""},
        {"Question": question8, "Answer": tldr, "Document": json_file_path, "Notes": ""},
        # {"Question": question9, "Answer": abstract, "Document": json_file_path, "Notes": ""}
        ]
        
        new_results_df = pd.DataFrame(new_data)
        result_df = pd.concat([result_df, new_results_df], ignore_index=True)
    combined_results_df = pd.concat([combined_results_df, result_df], ignore_index=True)

In [183]:
combined_json_df.to_csv('../data/paper_logs/combined_json_data.csv', index=False)
combined_results_df.to_csv('../data/paper_logs/combined_qa_dataset.csv', index=False)

In [184]:
# Most cited faculy member (overall)
most_cited_faculty = combined_json_df.loc[combined_json_df['authorCitationCount'].idxmax()]['profName']
most_citations = combined_json_df['authorCitationCount'].max()
most_cited_paper = combined_json_df.loc[combined_json_df['authorCitationCount'].idxmax()]['title']


print(f"Who is the the most cited faculty member?")
print(most_cited_faculty)
print(f"What is the number of citations of the most cited faculty member? {most_citations}")
print(f"The most cited paper is '{most_cited_paper}'")

Who is the the most cited faculty member?
Yiming Yang
What is the number of citations of the most cited faculty member? 47104
The most cited paper is 'Accelerating Diffusion-based Combinatorial Optimization Solvers by Progressive Distillation'


In [185]:
# Most cited faculy member (2023)
most_cited_faculty = combined_json_df.loc[combined_json_df['citationCount'].idxmax()]['profName']
most_citations = combined_json_df['citationCount'].max()
most_cited_paper = combined_json_df.loc[combined_json_df['citationCount'].idxmax()]['title']

print(f"Who is the the most cited faculty member i?")
print(most_cited_faculty)
print(f"What is the number of citations of the most cited in 2023 faculty member? {most_citations}")
print(f"The most cited paper is '{most_cited_paper}'")

Who is the the most cited faculty member i?
Eric P. Xing
What is the number of citations of the most cited in 2023 faculty member? 690
The most cited paper is 'Judging LLM-as-a-judge with MT-Bench and Chatbot Arena'


In [186]:
# most no of publications in 2023, open Access
prof_name_counts = combined_json_df['profName'].value_counts().sort_values(ascending=False)
# len(prof_name_counts)
prof_name_counts

profName
Shinji Watanabe            58
Graham Neubig              27
Yiming Yang                25
Bhiksha Raj                20
Louis-Philippe Morency     16
Yulia Tsvetkov             16
Maarten Sap                13
Emma Strubell              10
Brian MacWhinney           10
Chenyan Xiong              10
Alexander Waibel            9
Taylor Berg-Kirkpatrick     9
Yonatan Bisk                8
Jeffrey Bigham              8
Mona Diab                   8
Lei Li                      8
Alexander Rudnicky          7
David R Mortensen           7
Rita Singh                  7
Malihe Alikhani             7
Lu Jiang                    6
Alon Lavie                  6
Daniel Fried                6
Alexander Hauptmann         5
Jamie Callan                5
Carolyn Rose                5
Matt Gormley                4
Norman Sadeh                4
Daphne Ippolito             4
Eric P. Xing                4
William Cohen               4
Eric Nyberg                 4
Roni Rosenfeld              3
F

In [187]:
print('Which faculty member has the most publications in open access journals in 2023?')
print(prof_name_counts.index[0])
print('Which faculty member has the second most publications in open access journals in 2023?')
print(prof_name_counts.index[1])
print('Which faculty member has the third most publications in open access journals in 2023?')
print(prof_name_counts.index[2])
print('Which faculty member has the least publications in open access journals in 2023?')
print(prof_name_counts.index[-1])

Which faculty member has the most publications in open access journals in 2023?
Shinji Watanabe
Which faculty member has the second most publications in open access journals in 2023?
Graham Neubig
Which faculty member has the third most publications in open access journals in 2023?
Yiming Yang
Which faculty member has the least publications in open access journals in 2023?
Scott Fahlman


# Getting Author Level Data

In [188]:
# Creating AuthorLevel Dataset
# combined_json_df = pd.DataFrame()
author_results_df = pd.DataFrame()

for json_file in json_files:
    json_file_path = f'{base_dir}/{json_file}'
    analysis = FacultyPublicationAnalysis(json_file_path)
    analysis.get_authorId() # What is the author ID of the faculty member?
    analysis.get_authors_papers() # What are the papers of the faculty member?
    analysis.get_hIndex() # What is the H-index of the faculty member?
    analysis.get_author_CitationCount() # What is the author citation count of the faculty member?
    analysis.get_author_PaperCount() # What is the author paper count of the faculty member?
    analysis.get_journals() # What journals has the faculty member published in?
    analysis.get_journal_count() # What are the journals and how many papers has the faculty member published in each?
    analysis.get_fieldsOfStudy() # What are the fields of study of the faculty member?
    analysis.get_OpenAccessCount() # How many papers has this faculty member published in open access journals?
    analysis.get_venues() # What venues has the faculty member published in?
    most_cited_title = analysis.get_most_cited_paper() # What is the most cited paper from the faculty member?

    analysis_df = analysis.df
    # combined_json_df = pd.concat([combined_json_df, analysis_df], ignore_index=True)
    
    result_df = analysis.export_to_df()    
    author_results_df = pd.concat([author_results_df, result_df], ignore_index=True)
    

In [189]:
author_results_df.to_csv('../data/paper_logs/author.csv', index=False)

# creating csv per paper

In [190]:
combined_json_df.columns

Index(['profName', 'authorId', 'authorName', 'authorUrl', 'authorHIndex',
       'authorAffiliations', 'authorPaperCount', 'authorCitationCount',
       'paperId', 'externalIds', 'url', 'title', 'abstract', 'venue', 'year',
       'referenceCount', 'citationCount', 'influentialCitationCount',
       'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 'journal', 'authors',
       'tldr'],
      dtype='object')

In [191]:
pd.DataFrame(combined_json_df.iloc[0])

Unnamed: 0,0
profName,Alexander Hauptmann
authorId,145788702
authorName,A. Hauptmann
authorUrl,https://www.semanticscholar.org/author/145788702
authorHIndex,27
authorAffiliations,[]
authorPaperCount,59
authorCitationCount,2295
paperId,2107b867cb8f8afa30a9a940288d7c8b657f8aa5
externalIds,"{'ACL': '2023.acl-short.127', 'DBLP': 'conf/ac..."


In [299]:
# Class to get questions:
class PaperAnalysis:
    def __init__(self, paper_row):
        self.results = []
        # self.df  = pd.DataFrame(paper_row).T
        self.df = paper_row
        self.prof_name = paper_row['profName']
        self.paperId = paper_row['paperId']
        self.title = paper_row['title']
        self.notes = f"##Author: {self.prof_name}, ##Title: {self.title}"
    
    def add_result(self, question, answer):
        
        self.results.append({
            "Question": question,
            "Answer": answer,
            # "Document":  self.csv_path,
            "Notes": self.notes
        })
    
    def get_pdfurl(self): 
        pdf_data = self.df['openAccessPdf']
        if pdf_data is not None:
            try:
                url = pdf_data.get('url', 'openAccessPdf not available')
            except AttributeError:
                url = 'openAccessPdf data format unexpected'
        else:
            url = 'openAccessPdf not available'
        return url

    def get_fieldofstudy(self):
        paper_fields_of_study = self.df['fieldsOfStudy']
        if isinstance(paper_fields_of_study, list):
            paper_fields_of_study = ', '.join(paper_fields_of_study)
        else: 
            paper_fields_of_study = 'No fields of study available'
        return paper_fields_of_study
    
    def get_paper_journal(self):
        journal_info = self.df.get('journal', {}) 
        journal = journal_info.get('name') 
        volume = journal_info.get('volume') 
        pages = journal_info.get('pages') 
                
        if journal is None:
            journal = self.df['venue']
        
        journal_edited = None
        
        if (volume is not None) and (pages is None):
            journal_edited = f"{journal}, volume: {volume}"
        if (pages is not None) and (volume is None):
            journal_edited = f'{journal}, pages: {pages}'
        if (volume is not None) and (pages is not None):
            journal_edited = f"{journal}, volume: {volume}, pages: {pages}"
        
        if journal_edited is None:
            journal_edited = journal
        else:
            journal = f'{journal_edited}; {journal}'
        
        return journal

    
    def get_paper_authors(self, return_list = False):
        authors = self.df['authors']
        author_names = [unidecode(author['name']) for author in authors] 
        if return_list is False: # so we can get first authors
            author_names = ', '.join(author_names)
        return author_names
    
    def get_affiliation(self):
        affiliation = self.df['authorAffiliations']
   
        if affiliation:
            self.results.append({
                "Question": f"What is the affiliation of {self.prof_name}?",
                "Answer": ', '.join(affiliation) if isinstance(affiliation, list) else str(affiliation),
                "Notes": self.notes
            })
        else:
            self.results.append({
                "Question": f"What is the affiliation of {self.prof_name}?",
                "Answer": "LTI (CMU), No other affiliations on Semantic Scholar",
                "Notes": self.notes
            })
        # return affiliation
        
    def get_all_results(self):
        
        self.results.append({
            "Question": f"What is the name of this paper?",
            "Answer": str(self.title),
            "Notes": self.notes
        })

        
        author_id = self.df['authorId']
        self.results.append({
            "Question": f"What is the author ID of {self.prof_name}?",
            "Answer": str(author_id),
            "Notes": self.notes
        })

        h_index = self.df['authorHIndex']
        self.results.append({
            "Question": f"What is the H-index of {self.prof_name}?",
            "Answer": str(h_index),
            "Notes": self.notes
        })
        
        authorName = self.df['authorName']
        self.results.append({
            "Question": f"What is the semantic scholar author name of {self.prof_name}?",
            "Answer": str(authorName),
            "Notes": self.notes
        })
        
        
        auhtorUrl = self.df['authorUrl']
        self.results.append({
            "Question": f"What is the semantic scholar author name of {self.prof_name}?",
            "Answer": str(auhtorUrl),
            "Notes": self.notes
        })
        
        affiliation = self.get_affiliation()
        

        paper_id = self.df['paperId']
        self.results.append({
            "Question": f"What is the paper ID of the paper {self.title}?",
            "Answer": str(paper_id),
            "Notes": self.notes
        })

        external_ids = self.df['externalIds']
        self.results.append({
            "Question": f"What are the external IDs of the paper {self.title}?",
            "Answer": str(external_ids),
            "Notes": self.notes
        })
        
        paper_url = self.df['url']
        self.results.append({
            "Question": f"What is the URL of the paper {self.title}?",
            "Answer": str(paper_url),
            "Notes": self.notes
        })

        paper_title = self.title
        paper_abstract = self.df['abstract']
        self.results.append({
            "Question": f"What is the abstract of the paper '{paper_title}'?",
            "Answer": str(paper_abstract),
            "Notes": self.notes
        })

        paper_venue = self.df['venue']
        self.results.append({
            "Question": f"In which venue was the paper '{paper_title}' published?",
            "Answer": str(paper_venue),
            "Notes": self.notes
        })

        paper_year = self.df['year']
        self.results.append({
            "Question": f"In what year was the paper '{paper_title}' published?",
            "Answer": str(paper_year),
            "Notes": self.notes
        })

        paper_reference_count = self.df['referenceCount']
        self.results.append({
            "Question": f"How many references are in the paper '{paper_title}'?",
            "Answer": str(paper_reference_count),
            "Notes": self.notes
        })

        paper_citation_count = self.df['citationCount']
        self.results.append({
            "Question": f"How many citations does the paper '{paper_title}' have?",
            "Answer": str(paper_citation_count),
            "Notes": self.notes
        })
        self.results.append({
            "Question": f"What is the citation count of '{paper_title}' have?",
            "Answer": str(paper_citation_count),
            "Notes": self.notes
        })
        
        paper_influential_citation_count = self.df['influentialCitationCount']
        self.results.append({
            "Question": f"How many influential citations does the paper '{paper_title}' have?",
            "Answer": str(paper_influential_citation_count),
            "Notes": self.notes
        })

        paper_is_open_access = self.df['isOpenAccess']
        self.results.append({
            "Question": f"Is the paper '{paper_title}' open access?",
            "Answer": "Yes" if paper_is_open_access else "No",
            "Notes": self.notes
        })

        paper_open_access_pdf =self.get_pdfurl()
        self.results.append({
            "Question": f"What is the open access PDF URL of the paper titled '{paper_title}'?",
            "Answer": str(paper_open_access_pdf),
            "Notes": self.notes
        })

        paper_fields_of_study = self.get_fieldofstudy()
        self.results.append({
            "Question": f"What are the fields of study for the paper titled '{paper_title}'?",
            "Answer": (paper_fields_of_study),
            "Notes": self.notes
        })

        paper_journal = self.get_paper_journal()
        self.results.append({
            "Question": f"What is the journal name for the paper titled '{paper_title}'?",
            "Answer": paper_journal,
            "Notes": self.notes
        })

        paper_authors = self.get_paper_authors()
        self.results.append({
            "Question": f"Who are the authors of the paper '{paper_title}'?",
            "Answer": str(paper_authors),
            "Notes": self.notes
        })

        paper_tldr = self.df['tldr']
        self.results.append({
            "Question": f"What is the TLDR summary of the paper '{paper_title}'?",
            "Answer": paper_tldr,
            "Notes": self.notes
        })

        return self.results

    def export_to_csv(self, base_dir=None, filename=None):
        dir = base_dir
        if base_dir is None:
            dir = '../data/papers_metadata_csv'
        if filename is None:
            filename = f'{dir}/{self.paperId}.csv'

        pd.DataFrame(self.results).to_csv(filename, index=False)

In [300]:
len(combined_json_df)

366

In [301]:
for i in range(len(combined_json_df)): #len(combined_json_df)
    row = combined_json_df.iloc[i]
    paper_analysis = PaperAnalysis(row)
    # print(paper_analysis.columns)
    paper_results = paper_analysis.get_all_results()
    paper_analysis.export_to_csv()

In [297]:
test = combined_json_df.iloc[1]
# test

In [280]:
test['journal']

{'volume': 'abs/2306.17842', 'name': 'ArXiv'}