In [67]:
import sys
import pandas as pd
import numpy as np
import os
from unidecode import unidecode

### Questions:
- what is the author id of this professor?
- what is the paperId of this paper?
- what are the papers of this author?
- what papers were published in this venue (conference)?
- What is the H- index of Professor X?
- What is the author citation count of Professor X? (authorCitationCount)
- What is the most cited paper from this faculty member?
- What is the most cited paper from this faculty member and its URL?
- Who are the authors of the most cited paper?
- who is the first author of this given paper (user gives title)?
- How many papers has this faculty member published in open access journals?
- What are the journals that this faculty member has published in?
- What are the journals that this faculty member has published in, and how many papers in each journal?
- What are the fields of study of this faculty member? (fieldsOfStudy)
- Which venue was this paper published in?


In [68]:
# Directory scanned for the per-author JSON files; the path is relative to the notebook's working directory.
base_dir = '../data/paper_jsons'

In [69]:
# Collect every *.json file name in base_dir in alphabetical order, then show the names and their count.
json_files = sorted(entry for entry in os.listdir(base_dir) if entry.endswith('.json'))
print(json_files)
print(len(json_files))

['A. Hauptmann_145788702.json', 'A. Lavie_1784914.json', 'A. Rudnicky_3156164.json', 'A. Waibel_1724972.json', 'A. Waibel_2064429921.json', 'Alexander Hauptmann_7661726.json', 'Alexander I. Rudnicky_1783635.json', 'B. MacWhinney_2414040.json', 'B. Raj_1681921.json', 'C. Rose_35959897.json', 'Chenyan Xiong_144628574.json', 'Chenyan Xiong_2139787803.json', 'Daniel Fried_47070750.json', 'Daphne Ippolito_7975935.json', 'David R. Mortensen_3407646.json', 'E. Xing_143977260.json', 'Emma Strubell_2268272.json', 'Eric Nyberg_144287919.json', 'Fernando Diaz_145472333.json', 'Graham Neubig_1700325.json', 'Jamie Callan_144987107.json', 'Jeffrey P. Bigham_1744846.json', 'Justine Cassell_145431806.json', 'Lei Li_143900005.json', 'Lori S. Levin_1686960.json', 'Louis-Philippe Morency_49933077.json', 'Lu Jiang_39978626.json', 'M. Ganapathiraju_32747279.json', 'Maarten Sap_2729164.json', 'Malihe Alikhani_2715920.json', 'Matthew R. Gormley_1762110.json', 'Matthias Grabmair_2869551.json', 'Mona T. Diab_1

In [70]:
# Class to get questions:
class FacultyPublicationAnalysis:
    """Generate question/answer records about one faculty member's publications.

    The publication records are loaded from a JSON file into ``self.df``
    (one row per paper). Methods named ``get_<something>()`` without a title
    argument append a record to ``self.results`` via ``add_result``; methods
    that take a paper ``title`` (or a venue name) return the looked-up value
    and leave ``self.results`` untouched, except ``get_most_cited_paper``
    which does both.

    Columns read by this class: profName, authorId, authorHIndex,
    authorCitationCount, title, paperId, isOpenAccess, openAccessPdf,
    journal, venue, fieldsOfStudy, citationCount, authors, tldr, abstract.
    """

    _PDF_NOT_AVAILABLE = 'openAccessPdf not available'
    _PDF_UNEXPECTED = 'openAccessPdf data format unexpected'

    def __init__(self, json_path):
        """Load ``json_path`` with ``pd.read_json`` and start with no results.

        The professor name is taken from the first row's ``profName``.
        """
        self.json_path = json_path
        self.df = pd.read_json(json_path)
        self.prof_name = self.df['profName'].iloc[0]
        self.results = []

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _unique_joined(self, column):
        """Return the distinct values of ``column`` joined by ', '.

        Values keep their first-seen order so the answer is deterministic
        (joining a ``set`` is not).
        """
        distinct = self.df[column].drop_duplicates().tolist()
        return ', '.join(str(value) for value in distinct)

    def _journal_names(self):
        """Return a Series with the journal name of every row that has one."""
        return self.df['journal'].apply(
            lambda entry: entry.get('name') if isinstance(entry, dict) else None
        ).dropna()

    def _paper_field(self, title, column):
        """Return ``column`` for the first row whose title equals ``title``.

        Raises:
            IndexError: if no row has that title.
        """
        matches = self.df.loc[self.df['title'] == title, column]
        if matches.empty:
            raise IndexError(f"No paper titled {title!r} in {self.json_path}")
        return matches.iloc[0]

    # ------------------------------------------------------------------
    # Result collection
    # ------------------------------------------------------------------
    def add_result(self, question, answer, document_name="", notes=""):
        """Append one Question/Answer/Document/Notes record to ``self.results``.

        ``document_name`` overrides the recorded document; when it is empty
        (the default) the path of the loaded JSON file is used.
        """
        self.results.append({
            "Question": question,
            "Answer": answer,
            "Document": document_name or self.json_path,
            "Notes": notes
        })

    def get_authorId(self):
        """Record the author ID(s) found in the file."""
        author_id = self._unique_joined('authorId')
        self.add_result(f"What is the author ID of {self.prof_name}?", author_id)

    def get_hIndex(self):
        """Record the H-index value(s) found in the file."""
        h_index = self._unique_joined('authorHIndex')
        self.add_result(f"What is the H-index of {self.prof_name}?", h_index)

    def get_OpenAccessCount(self):
        """Record how many rows have ``isOpenAccess`` equal to True."""
        open_access_papers = int((self.df['isOpenAccess'] == True).sum())
        self.add_result(f"How many papers has {self.prof_name} published in open access journals?", str(open_access_papers))

    def get_authors_papers(self):
        """Record all paper titles as one comma-separated answer."""
        papers = self.df['title'].tolist()
        self.add_result(f"What are the papers of {self.prof_name}?", ', '.join(papers))

    def get_author_CitationCount(self):
        """Record the largest ``authorCitationCount`` value in the file."""
        citation_count = self.df['authorCitationCount'].max()
        self.add_result(f"What is the author citation count of {self.prof_name}?", str(citation_count))

    def get_journals(self):
        """Record the distinct journal names."""
        journals_string = ', '.join(self._journal_names().unique())
        self.add_result(f"What journals has {self.prof_name} published in?", journals_string)

    def get_journal_count(self):
        """Record how many papers appear in each journal.

        A single journal is reported like any other; the fallback text is
        used only when no row carries a journal name.
        """
        journal_counts = self._journal_names().value_counts().to_dict()
        if journal_counts:
            answer = ', '.join(f"{count} in {journal}" for journal, count in journal_counts.items())
        else:
            answer = 'No journal data available.'

        self.add_result(f"What are the journals and how many papers has {self.prof_name} published in each?", answer)

    def get_venues(self):
        """Record the distinct venues, skipping missing or empty venue values."""
        venues = [venue for venue in self.df['venue'].dropna().unique() if venue != '']
        self.add_result(f"What venues has {self.prof_name} published in?", ', '.join(venues))

    def get_fieldsOfStudy(self):
        """Record the distinct fields of study across all papers."""
        fields = self.df['fieldsOfStudy'].explode().dropna().unique()
        self.add_result(f"What are the fields of study of {self.prof_name}?", ', '.join(fields))

    def get_most_cited_paper(self):
        """Record title, URL, authors, TLDR and abstract of the most cited paper.

        Returns:
            The title of the row with the highest ``citationCount``.
        """
        most_cited = self.df.loc[self.df['citationCount'].idxmax()]
        title = most_cited['title']
        self.add_result(f"What is the most cited paper from {self.prof_name}?", title)
        self.add_result(f"What is the url of the most cited paper from {self.prof_name}?", self.get_pdfurl(title))
        self.add_result(f"Who are the authors of the most cited paper from {self.prof_name}?", self.get_paper_authors(title))
        self.add_result(f"TLDR/Summary of the most cited paper from {self.prof_name}?", self.get_paper_tldr(title))
        self.add_result(f"Abstract of the most cited paper from {self.prof_name}?", self.get_paper_abstract(title))
        return title

    # ------------------------------------------------------------------
    # Per-paper lookups (return values, do not record results)
    # ------------------------------------------------------------------
    def get_pdfurl(self, title):
        """Return the open-access PDF URL of ``title`` or a fallback message.

        Unlike the other lookups this never raises for an unknown title: a
        missing row or a missing value gives 'openAccessPdf not available',
        and a value that is not a dict gives
        'openAccessPdf data format unexpected'.
        """
        matches = self.df.loc[self.df['title'] == title, 'openAccessPdf']
        if matches.empty:
            return self._PDF_NOT_AVAILABLE
        pdf_info = matches.iloc[0]
        if isinstance(pdf_info, dict):
            return pdf_info.get('url', self._PDF_NOT_AVAILABLE)
        # None or float NaN means the field was absent for this paper.
        if pdf_info is None or (isinstance(pdf_info, float) and pdf_info != pdf_info):
            return self._PDF_NOT_AVAILABLE
        return self._PDF_UNEXPECTED

    def get_paper_journal(self, title):
        """Return a journal description for ``title``.

        Returns None when the row has no journal dict or the dict has no
        name (so answers never contain the text "None"). With a name only,
        the name is returned. With volume and/or pages the result is
        ``"<name>, volume: V, pages: P; <name>"`` — presumably the ';'
        separates two acceptable answer variants; confirm with the dataset
        consumer.

        Raises:
            IndexError: if no row has that title.
        """
        journal_info = self._paper_field(title, 'journal')
        if not isinstance(journal_info, dict):
            return None
        journal = journal_info.get('name')
        if journal is None:
            return None

        volume = journal_info.get('volume')
        pages = journal_info.get('pages')
        if volume is None and pages is None:
            return journal

        detailed = f"{journal}"
        if volume is not None:
            detailed += f", volume: {volume}"
        if pages is not None:
            detailed += f", pages: {pages}"
        return f'{detailed}; {journal}'

    def get_paper_venue(self, title):
        """Return the venue of ``title`` (IndexError if the title is unknown)."""
        return self._paper_field(title, 'venue')

    def get_paper_citations(self, title):
        """Return the citation count of ``title`` (IndexError if unknown)."""
        return self._paper_field(title, 'citationCount')

    def get_paperId(self, title):
        """Return the paper ID of ``title`` (IndexError if unknown)."""
        return self._paper_field(title, 'paperId')

    def get_paper_authors(self, title, return_list = False):
        """Return the ASCII-transliterated author names of ``title``.

        Args:
            title: exact paper title.
            return_list: when True return a list of names (useful for
                picking the first author); otherwise a ', '-joined string.

        Raises:
            IndexError: if no row has that title.
        """
        authors = self._paper_field(title, 'authors')
        author_names = [unidecode(author['name']) for author in authors]
        if return_list:
            return author_names
        return ', '.join(author_names)

    def get_papers_from_venue(self, venue_name):
        """Return the titles of all papers whose venue equals ``venue_name``."""
        return self.df[self.df['venue'] == venue_name]['title'].tolist()

    def get_paper_tldr(self, title):
        """Return the ``tldr`` value of ``title`` (IndexError if unknown)."""
        return self._paper_field(title, 'tldr')

    def get_paper_abstract(self, title):
        """Return the ``abstract`` value of ``title`` (IndexError if unknown)."""
        return self._paper_field(title, 'abstract')

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    def export_to_df(self):
        """Return the collected results as a DataFrame."""
        return pd.DataFrame(self.results)

    def export_to_csv(self, filename='results.csv'):
        """Write the collected results to ``filename`` without the index column."""
        pd.DataFrame(self.results).to_csv(filename, index=False)

In [71]:
# Build the combined publication table and the question/answer dataset.
samples_per_json = 3   # number of papers sampled per faculty file for per-paper questions
sample_seed = 42       # fixed seed so re-running the notebook samples the same papers

# Accumulate pieces in lists and build each DataFrame once at the end;
# calling pd.concat inside the loop re-copies all previous rows every iteration.
json_frames = []   # one publication DataFrame per JSON file
qa_rows = []       # every Question/Answer/Document/Notes record, in generation order

for json_file in json_files:
    json_file_path = f'{base_dir}/{json_file}'
    analysis = FacultyPublicationAnalysis(json_file_path)
    analysis.get_authorId() # What is the author ID of the faculty member?
    analysis.get_authors_papers() # What are the papers of the faculty member?
    analysis.get_hIndex() # What is the H-index of the faculty member?
    analysis.get_author_CitationCount() # What is the author citation count of the faculty member?
    analysis.get_journals() # What journals has the faculty member published in?
    analysis.get_journal_count() # What are the journals and how many papers has the faculty member published in each?
    analysis.get_fieldsOfStudy() # What are the fields of study of the faculty member?
    analysis.get_OpenAccessCount() # How many papers has this faculty member published in open access journals?
    analysis.get_venues() # What venues has the faculty member published in?
    most_cited_title = analysis.get_most_cited_paper() # What is the most cited paper from the faculty member?

    analysis_df = analysis.df
    json_frames.append(analysis_df)
    qa_rows.extend(analysis.results)

    randomly_sampled_papers = analysis_df['title'].sample(
        min(len(analysis_df), samples_per_json),
        random_state=sample_seed,
        ignore_index=True,
    ).values
    for paper in randomly_sampled_papers:
        journal = analysis.get_paper_journal(paper)
        question_1 = f'What journal was the paper "{paper}" published in?'

        venue = analysis.get_paper_venue(paper)
        question_2 = f'What venue was the paper "{paper}" published in?'

        citations = analysis.get_paper_citations(paper)
        question_3 = f'How many citations does the paper "{paper}" have?'

        # Ask for the list form so the first author is taken from the data
        # itself rather than by splitting the joined string on ',' (which
        # breaks for any author name that contains a comma).
        author_list = analysis.get_paper_authors(paper, return_list=True)
        authors = ', '.join(author_list)
        question4 = f'Who are the authors of the paper "{paper}"?'

        author = author_list[0] if author_list else ''
        question5 = f'Who is the first author of the paper "{paper}"?'

        paper_id = analysis.get_paperId(paper)
        question6 = f'What is the paper ID of the paper "{paper}"?'
        question7 = f'What paper has the paper ID {paper_id}?'

        tldr = analysis.get_paper_tldr(paper)
        question8 = f"What is the summary/TLDR of the paper '{paper}'?"

        # No per-paper abstract question is generated here.
        qa_rows.extend([
            {"Question": question_1, "Answer": journal, "Document": json_file_path, "Notes": ""},
            {"Question": question_2, "Answer": venue, "Document": json_file_path, "Notes": ""},
            {"Question": question_3, "Answer": citations, "Document": json_file_path, "Notes": ""},
            {"Question": question4, "Answer": authors, "Document": json_file_path, "Notes": ""},
            {"Question": question5, "Answer": author, "Document": json_file_path, "Notes": ""},
            {"Question": question6, "Answer": paper_id, "Document": json_file_path, "Notes": ""},
            {"Question": question7, "Answer": paper, "Document": json_file_path, "Notes": ""},
            {"Question": question8, "Answer": tldr, "Document": json_file_path, "Notes": ""},
        ])

# pd.concat raises on an empty list, so fall back to an empty frame when no files were found.
combined_json_df = pd.concat(json_frames, ignore_index=True) if json_frames else pd.DataFrame()
combined_results_df = pd.DataFrame(qa_rows)

In [72]:
# Persist the combined publication table and the QA dataset.
# Create the output directory first: to_csv fails if it does not exist.
log_dir = '../data/paper_logs'
os.makedirs(log_dir, exist_ok=True)
combined_json_df.to_csv(f'{log_dir}/combined_json_data.csv', index=False)
combined_results_df.to_csv(f'{log_dir}/combined_qa_dataset.csv', index=False)

In [62]:
# Most cited faculty member (overall), by authorCitationCount
top_author_row = combined_json_df.loc[combined_json_df['authorCitationCount'].idxmax()]
most_cited_faculty = top_author_row['profName']
most_citations = combined_json_df['authorCitationCount'].max()

# The row holding the maximum authorCitationCount is an arbitrary paper of that
# author, so pick that professor's paper with the highest citationCount instead.
top_author_papers = combined_json_df[combined_json_df['profName'] == most_cited_faculty]
most_cited_paper = top_author_papers.loc[top_author_papers['citationCount'].idxmax(), 'title']


print("Who is the most cited faculty member?")
print(most_cited_faculty)
print(f"What is the number of citations of the most cited faculty member? {most_citations}")
print(f"The most cited paper is '{most_cited_paper}'")

Who is the the most cited faculty member?
Yiming Yang
What is the number of citations of the most cited faculty member? 47104
The most cited paper is 'Accelerating Diffusion-based Combinatorial Optimization Solvers by Progressive Distillation'


In [63]:
# Most cited single paper in the combined data and the faculty member it is listed under.
# The questions say "2023" — this assumes the dataset only holds 2023 papers; TODO confirm.
top_paper_row = combined_json_df.loc[combined_json_df['citationCount'].idxmax()]
most_cited_faculty = top_paper_row['profName']
most_citations = combined_json_df['citationCount'].max()
most_cited_paper = top_paper_row['title']

# The values are per-paper citation counts, so the questions refer to the paper,
# not to the faculty member's total citations.
print("Which faculty member has the most cited paper in 2023?")
print(most_cited_faculty)
print(f"What is the number of citations of the most cited paper in 2023? {most_citations}")
print(f"The most cited paper is '{most_cited_paper}'")

Who is the the most cited faculty member i?
Eric P. Xing
What is the number of citations of the most cited in 2023 faculty member? 690
The most cited paper is 'Judging LLM-as-a-judge with MT-Bench and Chatbot Arena'


In [64]:
# Number of open-access publications per faculty member, most prolific first.
# The questions built from this say "open access", so only rows flagged
# isOpenAccess are counted (a no-op if the data is already open-access only).
# value_counts already sorts in descending order, so no extra sort is needed.
open_access_rows = combined_json_df[combined_json_df['isOpenAccess'] == True]
prof_name_counts = open_access_rows['profName'].value_counts()
prof_name_counts

profName
Shinji Watanabe            58
Graham Neubig              27
Yiming Yang                25
Bhiksha Raj                20
Louis-Philippe Morency     16
Yulia Tsvetkov             16
Maarten Sap                13
Emma Strubell              10
Brian MacWhinney           10
Chenyan Xiong              10
Alexander Waibel            9
Taylor Berg-Kirkpatrick     9
Yonatan Bisk                8
Jeffrey Bigham              8
Mona Diab                   8
Lei Li                      8
Alexander Rudnicky          7
David R Mortensen           7
Rita Singh                  7
Malihe Alikhani             7
Lu Jiang                    6
Alon Lavie                  6
Daniel Fried                6
Alexander Hauptmann         5
Jamie Callan                5
Carolyn Rose                5
Matt Gormley                4
Norman Sadeh                4
Daphne Ippolito             4
Eric P. Xing                4
William Cohen               4
Eric Nyberg                 4
Roni Rosenfeld              3
F

In [65]:
# Print a question followed by the matching faculty name for the top three and the last entry of prof_name_counts.
for rank_label, position in (('most', 0), ('second most', 1), ('third most', 2), ('least', -1)):
    print(f'Which faculty member has the {rank_label} publications in open access journals in 2023?')
    print(prof_name_counts.index[position])

Which faculty member has the most publications in open access journals in 2023?
Shinji Watanabe
Which faculty member has the second most publications in open access journals in 2023?
Graham Neubig
Which faculty member has the third most publications in open access journals in 2023?
Yiming Yang
Which faculty member has the least publications in open access journals in 2023?
Scott Fahlman


# Initial Exploration

In [66]:
# NOTE(review): this rebinds base_dir to a different relative path than the '../data/paper_jsons'
# assigned near the top of the notebook — confirm which working directory this exploration
# section expects before running the notebook top to bottom.
base_dir = 'data/paper_jsons'

In [15]:
# Re-list the *.json files under the current base_dir (alphabetical), then show names and count.
json_files = [file_name for file_name in os.listdir(base_dir) if file_name.endswith('.json')]
json_files = sorted(json_files)
print(json_files)
print(len(json_files))

['A. Lavie_1784914.json', 'Alexander Hauptmann_7661726.json', 'Alexander Hauptmann_7661726_145788702.json', 'Alexander I. Rudnicky_1783635_3156164.json', 'Alexander Waibel_2064429921_1724972.json', 'B. MacWhinney_2414040.json', 'B. Raj_1681921.json', 'C. Rosé_35959897.json', 'Chenyan Xiong_144628574.json', 'Chenyan Xiong_144628574_2139787803.json', 'Chenyan Xiong_2139787803.json', 'Daniel Fried_47070750.json', 'Daphne Ippolito_7975935.json', 'David R. Mortensen_3407646.json', 'E. Xing_143977260.json', 'Emma Strubell_2268272.json', 'Eric Nyberg_144287919.json', 'Fernando Diaz_145472333.json', 'Graham Neubig_1700325.json', 'Jamie Callan_144987107.json', 'Jeffrey P. Bigham_1744846.json', 'Justine Cassell_145431806.json', 'Lei Li_143900005.json', 'Lori S. Levin_1686960.json', 'Louis-Philippe Morency_49933077.json', 'Lu Jiang_39978626.json', 'M. Ganapathiraju_32747279.json', 'Maarten Sap_2729164.json', 'Malihe Alikhani_2715920.json', 'Matthew R. Gormley_1762110.json', 'Matthias Grabmair_286

In [13]:
# Load one file (the second entry of the sorted listing) into a DataFrame and inspect its columns.
df = pd.read_json(f'{base_dir}/{json_files[1]}')
df.columns

Index(['profName', 'authorId', 'authorName', 'authorUrl', 'authorHIndex',
       'authorAffiliations', 'authorPaperCount', 'authorCitationCount',
       'paperId', 'externalIds', 'url', 'title', 'abstract', 'venue', 'year',
       'referenceCount', 'citationCount', 'influentialCitationCount',
       'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 'journal', 'authors'],
      dtype='object')

In [36]:
import json
# Load the same file as raw Python objects to see the nested structure.
# JSON text is UTF-8; pass the encoding explicitly instead of relying on the platform default.
with open(f'{base_dir}/{json_files[1]}', encoding='utf-8') as f:
    data = json.load(f)
data

[{'profName': 'Carolyn Rose',
  'authorId': '7661726',
  'authorName': 'Alexander Hauptmann',
  'authorUrl': 'https://www.semanticscholar.org/author/7661726',
  'authorHIndex': 81,
  'authorAffiliations': [],
  'authorPaperCount': 543,
  'authorCitationCount': 25325,
  'paperId': '72cce47fd053bf916314d89a8174726c58c05e02',
  'externalIds': {'DBLP': 'conf/acl/WenXHH23',
   'DOI': '10.18653/v1/2023.findings-acl.198',
   'CorpusId': 259859135},
  'url': 'https://www.semanticscholar.org/paper/72cce47fd053bf916314d89a8174726c58c05e02',
  'title': 'Towards Open-Domain Twitter User Profile Inference',
  'abstract': ',',
  'venue': 'Annual Meeting of the Association for Computational Linguistics',
  'year': 2023,
  'referenceCount': 61,
  'citationCount': 0,
  'influentialCitationCount': 0,
  'isOpenAccess': True,
  'openAccessPdf': {'url': 'https://aclanthology.org/2023.findings-acl.198.pdf',
   'status': None},
  'fieldsOfStudy': ['Computer Science'],
  'journal': {'pages': '3172-3188'},
  '

In [37]:
# Fetch the 'tldr' field of one paper from the Semantic Scholar Graph API.
# Example request: https://api.semanticscholar.org/graph/v1/paper/7848d4b4e6ba0897a85cebb6467e94eb0b60d583?fields=tldr
import requests

# The key is read from this environment variable; the error message below names the same
# variable so a missing key is easy to diagnose.
api_key_env_var = 'S2APIKEY'
API_KEY = os.environ.get(api_key_env_var)
if not API_KEY:
    raise EnvironmentError(f"{api_key_env_var} environment variable not set.")
# Set up the headers with the API key
headers = {
    "x-api-key": API_KEY
}
fields = 'tldr'
tldr_paper_id = '27ca2d927421035e10b48c96a96db32224f1f8e6'  # avoids shadowing the builtin `id`
response = requests.get( # following their git modules
    f'https://api.semanticscholar.org/graph/v1/paper/{tldr_paper_id}',
    headers=headers,
    params={'fields': fields},  # requests appends '?fields=...' itself
    timeout=30,  # do not hang the notebook on a stalled connection
)
# Fail here on an HTTP error instead of later with a confusing KeyError on the error payload.
response.raise_for_status()

tldr = response.json()


In [38]:
# Show the summary text; yields None instead of raising when the response has no (or a null) 'tldr' entry.
(tldr.get('tldr') or {}).get('text')

In [39]:
# Store the fetched summary text on the first record under a new 'tldr' key, then display all records.
# NOTE(review): this raises TypeError if tldr['tldr'] is None — presumably possible when a paper has no summary; confirm.
data[0]['tldr'] = tldr['tldr']['text']
data

[{'profName': 'Carolyn Rose',
  'authorId': '7661726',
  'authorName': 'Alexander Hauptmann',
  'authorUrl': 'https://www.semanticscholar.org/author/7661726',
  'authorHIndex': 81,
  'authorAffiliations': [],
  'authorPaperCount': 543,
  'authorCitationCount': 25325,
  'paperId': '72cce47fd053bf916314d89a8174726c58c05e02',
  'externalIds': {'DBLP': 'conf/acl/WenXHH23',
   'DOI': '10.18653/v1/2023.findings-acl.198',
   'CorpusId': 259859135},
  'url': 'https://www.semanticscholar.org/paper/72cce47fd053bf916314d89a8174726c58c05e02',
  'title': 'Towards Open-Domain Twitter User Profile Inference',
  'abstract': ',',
  'venue': 'Annual Meeting of the Association for Computational Linguistics',
  'year': 2023,
  'referenceCount': 61,
  'citationCount': 0,
  'influentialCitationCount': 0,
  'isOpenAccess': True,
  'openAccessPdf': {'url': 'https://aclanthology.org/2023.findings-acl.198.pdf',
   'status': None},
  'fieldsOfStudy': ['Computer Science'],
  'journal': {'pages': '3172-3188'},
  '

In [None]:
# Distinct author IDs present in this file.
set(df['authorId'])

In [None]:
# PDF URL and paper URL of the first row.
# Read the 'url' key directly rather than taking the dict's first value, which depends on key order.
df.openAccessPdf[0]['url'], df.url[0]

In [None]:
# H-index of the professor: the largest authorHIndex value in the file.
h_index = df['authorHIndex'].max()
h_index

In [None]:
# Title of the row with the highest citationCount.
most_cited_paper = df.loc[df['citationCount'].idxmax(), 'title']
most_cited_paper

In [None]:
# Look up a paper's open-access PDF URL by its exact title.
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
url = df.loc[df['title'] == title, 'openAccessPdf']
url.values[0]['url']

In [None]:
# Print the journal name and then the venue of a paper found by its exact title.
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
matching_rows = df[df['title'] == title]
journal = matching_rows['journal'].map(lambda entry: entry.get('name') if isinstance(entry, dict) else None).iloc[0]
print(journal)

# Note: `journal` is reused here and ends up holding the venue.
journal = matching_rows['venue'].values[0]
print(journal)

In [None]:
# What is the most cited paper from this faculty member and its URL?
most_cited_row = df.loc[df['citationCount'].idxmax()]
most_cited_paper = most_cited_row['title']
# Read the 'url' key directly rather than taking the dict's first value, which depends on key order.
most_cited_url = most_cited_row['openAccessPdf']['url']
most_cited_paper, most_cited_url

In [None]:
# Names of all authors of the row with the highest citationCount.
most_cited_authors = df.loc[df['citationCount'].idxmax(), 'authors']
author_names = [entry['name'] for entry in most_cited_authors]
author_names

In [None]:
# First listed author of the row with the highest citationCount.
most_cited_authors = df.loc[df['citationCount'].idxmax(), 'authors']
author_names = [entry['name'] for entry in most_cited_authors]
author_names[0]

In [None]:
# Who are the authors of the paper [title]?
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
# Use the authors of the row matching `title`; previously the names were built from
# `most_cited_authors`, a variable left over from another cell, so the title was ignored.
paper_authors = df[df['title'] == title]['authors'].iloc[0]
author_names = [author['name'] for author in paper_authors]
test = ', '.join(author_names)
test

In [None]:
# Sum of citationCount over every row in the file.
total_citations = df.citationCount.sum()
total_citations

In [None]:
# Distinct venues other than 'arXiv.org', joined into one comma-separated string.
conferences_last_year = df.loc[df['venue'].ne('arXiv.org'), 'venue'].unique()
conferences_string = ', '.join(conferences_last_year)
conferences_string

In [None]:
# Count of rows whose isOpenAccess flag is True.
open_access_papers = len(df[df['isOpenAccess'] == True])
open_access_papers

In [None]:
# Distinct journal names (rows without a journal dict or without a name are skipped), comma-separated.
journal_names = df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).unique()
journals_filtered = [name for name in journal_names if name is not None]
journals_string = ', '.join(journals_filtered)
journals_string

In [None]:
# Venue of the paper with this exact title.
paper_title = 'DocumentNet: Bridging the Data Gap in Document Pre-training'
venue = df.loc[df['title'] == paper_title, 'venue'].values[0]
venue

In [None]:
# Title stored under index label 1.
df['title'][1]

In [None]:
# All paper titles in the file.
df['title']

In [None]:
# Venue of the first row whose title equals the title stored under index label 2.
venue = df.loc[df['title'] == df['title'][2], 'venue'].values[0]
venue

In [None]:
# Which venue was this paper published in? (lookup by exact title)
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
paper_title = title
venue = df.loc[df['title'] == paper_title, 'venue'].values[0]
venue

In [None]:

samples = 3

randomly_sampled_papers = df['title'].sample(min(len(df), samples), ignore_index=True).values
for title in randomly_sampled_papers:
    journal = analysis.get_paper_journal(title)
    question = f'What journal was the paper "{title}" published in?'
    
    venue = analysis.get_paper_venue(title)
    citations = analysis.get_paper_citations(title)
    authors = analysis.get_paper_authors(title)
    author = authors.split(',')[0]
    