In [23]:
import sys
import pandas as pd
import numpy as np
import os

### Questions:
- what is the author id of this professor?
- what is the paperId of this paper?
- what are the papers of this author?
- what papers were published in this in this venue (conference)?
- What is the H- index of Professor X?
- What is the author citation count of Professor X? (authorCitationCount)
- What is the most cited paper from this faculty member?
- What is the most cited paper from this faculty member and its URL?
- Who are the authors of the most cited paper?
- who is the first author of this given paper (user gives title)?
- How many papers has this faculty member published in open access journals?
- What are the journals that this faculty member has published in?
- What are the journals that this faculty member has published in, and how many papers in each journal?
- What are the fields of study of this faculty member? (fieldsOfStudy)
- Which venue was this paper published in?


In [24]:
base_dir = 'data/paper_jsons'

In [25]:
json_files = [pos_json for pos_json in os.listdir(base_dir) if pos_json.endswith('.json')]
json_files.sort()
print(json_files)
print(len(json_files))

['A. Lavie_1784914.json', 'Alexander Hauptmann_7661726_145788702.json', 'Alexander I. Rudnicky_1783635_3156164.json', 'Alexander Waibel_2064429921_1724972.json', 'B. MacWhinney_2414040.json', 'B. Raj_1681921.json', 'C. Rosé_35959897.json', 'Chenyan Xiong_144628574.json', 'Chenyan Xiong_144628574_2139787803.json', 'Chenyan Xiong_2139787803.json', 'Daniel Fried_47070750.json', 'Daphne Ippolito_7975935.json', 'David R. Mortensen_3407646.json', 'E. Xing_143977260.json', 'Emma Strubell_2268272.json', 'Eric Nyberg_144287919.json', 'Fernando Diaz_145472333.json', 'Graham Neubig_1700325.json', 'Jamie Callan_144987107.json', 'Jeffrey P. Bigham_1744846.json', 'Justine Cassell_145431806.json', 'Lei Li_143900005.json', 'Lori S. Levin_1686960.json', 'Louis-Philippe Morency_49933077.json', 'Lu Jiang_39978626.json', 'M. Ganapathiraju_32747279.json', 'Maarten Sap_2729164.json', 'Malihe Alikhani_2715920.json', 'Matthew R. Gormley_1762110.json', 'Matthias Grabmair_2869551.json', 'Mona T. Diab_1700007_21

In [26]:
# Class to get questions:
class FacultyPublicationAnalysis:
    def __init__(self, json_path):
        self.df = pd.read_json(json_path)
        self.json_path = json_path
        self.df = self.df
        self.prof_name = self.df['profName'].iloc[0]  
        self.results = []

    def add_result(self, question, answer, document_name="", notes=""):
        self.results.append({
            "Question": question,
            "Answer": answer,
            "Document":  self.json_path,
            "Notes": notes
        })

    def get_authorId(self):
        author_id = set(self.df['authorId'])
        if len(author_id) >1:
            author_id = ', '.join([str(x) for x in author_id])
        else:
            author_id = author_id.pop()
        self.add_result(f"What is the author ID of {self.prof_name}?", str(author_id))

    def get_hIndex(self):
        h_index = set(self.df['authorHIndex'])
        if len(h_index) >1:
            h_index = ', '.join([str(x) for x in h_index])
        else:
            h_index = h_index.pop()
        self.add_result(f"What is the H-index of {self.prof_name}?", str(h_index))
        
    def get_paperId(self, title):
        paper_id = self.df[self.df['title'] == title]['paperId'].iloc[0]
        self.add_result(f"What is the paper ID of '{title}'?", str(paper_id), title)

    def get_OpenAccessCount(self):
        # How many papers has this faculty member published in open access journals?
        open_access_papers = self.df[self.df['isOpenAccess'] == True].shape[0]
        self.add_result(f"How many papers has {self.prof_name} published in open access journals?", str(open_access_papers))

    def get_authors_papers(self):
        papers = self.df['title'].tolist()
        self.add_result(f"What are the papers of {self.prof_name}?", ', '.join(papers))


    def get_author_CitationCount(self):
        citation_count = self.df['authorCitationCount'].max()
        self.add_result(f"What is the author citation count of {self.prof_name}?", str(citation_count))

    def get_journals(self):
        journals = self.df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).dropna().unique()
        journals_string = ', '.join(journals)
        self.add_result(f"What journals has {self.prof_name} published in?", journals_string)

    def get_journal_count(self):
        journals = self.df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).dropna()
        journal_counts = journals.value_counts().to_dict()
        self.add_result(f"What are the journals and how many papers has {self.prof_name} published in each?", str(journal_counts))

    def get_venues(self):
        journals_filtered = [journal for journal in self.df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).unique() if journal is not None]
        journals_string = ', '.join(journals_filtered)
        venues = self.df['venue'].unique()
        self.add_result(f"What venues has {self.prof_name} published in?", ', '.join(venues))
    
    
    def get_fieldsOfStudy(self):
        fields = self.df['fieldsOfStudy'].explode().dropna().unique()
        self.add_result(f"What are the fields of study of {self.prof_name}?", ', '.join(fields))

    def get_most_cited_paper(self):
        most_cited = self.df.loc[self.df['citationCount'].idxmax()]
        self.add_result(f"What is the most cited paper from {self.prof_name}?", most_cited['title'])
        url = self.get_pdfurl(most_cited['title'])
        self.add_result(f"What is the url of the most cited paper from {self.prof_name}?", url)
        authors = self.get_paper_authors(most_cited['title'])
        self.add_result(f"Who are the authors of the most cited paper from {self.prof_name}?", authors)
        return most_cited['title']
    
    def get_pdfurl(self, title): # asked ChatGpt because was getting none errr before and my handling was not working
        pdf_data = self.df[self.df['title'] == title]['openAccessPdf']
        if not pdf_data.empty and pd.notna(pdf_data.iloc[0]):
            try:
                url = pdf_data.iloc[0].get('url', 'openAccessPdf not available')
            except AttributeError:
                url = 'openAccessPdf data format unexpected'
        else:
            url = 'openAccessPdf not available'
        return url


    def get_paper_journal(self, title):
        journal = self.df[self.df['title'] == title]['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).iloc[0]
        return journal

    def get_paper_venue(self, title):
        venue = self.df[self.df['title'] == title]['venue'].values[0]
        return venue
    
    def get_paper_citations(self, title):
        citations = self.df[self.df['title'] == title]['citationCount'].values[0]
        return citations
    
    def get_paperId(self, title):
        paper_id = self.df[self.df['title'] == title]['paperId'].values[0]
        return paper_id
    
    def get_paper_authors(self, title, return_list = False):
        authors = self.df[self.df['title'] == title]['authors'].iloc[0]
        author_names = [author['name'] for author in authors] 
        if return_list is False: # so we can get first authors
            author_names = ', '.join(author_names)
        return author_names
   
    def get_papers_from_venue(self, venue_name):
        papers = self.df[self.df['venue'] == venue_name]['title'].tolist()
        question = f"What papers were published in the venue {venue_name}?"
        return papers
        # self.add_result(f"What papers were published in the venue {venue_name}?", ', '.join(papers), notes=venue_name)
    
    def export_to_df(self):
        return pd.DataFrame(self.results)

    def export_to_csv(self, filename='results.csv'):
        pd.DataFrame(self.results).to_csv(filename, index=False)

In [27]:
samples_per_json = 3
combined_json_df = pd.DataFrame()
combined_results_df = pd.DataFrame()

for json_file in json_files:
    analysis = FacultyPublicationAnalysis(f'{base_dir}/{json_file}')
    analysis.get_authorId() # What is the author ID of the faculty member?
    analysis.get_authors_papers() # What are the papers of the faculty member?
    analysis.get_hIndex() # What is the H-index of the faculty member?
    analysis.get_author_CitationCount() # What is the author citation count of the faculty member?
    analysis.get_journals() # What journals has the faculty member published in?
    analysis.get_journal_count() # What are the journals and how many papers has the faculty member published in each?
    analysis.get_fieldsOfStudy() # What are the fields of study of the faculty member?
    analysis.get_OpenAccessCount() # How many papers has this faculty member published in open access journals?
    analysis.get_venues() # What venues has the faculty member published in?
    most_cited_title = analysis.get_most_cited_paper() # What is the most cited paper from the faculty member?

    analysis_df = analysis.df
    combined_json_df = pd.concat([combined_json_df, analysis_df], ignore_index=True)
    
    result_df = analysis.export_to_df()
    
    samples = samples_per_json
    randomly_sampled_papers = analysis_df['title'].sample(min(len(analysis_df), samples), ignore_index=True).values
    new_questions = []
    for paper in randomly_sampled_papers:
        journal = analysis.get_paper_journal(paper)
        question_1 = f'What journal was the paper "{paper}" published in?'
        
        venue = analysis.get_paper_venue(paper)
        question_2 = f'What venue was the paper "{paper}" published in?'
        
        citations = analysis.get_paper_citations(paper)
        question_3 = f'How many citations does the paper "{paper}" have?'
        
        authors = analysis.get_paper_authors(paper)
        question4 = f'Who are the authors of the paper "{paper}"?'
        
        author = authors.split(',')[0]
        question5 = f'Who is the first author of the paper "{paper}"?'
        
        paper_id = analysis.get_paperId(paper)
        question6 = f'What is the paper ID of the paper "{paper}"?'
        question7 = f'What paper has the paper ID {paper_id}?'
        
        new_data = [
        {"Question": question_1, "Answer": journal, "Document": json_file, "Notes": ""},
        {"Question": question_2, "Answer": venue, "Document": json_file, "Notes": ""},
        {"Question": question_3, "Answer": citations, "Document": json_file, "Notes": ""},
        {"Question": question4, "Answer": authors, "Document": json_file, "Notes": ""},
        {"Question": question5, "Answer": author, "Document": json_file, "Notes": ""},
        {"Question": question6, "Answer": paper_id, "Document": json_file, "Notes": ""},
        {"Question": question7, "Answer": paper, "Document": json_file, "Notes": ""}
        ]
        new_results_df = pd.DataFrame(new_data)
        result_df = pd.concat([result_df, new_results_df], ignore_index=True)
    combined_results_df = pd.concat([combined_results_df, result_df], ignore_index=True)

In [28]:
combined_json_df.to_csv('data/paper_logs/combined_json_data.csv', index=False)
combined_results_df.to_csv('data/paper_logs/combined_qa_dataset.csv', index=False)

In [32]:
# Most cited faculy member (overall)
most_cited_faculty = combined_json_df.loc[combined_json_df['authorCitationCount'].idxmax()]['profName']
most_citations = combined_json_df['authorCitationCount'].max()
most_cited_paper = combined_json_df.loc[combined_json_df['authorCitationCount'].idxmax()]['title']

print(f"The most cited faculty member is {most_cited_faculty}")
print(f"Number of citations: {most_citations}")
print(f"The most cited paper is '{most_cited_paper}'")

The most cited faculty member is Eric P. Xing
Number of citations: 46108
The most cited paper is 'Identification of Nonlinear Latent Hierarchical Models'


In [33]:
# Most cited faculy member (2023)
most_cited_faculty = combined_json_df.loc[combined_json_df['citationCount'].idxmax()]['profName']
most_citations = combined_json_df['citationCount'].max()
most_cited_paper = combined_json_df.loc[combined_json_df['citationCount'].idxmax()]['title']

print(f"The most cited faculty member is {most_cited_faculty}")
print(f"Number of citations: {most_citations}")
print(f"The most cited paper is '{most_cited_paper}'")

The most cited faculty member is Eric P. Xing
Number of citations: 680
The most cited paper is 'Judging LLM-as-a-judge with MT-Bench and Chatbot Arena'


In [41]:
# most no of publications in 2023, open Access
prof_name_counts = combined_json_df['profName'].value_counts().sort_values(ascending=False)
# len(prof_name_counts)
prof_name_counts

profName
Shinji Watanabe            58
Graham Neubig              27
Yiming Yang                24
Chenyan Xiong              20
Bhiksha Raj                20
Louis-Philippe Morency     16
Yulia Tsvetkov             16
Maarten Sap                13
Emma Strubell              10
Brian MacWhinney           10
Alexander Waibel            9
Taylor Berg-Kirkpatrick     9
Yonatan Bisk                8
Jeffrey Bigham              8
Mona Diab                   8
Lei Li                      8
Malihe Alikhani             7
David R Mortensen           7
Alexander Rudnicky          7
Rita Singh                  7
Alon Lavie                  6
Daniel Fried                6
Lu Jiang                    6
Jamie Callan                5
Alexander Hauptmann         5
Carolyn Rose                5
Matt Gormley                4
Norman Sadeh                4
Daphne Ippolito             4
Eric P. Xing                4
William Cohen               4
Eric Nyberg                 4
Justine Cassell             3
F

In [42]:
print('Which faculty member has the most publications in open access journals in 2023?')
print(prof_name_counts.index[0])
print('Which faculty member has the second most publications in open access journals in 2023?')
print(prof_name_counts.index[1])
print('Which faculty member has the most publications in open access journals in 2023?')
print(prof_name_counts.index[2])
print('Which faculty member has the least publications in open access journals in 2023?')
print(prof_name_counts.index[-1])

Which faculty member has the most publications in open access journals in 2023?
Shinji Watanabe
Which faculty member has the second most publications in open access journals in 2023?
Graham Neubig
Which faculty member has the most publications in open access journals in 2023?
Yiming Yang
Which faculty member has the least publications in open access journals in 2023?
Scott Fahlman


# Initial Exploration

In [43]:
base_dir = 'data/paper_jsons'

In [44]:
json_files = [pos_json for pos_json in os.listdir(base_dir) if pos_json.endswith('.json')]
json_files.sort()
print(json_files)
print(len(json_files))

['A. Lavie_1784914.json', 'Alexander Hauptmann_7661726_145788702.json', 'Alexander I. Rudnicky_1783635_3156164.json', 'Alexander Waibel_2064429921_1724972.json', 'B. MacWhinney_2414040.json', 'B. Raj_1681921.json', 'C. Rosé_35959897.json', 'Chenyan Xiong_144628574.json', 'Chenyan Xiong_144628574_2139787803.json', 'Chenyan Xiong_2139787803.json', 'Daniel Fried_47070750.json', 'Daphne Ippolito_7975935.json', 'David R. Mortensen_3407646.json', 'E. Xing_143977260.json', 'Emma Strubell_2268272.json', 'Eric Nyberg_144287919.json', 'Fernando Diaz_145472333.json', 'Graham Neubig_1700325.json', 'Jamie Callan_144987107.json', 'Jeffrey P. Bigham_1744846.json', 'Justine Cassell_145431806.json', 'Lei Li_143900005.json', 'Lori S. Levin_1686960.json', 'Louis-Philippe Morency_49933077.json', 'Lu Jiang_39978626.json', 'M. Ganapathiraju_32747279.json', 'Maarten Sap_2729164.json', 'Malihe Alikhani_2715920.json', 'Matthew R. Gormley_1762110.json', 'Matthias Grabmair_2869551.json', 'Mona T. Diab_1700007_21

In [45]:
df = pd.read_json(f'{base_dir}/{json_files[1]}')
df.columns

Index(['profName', 'authorId', 'authorName', 'authorUrl', 'authorHIndex',
       'authorAffiliations', 'authorPaperCount', 'authorCitationCount',
       'paperId', 'externalIds', 'url', 'title', 'abstract', 'venue', 'year',
       'referenceCount', 'citationCount', 'influentialCitationCount',
       'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 'journal', 'authors'],
      dtype='object')

In [48]:
# what is the author id of this professor?
set(df.authorId)

{7661726, 145788702}

In [13]:
# how to get the url 
list(df.openAccessPdf[0].values())[0], df.url[0]

('https://aclanthology.org/2023.findings-acl.198.pdf',
 'https://www.semanticscholar.org/paper/72cce47fd053bf916314d89a8174726c58c05e02')

In [14]:
# What is the H-index of Professor X?
h_index = df.authorHIndex.max()
h_index

81

In [15]:
# What is the most cited paper from this faculty member?
most_cited_paper = df.loc[df['citationCount'].idxmax()]['title']
most_cited_paper

'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'

In [85]:
# get url from title
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
url = df[df['title'] == title]['openAccessPdf']
url.values[0]['url']

'http://arxiv.org/pdf/2306.17842'

In [91]:
# get journal from title
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
journal = df[df['title'] == title]['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).iloc[0]
print(journal)

journal = df[df['title'] == title]['venue'].values[0]
print(journal)

ArXiv
arXiv.org


In [16]:
# What is the most cited paper from this faculty member and its URL?
most_cited_paper = df.loc[df['citationCount'].idxmax()]['title']
most_cited_url = list(df.loc[df['citationCount'].idxmax()]['openAccessPdf'].values())[0]
most_cited_paper, most_cited_url

('SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs',
 'http://arxiv.org/pdf/2306.17842')

In [17]:
# Who are the authors of the most cited paper?
most_cited_authors = df.loc[df['citationCount'].idxmax()]['authors']
author_names = [author['name'] for author in most_cited_authors]
author_names

['Lijun Yu',
 'Yong Cheng',
 'Zhiruo Wang',
 'Vivek Kumar',
 'Wolfgang Macherey',
 'Yanping Huang',
 'David A. Ross',
 'Irfan Essa',
 'Yonatan Bisk',
 'Ming Yang',
 'K. Murphy',
 'A. Hauptmann',
 'Lu Jiang']

In [18]:
# who is the first author of this paper?
most_cited_authors = df.loc[df['citationCount'].idxmax()]['authors']
author_names = [author['name'] for author in most_cited_authors]
author_names[0]

'Lijun Yu'

In [41]:
# Who are the authors of the paper [title]?
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
author_names = df[df['title']==title]['authors']
author_names = [author['name'] for author in most_cited_authors]
test = ', '.join(author_names)
test

'Lijun Yu, Yong Cheng, Zhiruo Wang, Vivek Kumar, Wolfgang Macherey, Yanping Huang, David A. Ross, Irfan Essa, Yonatan Bisk, Ming Yang, K. Murphy, A. Hauptmann, Lu Jiang'

In [20]:
total_citations = df['citationCount'].sum()
total_citations

15

In [21]:
# Filter the DataFrame to exclude 'arXiv.org' from venues and get the unique venues
conferences_last_year = df[df['venue'] != 'arXiv.org']['venue'].unique()
conferences_string = ', '.join(conferences_last_year)
conferences_string

'Annual Meeting of the Association for Computational Linguistics, Computer Vision and Pattern Recognition, Conference on Empirical Methods in Natural Language Processing'

In [22]:
# How many papers has this faculty member published in open access journals?
open_access_papers = df[df['isOpenAccess'] == True].shape[0]
open_access_papers

5

In [23]:
journals_filtered = [journal for journal in df['journal'].apply(lambda x: x.get('name') if isinstance(x, dict) else None).unique() if journal is not None]
journals_string = ', '.join(journals_filtered)
journals_string

'ArXiv, 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)'

In [24]:
# Which venue was this paper published in?
paper_title = 'DocumentNet: Bridging the Data Gap in Document Pre-training'
venue = df[df['title'] == paper_title]['venue'].values[0]
venue

'Conference on Empirical Methods in Natural Language Processing'

In [26]:
df.title[1]

'Zero-Shot and Few-Shot Stance Detection on Varied Topics via Conditional Generation'

In [32]:
df.title

0    Towards Open-Domain Twitter User Profile Infer...
1    Zero-Shot and Few-Shot Stance Detection on Var...
2    SPAE: Semantic Pyramid AutoEncoder for Multimo...
3    STMT: A Spatial-Temporal Mesh Transformer for ...
4    DocumentNet: Bridging the Data Gap in Document...
Name: title, dtype: object

In [39]:
# Which venue was this paper published in?
venue = df[df['title'] == df.title[2]]['venue'].values[0]
venue

'arXiv.org'

In [34]:
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
# Which venue was this paper published in?
paper_title = title
venue = df[df['title'] == paper_title]['venue'].values[0]
venue

'arXiv.org'

In [170]:

samples = 3

randomly_sampled_papers = df['title'].sample(min(len(df), samples), ignore_index=True).values
for title in randomly_sampled_papers:
    journal = analysis.get_paper_journal(title)
    question = f'What journal was the paper "{title}" published in?'
    
    venue = analysis.get_paper_venue(title)
    citations = analysis.get_paper_citations(title)
    authors = analysis.get_paper_authors(title)
    author = authors.split(',')[0]
    