In [9]:
import csv
import requests
import json
import sys
from requests import Session
from typing import Generator, Union
from requests import Session
import subprocess
import pandas as pd
import logging 
import numpy as np
import os

In [10]:
# Folder (relative to the working directory) that holds the paper JSON files.
base_dir = "data/paper_jsons"

In [4]:
# Gather the name of every *.json entry found in base_dir, in sorted order.
json_files = sorted(
    entry for entry in os.listdir(base_dir) if entry.endswith('.json')
)
# Show the file names, then how many there are.
print(json_files)
print(len(json_files))

['A. Lavie_1784914.json', 'Alexander Hauptmann_7661726_145788702.json', 'Alexander I. Rudnicky_1783635_3156164.json', 'Alexander Waibel_2064429921_1724972.json', 'B. MacWhinney_2414040.json', 'B. Raj_1681921.json', 'C. Rosé_35959897.json', 'Chenyan Xiong_144628574.json', 'Chenyan Xiong_144628574_2139787803.json', 'Chenyan Xiong_2139787803.json', 'Daniel Fried_47070750.json', 'Daphne Ippolito_7975935.json', 'David R. Mortensen_3407646.json', 'E. Xing_143977260.json', 'Emma Strubell_2268272.json', 'Eric Nyberg_144287919.json', 'Fernando Diaz_145472333.json', 'Graham Neubig_1700325.json', 'Jamie Callan_144987107.json', 'Jeffrey P. Bigham_1744846.json', 'Justine Cassell_145431806.json', 'Lei Li_143900005.json', 'Lori S. Levin_1686960.json', 'Louis-Philippe Morency_49933077.json', 'Lu Jiang_39978626.json', 'M. Ganapathiraju_32747279.json', 'Maarten Sap_2729164.json', 'Malihe Alikhani_2715920.json', 'Matthew R. Gormley_1762110.json', 'Matthias Grabmair_2869551.json', 'Mona T. Diab_1700007_21

In [11]:
# Load one file (the second name in the sorted listing) into a DataFrame
# and display which columns it provides.
paper_json_path = f'{base_dir}/{json_files[1]}'
df = pd.read_json(paper_json_path)
df.columns

Index(['profName', 'authorId', 'authorName', 'authorUrl', 'authorHIndex',
       'authorAffiliations', 'authorPaperCount', 'authorCitationCount',
       'paperId', 'externalIds', 'url', 'title', 'abstract', 'venue', 'year',
       'referenceCount', 'citationCount', 'influentialCitationCount',
       'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 'journal', 'authors'],
      dtype='object')

In [12]:
# What is the author id of this professor?
# Collapse the authorId column into a set so each distinct id shows up once.
set(df['authorId'].tolist())

{7661726, 145788702}

In [13]:
# How to get the URLs for the first paper: the first value stored in its
# openAccessPdf dict, paired with the entry of the url column.
first_pdf_info = df['openAccessPdf'][0]
list(first_pdf_info.values())[0], df['url'][0]

('https://aclanthology.org/2023.findings-acl.198.pdf',
 'https://www.semanticscholar.org/paper/72cce47fd053bf916314d89a8174726c58c05e02')

In [14]:
# What is the H-index of Professor X?
# Use the largest authorHIndex value found across the rows of the file.
h_index = df['authorHIndex'].max()
h_index

81

In [15]:
# What is the most cited paper from this faculty member?
# idxmax() yields the index label of the row with the highest citationCount.
top_cited_row = df.loc[df['citationCount'].idxmax()]
most_cited_paper = top_cited_row['title']
most_cited_paper

'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'

In [16]:
# What is the most cited paper from this faculty member and its URL?
# Look the top-cited row up once and read both answers from it.
top_cited_row = df.loc[df['citationCount'].idxmax()]
most_cited_paper = top_cited_row['title']
# The URL is taken as the first value of the row's openAccessPdf dict.
most_cited_url = list(top_cited_row['openAccessPdf'].values())[0]
most_cited_paper, most_cited_url

('SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs',
 'http://arxiv.org/pdf/2306.17842')

In [17]:
# Who are the authors of the most cited paper?
top_cited_row = df.loc[df['citationCount'].idxmax()]
most_cited_authors = top_cited_row['authors']
# Each author entry is indexed by 'name'; keep just that field, in order.
author_names = [entry['name'] for entry in most_cited_authors]
author_names

['Lijun Yu',
 'Yong Cheng',
 'Zhiruo Wang',
 'Vivek Kumar',
 'Wolfgang Macherey',
 'Yanping Huang',
 'David A. Ross',
 'Irfan Essa',
 'Yonatan Bisk',
 'Ming Yang',
 'K. Murphy',
 'A. Hauptmann',
 'Lu Jiang']

In [18]:
# Who is the first author of this paper (the most cited one)?
top_cited_row = df.loc[df['citationCount'].idxmax()]
most_cited_authors = top_cited_row['authors']
author_names = [entry['name'] for entry in most_cited_authors]
# The first author is the first name in the list.
first_author = author_names[0]
first_author

'Lijun Yu'

In [19]:
# Who are the authors of the paper [title]?
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
# Look the paper up by its title rather than reusing `most_cited_authors`
# from an earlier cell, so the answer always belongs to `title`.
title_matches = df.loc[df['title'] == title, 'authors']
# Use the first row whose title matches; an unknown title gives an empty list.
paper_authors = title_matches.iloc[0] if not title_matches.empty else []
author_names = [author['name'] for author in paper_authors]
author_names


['Lijun Yu',
 'Yong Cheng',
 'Zhiruo Wang',
 'Vivek Kumar',
 'Wolfgang Macherey',
 'Yanping Huang',
 'David A. Ross',
 'Irfan Essa',
 'Yonatan Bisk',
 'Ming Yang',
 'K. Murphy',
 'A. Hauptmann',
 'Lu Jiang']

In [20]:
# Add up citationCount over every paper row in the file.
total_citations = df.citationCount.sum()
total_citations

15

In [21]:
# Distinct venues of the loaded papers, leaving out the 'arXiv.org' entries.
# NOTE(review): nothing here filters on year despite the "_last_year" name —
# confirm the loaded file only covers the intended year.
not_arxiv = df['venue'] != 'arXiv.org'
conferences_last_year = df.loc[not_arxiv, 'venue'].unique()
# Present the venues as one comma-separated string.
conferences_string = ', '.join(conferences_last_year)
conferences_string

'Annual Meeting of the Association for Computational Linguistics, Computer Vision and Pattern Recognition, Conference on Empirical Methods in Natural Language Processing'

In [22]:
# How many of this faculty member's papers are flagged as open access?
# Count the rows whose isOpenAccess value equals True.
is_open = df['isOpenAccess'] == True
open_access_papers = len(df[is_open])
open_access_papers

5

In [23]:
# Not every 'journal' entry is a dict (some are None), so read the 'name'
# only from dict entries and drop the missing results before joining.
def _journal_name(entry):
    """Return the 'name' of a dict journal entry, or None for anything else."""
    return entry.get('name') if isinstance(entry, dict) else None

unique_journal_names = df['journal'].apply(_journal_name).unique()
journals_filtered = [name for name in unique_journal_names if name is not None]
journals_string = ', '.join(journals_filtered)
journals_string

'ArXiv, 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)'

In [24]:
# Which venue was this paper published in?
paper_title = 'DocumentNet: Bridging the Data Gap in Document Pre-training'
title_mask = df['title'] == paper_title
# Take the venue of the first row whose title matches exactly.
venue = df.loc[title_mask, 'venue'].values[0]
venue

'Conference on Empirical Methods in Natural Language Processing'

In [26]:
# Title stored under index label 1.
df['title'][1]

'Zero-Shot and Few-Shot Stance Detection on Varied Topics via Conditional Generation'

In [32]:
# Show the whole title column to see which papers the file contains.
df['title']

0    Towards Open-Domain Twitter User Profile Infer...
1    Zero-Shot and Few-Shot Stance Detection on Var...
2    SPAE: Semantic Pyramid AutoEncoder for Multimo...
3    STMT: A Spatial-Temporal Mesh Transformer for ...
4    DocumentNet: Bridging the Data Gap in Document...
Name: title, dtype: object

In [39]:
# Which venue was this paper published in?
# Use the title stored under index label 2 as the lookup key.
lookup_title = df['title'][2]
venue = df.loc[df['title'] == lookup_title, 'venue'].values[0]
venue

'arXiv.org'

In [34]:
title = 'SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs'
# Which venue was this paper published in?
paper_title = title
title_mask = df['title'] == paper_title
# Take the venue of the first row whose title matches exactly.
venue = df.loc[title_mask, 'venue'].values[0]
venue

'arXiv.org'