In [145]:
import requests
import json

In [146]:
# HTTP request headers shared by this notebook: they are passed as `headers=`
# to the requests calls in search_paper() and get_detail().
# NOTE(review): the values look like they were copied from a browser session on
# the Semantic Scholar web app — confirm the endpoints still accept them.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7,ar;q=0.6",
    # The search request body is sent as a JSON string, hence the JSON content type.
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "X-S2-Client": "webapp-browser",
    # NOTE(review): presumably a web-UI build identifier; it is hard-coded here
    # and may go stale — verify.
    "X-S2-Ui-Version": "452cc270c0d8e73927dd44144435f75b41e2c43a"
}

In [147]:
def search_paper(title, timeout=30):
    """Find the id of the paper whose title exactly equals ``title``.

    Sends a single POST to the Semantic Scholar web search endpoint and, among
    the returned results whose ``title.text`` equals ``title`` (case-sensitive,
    exact match), picks the one with the largest ``citationStats.numReferences``.
    When several candidates tie, the earliest one in the response wins.

    Args:
        title: Exact paper title to search for.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the crawl forever.

    Returns:
        The ``id`` field of the selected search result.

    Raises:
        Exception: If the response status is not 200, or if no result has a
            title equal to ``title``.
    """
    url = "https://www.semanticscholar.org/api/1/search"

    # Search request body; only the first page of 10 results is requested.
    payload = {
        "queryString": title,
        "page": 1,
        "authors": [],
        "coAuthors": [],
        "cues": ["CitedByLibraryPaperCue"],
        "fieldsOfStudy": [],
        "getQuerySuggestions": False,
        "hydrateWithDdb": True,
        "includeBadges": True,
        "includePdfVisibility": False,
        "includeTldrs": True,
        "pageSize": 10,
        "performTitleMatch": True,
        "requireViewablePdf": False,
        "sort": "relevance",
        "useFallbackRankerService": False,
        "useFallbackSearchCluster": False,
        "venues": [],
        "yearFilter": None
    }

    response = requests.post(
        url, headers=headers, data=json.dumps(payload), timeout=timeout
    )

    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}: {response.text}")

    # A response without a 'results' list is treated as "nothing found".
    results = response.json().get('results') or []

    best_id = None
    best_references = -1
    for result in results:
        result_title = (result.get('title') or {}).get('text')
        if result_title != title:
            continue
        # Missing or null citation stats count as zero references.
        stats = result.get('citationStats') or {}
        num_references = stats.get('numReferences') or 0
        # Strict '>' keeps the earliest result when counts tie.
        if num_references > best_references:
            best_references = num_references
            best_id = result['id']

    if best_id is None:
        raise Exception("No matching papers found with the specified title.")

    return best_id

In [148]:
def get_detail(seed_id, timeout=30):
    """Fetch metadata for one paper from the Semantic Scholar Graph API.

    Args:
        seed_id: Paper identifier inserted into the ``/graph/v1/paper/{id}`` URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with the keys ``paperId``, ``title``, ``author`` (name of the
        first listed author, or ``None`` when there are no authors), ``venue``,
        ``year``, ``citationCount``, ``url`` and ``abstract``. ``url`` is the
        arXiv abstract page when the paper has an ArXiv external id, otherwise
        the ``url`` field returned by the API.

    Raises:
        Exception: If the response status is not 200.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/{seed_id}"

    # Only the fields needed for the node CSV are requested.
    params = {
        "fields": "title,venue,year,authors,abstract,citationCount,externalIds,url"
    }

    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}: {response.text}")

    data = response.json()

    # Both fields may come back as null; normalise so the lookups below cannot
    # fail with a TypeError.
    authors = data.get("authors") or []
    external_ids = data.get("externalIds") or {}
    arxiv_id = external_ids.get("ArXiv")

    if arxiv_id:
        paper_url = f'https://arxiv.org/abs/{arxiv_id}'
    else:
        paper_url = data.get("url")

    return {
        "paperId": data["paperId"],
        "title": data.get("title"),
        "author": authors[0]["name"] if authors else None,
        "venue": data.get("venue"),
        "year": data.get("year"),
        "citationCount": data.get("citationCount"),
        "url": paper_url,
        "abstract": data.get("abstract")
    }

In [149]:
import os
import csv

def write_node_data(data, file_name, fields):
    """Append one row to the node CSV unless its ``paperId`` is already stored.

    The existing file is scanned first (its first row is taken as the header);
    if any stored row has the same ``paperId`` as ``data`` nothing is written.
    The file is only opened for appending after the read handle is closed, and
    a header row is emitted when the file is missing or empty so that later
    calls can still look rows up by column name.

    Args:
        data: Mapping holding the row values; must contain ``'paperId'``.
        file_name: Path of the CSV file to append to.
        fields: Column names, in output order, for the CSV writer.

    Returns:
        None.
    """
    needs_header = True
    try:
        with open(file_name, 'r', newline='', encoding='utf-8') as readfile:
            reader = csv.DictReader(readfile)
            # fieldnames is None when the file has no rows at all.
            needs_header = not reader.fieldnames
            if any(row['paperId'] == data['paperId'] for row in reader):
                return
    except FileNotFoundError:
        # First write: nothing to de-duplicate against.
        pass

    with open(file_name, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        if needs_header:
            writer.writeheader()
        writer.writerow(data)

In [150]:
def get_references(seed_id, min_year=2020, keyword='robot', timeout=30):
    """Fetch the references of a paper and keep only the relevant, influential ones.

    A reference entry is kept when all of the following hold:
      * the cited paper has a year and it is ``>= min_year``;
      * the cited paper has a non-empty abstract containing ``keyword``
        (case-insensitive substring match);
      * the entry's ``isInfluential`` flag is truthy.

    Args:
        seed_id: Id of the citing paper, inserted into the request URL.
        min_year: Earliest publication year accepted for a cited paper.
        keyword: Substring that must occur in the cited paper's abstract.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of the reference entries (dicts as returned by the API) that
        pass the filter, in response order.

    Raises:
        Exception: If the response status is not 200.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/{seed_id}/references"
    # NOTE(review): a single request with limit=1000 is issued and no further
    # pages are fetched — confirm that truncating longer reference lists is OK.
    params = {
        "fields": "contexts,intents,isInfluential,paperId,year,abstract",
        "limit": "1000"
    }

    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}: {response.text}")

    entries = response.json().get('data') or []
    needle = keyword.lower()

    selected = []
    for entry in entries:
        cited = entry.get('citedPaper') or {}
        year = cited.get('year')
        abstract = cited.get('abstract')
        if year is None or year < min_year:
            continue
        if not abstract or needle not in abstract.lower():
            continue
        if not entry.get('isInfluential'):
            continue
        selected.append(entry)

    return selected

In [151]:
import os 
import csv

def write_edge_data(data, file_name, fields):
    """Append ``data`` as one row to the CSV file at ``file_name``.

    The file is opened in append mode with UTF-8 encoding; columns are written
    in the order given by ``fields``. No header row is written here.
    """
    with open(file_name, mode='a', encoding='utf-8', newline='') as edge_file:
        csv.DictWriter(edge_file, fields).writerow(data)

In [155]:
import csv

def get_seeds(file_name):
    """Return the values of the ``title`` column of a CSV file, in file order.

    The first row of the file is used as the header. The file is opened with
    ``newline=''`` as the csv module requires, so line breaks embedded in
    quoted titles are passed through unchanged.

    Args:
        file_name: Path of the UTF-8 encoded CSV file holding the seed titles.

    Returns:
        A list with one title string per data row.

    Raises:
        KeyError: If a row has no ``title`` column.
    """
    with open(file_name, 'r', newline='', encoding='utf-8') as csvfile:
        return [row['title'] for row in csv.DictReader(csvfile)]

In [154]:
# Input file with the seed paper titles (read through its 'title' column).
SEED_FILE_NAME = 'seeds.csv'

# Output file for paper nodes and its column layout.
NODE_FILE_NAME = 'papers.csv'
NODE_FILE_FIELDS = ['paperId', 'title', 'author', 'venue', 'year', 'citationCount',  'url', 'abstract']

# Output file for citation edges and its column layout.
EDGE_FILE_NAME = 'references.csv'
EDGE_FILE_FIELDS=['from_id', 'to_id', 'isInfluential', 'intents', 'contexts']


# Start both output files from scratch, containing only their header row.
with open(NODE_FILE_NAME, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=NODE_FILE_FIELDS)
    writer.writeheader()

with open(EDGE_FILE_NAME, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=EDGE_FILE_FIELDS)
    writer.writeheader()

seed_ids = []          # FIFO queue of paper ids that still have to be visited
processed_ids = set()  # every id that has ever been queued, to avoid revisits

# Resolve each seed title to a paper id and queue it once.
for seed_paper in get_seeds(SEED_FILE_NAME):
    seed_id = search_paper(seed_paper)
    if seed_id in processed_ids:
        continue
    seed_ids.append(seed_id)
    processed_ids.add(seed_id)

# Breadth-first walk over the reference graph: record the paper itself, then
# queue each of its (filtered) references that has not been seen before.
while seed_ids:
    seed_id = seed_ids.pop(0)
    detail = get_detail(seed_id)
    write_node_data(detail, NODE_FILE_NAME, NODE_FILE_FIELDS)

    refs = get_references(seed_id)
    for ref in refs:
        ref_id = ref['citedPaper']['paperId']
        # processed_ids alone tracks what has been queued; an id of None
        # cannot be fetched later, so it is never queued.
        if ref_id and ref_id not in processed_ids:
            seed_ids.append(ref_id)
            processed_ids.add(ref_id)

        edge_data = {
            'from_id': seed_id,
            'to_id': ref_id,
            'isInfluential': ref['isInfluential'],
            # A missing or null list is written as an empty string.
            'intents': ','.join(ref.get('intents') or []),
            'contexts': ','.join(ref.get('contexts') or [])
        }

        write_edge_data(edge_data, EDGE_FILE_NAME, EDGE_FILE_FIELDS)

    print(f"Remaining seeds: {len(seed_ids)}")

Remaining seeds: 4
Remaining seeds: 3
Remaining seeds: 7
Remaining seeds: 7
Remaining seeds: 8
Remaining seeds: 7
Remaining seeds: 9
Remaining seeds: 8
Remaining seeds: 7
Remaining seeds: 6
Remaining seeds: 6
Remaining seeds: 5
Remaining seeds: 4
Remaining seeds: 3
Remaining seeds: 2
Remaining seeds: 1
Remaining seeds: 1
Remaining seeds: 0
