In [None]:
import json

def parse_ris_record(lines, paper_id):
	"""Build a dictionary describing one RIS record.

	Args:
		lines: Iterable of raw text lines belonging to a single record
			(everything up to, but not including, the ``ER  -`` line).
		paper_id: Identifier stored under the ``"id"`` key.

	Returns:
		A dict that always contains ``"id"`` and ``"authors"`` (author names
		as "First Last", joined with ", "). The keys ``"title"``, ``"venue"``,
		``"year"`` (int), ``"publisher"``, ``"volume"``, ``"issue"``,
		``"issn"``, ``"doi"``, ``"url"`` and ``"pages"`` are present only when
		the matching tag occurs with a non-empty value.

	Lines that are not of the form ``XX  - value``, tags with an empty value,
	and ``PY`` values that do not start with a numeric year are ignored
	instead of raising.
	"""
	# Tags whose value is copied verbatim into the record under the mapped key.
	verbatim_tags = {
		"T1": "title",
		"JO": "venue",
		"PB": "publisher",
		"VL": "volume",
		"IS": "issue",
		"SN": "issn",
		"DO": "doi",
		"UR": "url",
	}

	record = {"id": paper_id}
	authors = []
	pages = []

	for line in lines:
		line = line.strip()
		# A tagged line is a two-character tag followed by "  -"; this also
		# skips blank lines and anything too short to carry a tag.
		if line[2:5] != "  -":
			continue

		tag = line[:2]
		value = line[5:].strip()
		if not value:
			# e.g. "UR  - " with nothing after the separator
			continue

		if tag in verbatim_tags:
			record[verbatim_tags[tag]] = value

		elif tag == "AU":
			# Expected form is "Last, First[, Suffix]"; a name without a comma
			# (e.g. an organisation) is kept as written.
			lname, *rest = [part.strip() for part in value.split(",")]
			fname = rest[0] if rest else ""
			name = " ".join(part for part in (fname, lname, *rest[1:]) if part)
			if name:
				authors.append(name)

		# Primary year of publication; may be written as "YYYY/MM/DD/..."
		elif tag == "PY":
			year = value.split("/", 1)[0].strip()
			if year.isdecimal():
				record["year"] = int(year)

		# Start and end page are collected in order of appearance
		elif tag in ("SP", "EP"):
			pages.append(value)

	record["authors"] = ", ".join(authors)
	if pages:
		record["pages"] = "-".join(pages)

	return record


# ----------------------------
# Read from file and parse all
# ----------------------------
def parse_ris_file(path):
	"""Parse every record in the RIS file at ``path``.

	Lines are accumulated until an ``ER  -`` (end of record) line is seen, at
	which point the accumulated lines are handed to ``parse_ris_record``.
	Records are numbered from 1 in file order.

	Args:
		path: Location of the RIS file; it is read as UTF-8.

	Returns:
		A list with one dict per record, as produced by ``parse_ris_record``.
	"""
	results = []
	buffer = []

	paper_id = 1
	with open(path, "r", encoding="utf-8") as f:
		for line in f:
			# Leading whitespace is tolerated here because parse_ris_record
			# strips each line before looking at its tag.
			if line.lstrip().startswith("ER  -"):
				# End of one record
				results.append(parse_ris_record(buffer, paper_id))
				paper_id += 1
				buffer = []
			else:
				buffer.append(line)

	# A final record without an "ER  -" terminator would otherwise be lost;
	# trailing blank lines alone do not count as a record.
	if any(pending.strip() for pending in buffer):
		results.append(parse_ris_record(buffer, paper_id))

	return results


In [27]:
# Convert the RIS export into a JSON file with one object per paper.
data = parse_ris_file("citations.ris")
with open("papers.json", "w", encoding="utf-8") as f:
    # Serialise to a string first, then write it out in one call.
    f.write(json.dumps(data, indent=4))