In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

In [2]:
import pickle

In [3]:
from typing import List, Dict, Any, Tuple

In [4]:
import glob
import json
import os

from tqdm.notebook import tqdm

In [5]:
from datasets import Dataset

In [6]:
# read articles from disk
text_path = 'data/outputs/article_texts_full/'

# Collect every .txt file in the directory. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep `documents` and
# `document_IDs` reproducible across runs and machines.
text_files: List[str] = sorted(glob.glob(os.path.join(text_path, '*.txt')))

# Parallel lists: documents[i] is the full text of the article document_IDs[i]
documents: List[str] = []
document_IDs: List[str] = []

# A dedicated loop variable keeps `text_path` (the directory) from being
# overwritten by the last file path.
for text_file in tqdm(text_files):
    # The document ID is the file name without directory and without the
    # '.txt' extension; os.path handles the platform's path separator.
    document_IDs.append(os.path.splitext(os.path.basename(text_file))[0])

    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the article files are UTF-8 -- confirm.
    with open(text_file, 'r', encoding='utf-8') as f:
        documents.append(f.read())


print(f"Number of articles: {len(documents)}")


  0%|          | 0/47837 [00:00<?, ?it/s]

Number of articles: 47837


In [7]:
# Load the article metadata table.
# `id` is the key later matched against the string IDs taken from the article
# file names. Pin it to str so that purely numeric IDs (e.g. 103420) cannot be
# parsed as integers by dtype inference and then fail to match.
metadata_df = pd.read_csv('data/outputs/rsc_metadata_full.csv', dtype={'id': str})
metadata_df.head()

Unnamed: 0,id,issn,title,fpage,lpage,year,volume,journal,author,type,...,period,century,pages,sentences,tokens,visualizationLink,doi,jstorLink,hasAbstract,isAbstractOf
0,rspb_1978_0019,0080-4649,"The ultrastructure, development and mode of op...",245,267,1978,200,Proceedings of the Royal Society of London. Se...,"F. Gwendolen Rees, F. R. S.",article,...,1950,1900,18,516,12065,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspb.1978.0019,,,
1,103420,02607085,"A Description of an Aurora Borealis, Seen on t...",186,190,1720,31,Philosophical Transactions (1683-1775),Samuel Cruwys,fla,...,1700,1700,5,31,1176,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rstl.1720.0046,http://www.jstor.org/stable/103420,,
2,rsbm_1945_0009,1479-571X,"John Jacob Fox, 1874 - 1944",141,157,1945,5,Obituary Notices of Fellows of the Royal Society,Robert Robertson|John Jacob Fox,biography,...,1900,1900,2,499,9172,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rsbm.1945.0009,,,
3,110721,03655695,On the Production of Heat by Voltaic Electrici...,280,282,1837,4,Abstracts of the Papers Printed in the Philoso...,J. P. Joule,abs,...,1800,1800,3,20,653,http://corpora.clarin-d.uni-saarland.de/surpri...,,http://www.jstor.org/stable/110721,,
4,rspb_1959_0017,0080-4649,Electron microscopy of collagen-like connectiv...,233,239,1959,150,Proceedings of the Royal Society of London. Se...,"E. G. Gray|J. Z. Young, F. R. S.",article,...,1950,1900,6,186,3981,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspb.1959.0017,,,


In [8]:
# attach each article's full text to its metadata row

# NOTE: not populated in this cell; kept so the name stays defined for any
# later cell that refers to it.
document_titles: List[str] = []

# Build the ID -> text lookup once. The previous per-document loop did a full
# boolean mask over the frame plus a list.index() scan for every article,
# which is quadratic in the number of articles.
text_by_id: Dict[str, str] = {}
for doc_id, doc_text in zip(document_IDs, documents):
    # keep the first text seen for an ID, matching list.index() semantics
    text_by_id.setdefault(doc_id, doc_text)

# Vectorised lookup: rows whose `id` has no matching article file get NaN.
metadata_df['text'] = metadata_df['id'].map(text_by_id)

  0%|          | 0/47837 [00:00<?, ?it/s]

In [9]:

# Convert the pandas metadata table (including the 'text' column added above)
# to a HuggingFace Dataset.
hf_dataset = Dataset.from_pandas(metadata_df)


In [10]:
# Display the dataset summary (feature names and row count).
hf_dataset

Dataset({
    features: ['id', 'issn', 'title', 'fpage', 'lpage', 'year', 'volume', 'journal', 'author', 'type', 'corpusBuild', 'doiLink', 'language', 'jrnl', 'decade', 'period', 'century', 'pages', 'sentences', 'tokens', 'visualizationLink', 'doi', 'jstorLink', 'hasAbstract', 'isAbstractOf', 'text'],
    num_rows: 47837
})

In [11]:
# Write the dataset to a local directory under data/outputs/.
hf_dataset.save_to_disk('data/outputs/hf_dataset_full')

Saving the dataset (0/3 shards):   0%|          | 0/47837 [00:00<?, ? examples/s]

In [12]:
# Push the dataset to the HuggingFace dataset hub as a private repository
# (private=True). NOTE(review): presumably requires being logged in to the
# hub beforehand -- confirm.
hf_dataset.push_to_hub('rsc_articles_full', private=True)

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/badrex/rsc_articles_full/commit/b22e19d70a589df106ec1998deaaaa009adfb314', commit_message='Upload dataset', commit_description='', oid='b22e19d70a589df106ec1998deaaaa009adfb314', pr_url=None, pr_revision=None, pr_num=None)