In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

In [2]:
# Relative path of the metadata CSV; every later cell works from this frame.
metadata_path = '../data/outputs/rsc_metadata_full.csv'
metadata_df = pd.read_csv(metadata_path)

In [3]:
# Preview the first five rows to eyeball the columns and typical values.
metadata_df.head(n=5)

Unnamed: 0,id,issn,title,fpage,lpage,year,volume,journal,author,type,...,period,century,pages,sentences,tokens,visualizationLink,doi,jstorLink,hasAbstract,isAbstractOf
0,rspb_1978_0019,0080-4649,"The ultrastructure, development and mode of op...",245,267,1978,200,Proceedings of the Royal Society of London. Se...,"F. Gwendolen Rees, F. R. S.",article,...,1950,1900,18,516,12065,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspb.1978.0019,,,
1,103420,02607085,"A Description of an Aurora Borealis, Seen on t...",186,190,1720,31,Philosophical Transactions (1683-1775),Samuel Cruwys,fla,...,1700,1700,5,31,1176,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rstl.1720.0046,http://www.jstor.org/stable/103420,,
2,rsbm_1945_0009,1479-571X,"John Jacob Fox, 1874 - 1944",141,157,1945,5,Obituary Notices of Fellows of the Royal Society,Robert Robertson|John Jacob Fox,biography,...,1900,1900,2,499,9172,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rsbm.1945.0009,,,
3,110721,03655695,On the Production of Heat by Voltaic Electrici...,280,282,1837,4,Abstracts of the Papers Printed in the Philoso...,J. P. Joule,abs,...,1800,1800,3,20,653,http://corpora.clarin-d.uni-saarland.de/surpri...,,http://www.jstor.org/stable/110721,,
4,rspb_1959_0017,0080-4649,Electron microscopy of collagen-like connectiv...,233,239,1959,150,Proceedings of the Royal Society of London. Se...,"E. G. Gray|J. Z. Young, F. R. S.",article,...,1950,1900,6,186,3981,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspb.1959.0017,,,


In [4]:
# List every column label; the head() preview elides the middle columns with "...".
metadata_df.columns

Index(['id', 'issn', 'title', 'fpage', 'lpage', 'year', 'volume', 'journal',
       'author', 'type', 'corpusBuild', 'doiLink', 'language', 'jrnl',
       'decade', 'period', 'century', 'pages', 'sentences', 'tokens',
       'visualizationLink', 'doi', 'jstorLink', 'hasAbstract', 'isAbstractOf'],
      dtype='object')

In [5]:
# Number of non-missing entries per column; columns whose total falls below
# the row count contain missing values.
metadata_df.notna().sum()

id                   47837
issn                 47837
title                47825
fpage                47837
lpage                47837
year                 47837
volume               47837
journal              47837
author               46173
type                 47837
corpusBuild          47837
doiLink              44700
language             38058
jrnl                 47837
decade               47837
period               47837
century              47837
pages                47837
sentences            47837
tokens               47837
visualizationLink    47837
doi                  44700
jstorLink             9779
hasAbstract            785
isAbstractOf           785
dtype: int64

In [7]:
# Tally how many records carry each `type` label, most frequent first.
# The labels are not normalised: the printed counts list e.g. both
# 'article' and 'Article', and both 'abstract' and 'abs', as separate values.
type_distribution = metadata_df.loc[:, 'type'].value_counts()
print(type_distribution)

type
article                     31305
fla                          7249
abstract                     2661
abs                          2243
biography                    1539
lecture                       522
list                          374
report                        331
brv                           273
speech                        213
errata                        182
discussion                    159
paper-read                    131
experiment                    130
book-review                   125
corrigenda                     76
preface                        52
bibliography                   45
publication-announcement       44
appendix                       30
letter                         19
notes                          17
index                          16
magnetical-observation         15
nws                            14
Article                        14
astronomical-observation       14
addendum                       13
symposium                      10
advertise

In [8]:
# Count records per decade; value_counts() orders by frequency.
decade_distribution = metadata_df['decade'].value_counts()

# Re-order chronologically and keep the result as a plain {decade: count} dict.
sorted_decade_distribution = dict(decade_distribution.sort_index().items())

# One "decade: count" line per decade, earliest first.
for decade, n_records in sorted_decade_distribution.items():
    print(f'{decade}: {n_records}')

1660: 308
1670: 376
1680: 268
1690: 373
1700: 355
1710: 190
1720: 295
1730: 352
1740: 494
1750: 484
1760: 408
1770: 408
1780: 300
1790: 219
1800: 518
1810: 666
1820: 286
1830: 828
1840: 476
1850: 990
1860: 1185
1870: 1403
1880: 1529
1890: 1647
1900: 1612
1910: 1437
1920: 2055
1930: 3303
1940: 1604
1950: 3656
1960: 4168
1970: 5231
1980: 5488
1990: 4925


In [45]:
sample_size = 25
# Fixed seed so that Restart Kernel -> Run All draws the same rows every time.
sample_seed = 42


def sample_period(period, doc_type, before_year=None):
    """Draw `sample_size` random rows of one document type from one period.

    Parameters
    ----------
    period : int
        Value to match in the `period` column of `metadata_df`.
    doc_type : str
        Value to match in the `type` column of `metadata_df`.
    before_year : int, optional
        If given, keep only rows whose `year` is strictly below this value.

    Returns
    -------
    pandas.DataFrame
        `sample_size` matching rows, drawn without replacement using
        `sample_seed`.

    Raises
    ------
    ValueError
        Raised by `DataFrame.sample` when fewer than `sample_size` rows match.
    """
    mask = (metadata_df['period'] == period) & (metadata_df['type'] == doc_type)
    if before_year is not None:
        mask = mask & (metadata_df['year'] < before_year)
    return metadata_df[mask].sample(sample_size, random_state=sample_seed)


# Periods up to 1800 are sampled from type 'fla'; 1850 and 1900 from 'article'.
samples_1650 = sample_period(1650, 'fla')
samples_1700 = sample_period(1700, 'fla')
samples_1750 = sample_period(1750, 'fla')
samples_1800 = sample_period(1800, 'fla')
samples_1850 = sample_period(1850, 'article')
# NOTE(review): the `year < 1924` cutoff is not explained anywhere in the
# notebook -- presumably a copyright/public-domain boundary; confirm.
samples_1900 = sample_period(1900, 'article', before_year=1924)

# Stack the six per-period samples in chronological order (original row labels kept).
samples = pd.concat([samples_1650, samples_1700, samples_1750, samples_1800, samples_1850, samples_1900])
samples


Unnamed: 0,id,issn,title,fpage,lpage,year,volume,journal,author,type,...,period,century,pages,sentences,tokens,visualizationLink,doi,jstorLink,hasAbstract,isAbstractOf
4324,100925,03702316,"Observations of a New Comet, Made at Paris in ...",4042,4050,1672,7,Philosophical Transactions (1665-1678),Signor Cassini,fla,...,1650,1600,9,155,4098,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rstl.1672.0015,http://www.jstor.org/stable/100925,,
44181,101149,03702316,A Continuation of the Discourse Concerning Vit...,66,73,1674,9,Philosophical Transactions (1665-1678),,fla,...,1650,1600,8,54,3381,http://corpora.clarin-d.uni-saarland.de/surpri...,,http://www.jstor.org/stable/101149,,
19972,102447,02607085,Dr. Gwither,118,120,1694,18,Philosophical Transactions (1683-1775),Dr. Gwither|Owen Lloyd,fla,...,1650,1600,3,12,953,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rstl.1694.0027,http://www.jstor.org/stable/102447,,
8265,101604,03702316,"A Narrative of a Monstrous Birth in Plymouth, ...",2096,2098,1670,5,Philosophical Transactions (1665-1678),William Durfton,fla,...,1650,1600,4,25,964,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rstl.1670.0066,http://www.jstor.org/stable/101604,,
33905,101359,03702316,An Extract of a Letter Written by Mr. John Tem...,6066,6067,1673,8,Philosophical Transactions (1665-1678),John Templer,fla,...,1650,1600,2,14,377,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rstl.1673.0028,http://www.jstor.org/stable/101359,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20988,rspb_1923_0038,0950-1193,A comparison between certain features of the s...,299,339,1923,95,Proceedings of the Royal Society of London. Se...,"E. G. T. Liddell, M. A. Oxon| Sir C. S. Sherri...",article,...,1900,1900,34,839,15667,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspb.1923.0038,,,
14249,rspb_1905_0055,0950-1193,The effect of plant growth and of manures upon...,1,32,1905,77,Proceedings of the Royal Society of London. Se...,"A. D. Hall, M. A. |N. H. J. Miller, Ph. D.|H. ...",article,...,1900,1900,31,565,14689,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspb.1905.0055,,,
33903,rspa_1920_0053,0950-1207,Reduction of error by linear compounding.,64,66,1920,98,Proceedings of the Royal Society of London. Se...,"W. F. Sheppard, Sc. D., LL. M.|Prof. E. T. Whi...",article,...,1900,1900,2,46,1381,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspa.1920.0053,,,
31649,rspb_1911_0074,0950-1193,Fractional withdrawal of complement and amboce...,277,279,1911,84,Proceedings of the Royal Society of London. Se...,"J. O. Wakelin Barratt, M. D. D. Sc. Lond.|Prof...",article,...,1900,1900,1,39,1365,http://corpora.clarin-d.uni-saarland.de/surpri...,10.1098/rspb.1911.0074,,,


In [46]:
# Check which columns the concatenated sample frame carries.
samples.columns

Index(['id', 'issn', 'title', 'fpage', 'lpage', 'year', 'volume', 'journal',
       'author', 'type', 'corpusBuild', 'doiLink', 'language', 'jrnl',
       'decade', 'period', 'century', 'pages', 'sentences', 'tokens',
       'visualizationLink', 'doi', 'jstorLink', 'hasAbstract', 'isAbstractOf'],
      dtype='object')

In [49]:
# Columns kept for the trimmed sample: identifiers, bibliographic fields,
# time buckets, document type and external links.
sample_columns = ['id', 'title', 'author', 'year', 'decade', 'period', 'type', 'doi', 'jstorLink']
sample_2 = samples[sample_columns]


In [51]:
# Display the trimmed sample frame (nine selected columns).
sample_2

Unnamed: 0,id,title,author,year,decade,period,type,doi,jstorLink
4324,100925,"Observations of a New Comet, Made at Paris in ...",Signor Cassini,1672,1670,1650,fla,10.1098/rstl.1672.0015,http://www.jstor.org/stable/100925
44181,101149,A Continuation of the Discourse Concerning Vit...,,1674,1670,1650,fla,,http://www.jstor.org/stable/101149
19972,102447,Dr. Gwither,Dr. Gwither|Owen Lloyd,1694,1690,1650,fla,10.1098/rstl.1694.0027,http://www.jstor.org/stable/102447
8265,101604,"A Narrative of a Monstrous Birth in Plymouth, ...",William Durfton,1670,1670,1650,fla,10.1098/rstl.1670.0066,http://www.jstor.org/stable/101604
33905,101359,An Extract of a Letter Written by Mr. John Tem...,John Templer,1673,1670,1650,fla,10.1098/rstl.1673.0028,http://www.jstor.org/stable/101359
...,...,...,...,...,...,...,...,...,...
20988,rspb_1923_0038,A comparison between certain features of the s...,"E. G. T. Liddell, M. A. Oxon| Sir C. S. Sherri...",1923,1920,1900,article,10.1098/rspb.1923.0038,
14249,rspb_1905_0055,The effect of plant growth and of manures upon...,"A. D. Hall, M. A. |N. H. J. Miller, Ph. D.|H. ...",1905,1900,1900,article,10.1098/rspb.1905.0055,
33903,rspa_1920_0053,Reduction of error by linear compounding.,"W. F. Sheppard, Sc. D., LL. M.|Prof. E. T. Whi...",1920,1920,1900,article,10.1098/rspa.1920.0053,
31649,rspb_1911_0074,Fractional withdrawal of complement and amboce...,"J. O. Wakelin Barratt, M. D. D. Sc. Lond.|Prof...",1911,1910,1900,article,10.1098/rspb.1911.0074,


In [52]:
# NOTE(review): the output saved under this cell lists different rows, in a
# different order, than the previous display of `sample_2`, so the variable
# was apparently rebound by a cell that is no longer in the notebook.
# Restore that step (or drop this cell) so Restart & Run All reproduces it.
sample_2

Unnamed: 0,id,title,author,year,decade,period,type,doi,jstorLink
6588,108155,Contributions to the Chemical History of Palla...,Robert Kane,1842,1840,1800,fla,10.1098/rstl.1842.0016,http://www.jstor.org/stable/108155
21677,102244,An Exact Account of the Three Late Conjunction...,J. F. Astron,1683,1680,1650,fla,10.1098/rstl.1683.0036,http://www.jstor.org/stable/102244
12119,101869,"An Extract of a Letter Written from Rome, Date...",,1686,1680,1650,fla,,http://www.jstor.org/stable/101869
12688,rspl_1897_0106,On a method of determining the reactions at th...,"George Wilson, M. Sc.|Professor Osborne Reynol...",1897,1890,1850,article,10.1098/rspl.1897.0106,
26476,107934,Researches in Physical Astronomy,J. W. Lubbock,1831,1830,1800,fla,10.1098/rstl.1831.0016,http://www.jstor.org/stable/107934
30424,106348,"Experiments on Electricity, Being an Attempt t...",Edward Nairne,1778,1770,1750,fla,10.1098/rstl.1778.0038,http://www.jstor.org/stable/106348
45751,rspb_1913_0018,The liberation of ions and the oxygen tension ...,"H. E. Roaf, M. D., D. Sc.|Prof. C. S. Sherring...",1913,1910,1900,article,10.1098/rspb.1913.0018,
23233,107056,Investigation of the Powers of the Prismatic C...,William Herschel,1800,1800,1800,fla,10.1098/rstl.1800.0014,http://www.jstor.org/stable/107056
13910,101502,Some Observations Lately Made at London Concer...,Mr. Hook,1665,1660,1650,fla,10.1098/rstl.1665.0103,http://www.jstor.org/stable/101502
45901,107943,Further Experiments with a New Register-Pyrome...,J. Frederick Daniell,1831,1830,1800,fla,10.1098/rstl.1831.0025,http://www.jstor.org/stable/107943


In [55]:
# Write a random 30-row subset of `sample_2` to a tab-separated file, without
# the index column. The fixed random_state makes the exported selection
# identical on every run; unseeded, each execution wrote a different file.
sample_2.sample(30, random_state=42).to_csv('sample_30_articles.tsv', sep='\t', index=False)
