# Statistics of the SYNERGY dataset

This notebook shows basic statistics of the SYNERGY dataset. PRs that improve this notebook or add more general descriptive statistics are very welcome.

In [11]:
from pprint import pprint

import pandas as pd

from synergy_dataset import Dataset, iter_datasets

In [12]:
# Stack the records of every dataset into a single frame. The per-dataset
# index is kept as-is, so repeated index values are not removed here.
dataset_frames = [dataset.to_frame() for dataset in iter_datasets()]
df_synergy = pd.concat(dataset_frames)

In [13]:
# Preview the combined frame (pandas shortens long frames to the first and last rows).
df_synergy

Unnamed: 0_level_0,doi,title,abstract,label_included
openalex_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://openalex.org/W2402226166,,Clinical aspects of Wilson's disease.,,0
https://openalex.org/W2416808666,,[Hepatic changes in the neurological form of W...,,0
https://openalex.org/W2060266518,https://doi.org/10.1016/s1367-5931(03)00018-8,Copper in medicine,Copper has been found to be causative in sever...,0
https://openalex.org/W2412464989,https://doi.org/10.5694/j.1326-5377.1970.tb634...,HEPATOLENTICULAR DEGENERATION (WILSON'S DISEAS...,,0
https://openalex.org/W66518910,,[Lupus erythematosus due to penicillamine asso...,,0
...,...,...,...,...
https://openalex.org/W1995096921,https://doi.org/10.1016/j.injury.2012.07.007,A kick in the shins: The financial impact of u...,Warfarin is increasingly prescribed in the eld...,0
https://openalex.org/W1504586396,,Use of aspirin in cardiovascular prophylaxis.,The value of prophylatic low-dose aspirin in p...,0
https://openalex.org/W2114922669,https://doi.org/10.1177/1076029615598222,Home Treatment of Deep Venous Thrombosis in th...,This is a retrospective cohort study of adults...,0
https://openalex.org/W1657669688,https://doi.org/10.1111/j.1553-2712.2010.00976.x,Prior Statin Use Is Not Associated With Improv...,The objective was to determine whether prior s...,0


## Total number of records and duplicates

In [14]:
# Number of rows whose index value (openalex_id) already occurred in an earlier
# row; with pandas' default keep='first' the first occurrence is not counted.
df_synergy.index.duplicated().sum()

np.int64(3203)

In [15]:
# Share of rows, in percent, whose index value repeats an earlier row.
duplicate_mask = df_synergy.index.duplicated()
duplicate_mask.sum() / duplicate_mask.size * 100

np.float64(1.8920419639903596)

## Vocabulary size


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
def count_ngrams(texts, ngram_range=(1, 1)):
    """Return the vocabulary size CountVectorizer learns from `texts`.

    Parameters
    ----------
    texts : iterable of str
        Documents to tokenize; missing values must already be replaced.
    ngram_range : tuple of (int, int), default (1, 1)
        Passed straight to CountVectorizer: (1, 1) counts unigrams,
        (2, 2) bigrams, (1, 2) both.

    Returns
    -------
    int
        Number of columns of the fitted document-term matrix, i.e. the
        number of distinct n-grams found.
    """
    return CountVectorizer(ngram_range=ngram_range).fit_transform(texts).shape[1]


result = []

for d in iter_datasets():

    # Build the frame and clean the text columns once per dataset instead of
    # once per vectorizer; CountVectorizer needs strings, so NaN becomes "".
    frame = d.to_frame()
    titles = frame['title'].fillna("")
    abstracts = frame['abstract'].fillna("")

    result.append(
        {
            'name': d.name,
            # unigrams
            'n title unigrams': count_ngrams(titles),
            'n abstract unigrams': count_ngrams(abstracts),
            # bigrams
            'n title bigrams': count_ngrams(titles, ngram_range=(2, 2)),
            'n abstract bigrams': count_ngrams(abstracts, ngram_range=(2, 2)),
            # unigrams and bigrams
            'n title uni+bigrams': count_ngrams(titles, ngram_range=(1, 2)),
            'n abstract uni+bigrams': count_ngrams(abstracts, ngram_range=(1, 2)),
        }
    )

# One row per dataset, one column per vocabulary-size measure.
df = pd.DataFrame(result)
df

Unnamed: 0,name,n title unigrams,n abstract unigrams,n title bigrams,n abstract bigrams,n title uni+bigrams,n abstract uni+bigrams
0,Appenzeller-Herzog_2019,4320,18624,12269,150220,16589,168844
1,Bos_2018,5378,22216,22957,292868,28335,315084
2,Brouwer_2019,19817,80450,139673,1572216,159490,1652666
3,Chou_2003,3104,14562,10228,135307,13332,149869
4,Chou_2004,3058,13857,8204,116201,11262,130058
5,Donners_2021,918,6272,1959,38985,2877,45257
6,Hall_2012,7739,23875,40531,356937,48270,380812
7,Jeyaraman_2020,2243,10965,7646,107386,9889,118351
8,Leenaars_2019,6961,24175,32539,326445,39500,350620
9,Leenaars_2020,7114,28181,30448,365262,37562,393443
