# Set Up

In [None]:
from xml.etree import ElementTree
import codecs
import glob
from lxml import etree

import random
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Directory holding the raw review files; every `*.rm5` file directly inside it is analysed.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
folder = '/data/raw/reviews/'
# glob.glob returns matches in a file-system-dependent order; sorting keeps the
# per-review lists derived from `all_files` in the same order on every run.
all_files = sorted(glob.glob(folder + "*.rm5"))

# Included/Excluded Studies

In [None]:
# Absolute XPath queries selecting the INCLUDED_CHAR / EXCLUDED_CHAR elements of a review.
included_xpath = '/COCHRANE_REVIEW/CHARACTERISTICS_OF_STUDIES/CHARACTERISTICS_OF_INCLUDED_STUDIES/INCLUDED_CHAR'
excluded_xpath = '/COCHRANE_REVIEW/CHARACTERISTICS_OF_STUDIES/CHARACTERISTICS_OF_EXCLUDED_STUDIES/EXCLUDED_CHAR'

# Per-review match counts; entry k belongs to all_files[k].
num_included_studies, num_excluded_studies = [], []
for review_path in all_files:  # over 7k files
    tree = etree.parse(review_path)
    included, excluded = tree.xpath(included_xpath), tree.xpath(excluded_xpath)
    num_included_studies.append(len(included))
    num_excluded_studies.append(len(excluded))

In [None]:
# Side-by-side histograms of the per-review study counts
# (left: included studies, right: excluded studies).
fig, (ax_included, ax_excluded) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
ax_included.hist(num_included_studies)
ax_included.set_title("Number of included studies in a review", fontsize=20)
ax_excluded.hist(num_excluded_studies)
# Trailing semicolon suppresses the repr of the last expression in the cell output.
ax_excluded.set_title("Number of excluded studies in a review", fontsize=20);

In [None]:
# Mean and median of the per-review study counts.
included_counts = np.array(num_included_studies)
excluded_counts = np.array(num_excluded_studies)

included_mean, included_median = np.mean(included_counts), np.median(included_counts)
excluded_mean, excluded_median = np.mean(excluded_counts), np.median(excluded_counts)

print("Average number of included studies {0:.2f}".format(included_mean))
print("Median number of included studies {}".format(included_median))
print("Average number of excluded studies {0:.2f}".format(excluded_mean))
print("Median number of excluded studies {}".format(excluded_median))

In [None]:
# For each threshold 0..5, report how many reviews have at most that many included studies.
included_counts = np.array(num_included_studies)  # converted once, reused for every threshold
for max_included in range(6):
    num_reviews = np.sum(included_counts <= max_included)
    print("Number of reviews with {} or less studies included: {}". format(max_included, num_reviews))

For example, `Interventions for HIV-associated nephropathy` has no included studies but it does have excluded studies and ongoing studies. What do we do with this one?

# MeSH terms and keywords

Why are there no MeSH terms or keywords for any of these reviews? Is this not the right place to access them?

Do PICO terms live here anywhere?

In [None]:
# Collect the MESH_TERMS / KEYWORDS elements found in each review's cover sheet.
#
# The queries must be absolute (leading "/"). lxml evaluates a *relative* path on a
# parsed tree against the root element, so "COCHRANE_REVIEW/COVER_SHEET/..." searched
# for a COCHRANE_REVIEW child inside the COCHRANE_REVIEW root and never matched.
mesh_xpath = '/COCHRANE_REVIEW/COVER_SHEET/MESH_TERMS'
keywords_xpath = '/COCHRANE_REVIEW/COVER_SHEET/KEYWORDS'

all_mesh = []  # per review: list of matched MESH_TERMS elements (possibly empty)
all_key = []   # per review: list of matched KEYWORDS elements (possibly empty)
num_mesh = 0   # number of reviews with at least one MESH_TERMS match
num_key = 0    # number of reviews with at least one KEYWORDS match
for file_ in all_files:
    root = etree.parse(file_)
    mesh_terms = root.xpath(mesh_xpath)
    keywords = root.xpath(keywords_xpath)
    # NOTE(review): this counts reviews where the element is present. If MESH_TERMS /
    # KEYWORDS can be present but empty, count their child elements instead —
    # confirm against the RM5 schema.
    if len(mesh_terms) > 0:
        num_mesh += 1
    if len(keywords) > 0:
        num_key += 1
    all_mesh.append(mesh_terms)
    all_key.append(keywords)

In [None]:
# Report how many reviews matched the MeSH-term and keyword queries.
summary_template = "Number of reviews for which there is at least one mesh term {} and at least one keyword {}."
print(summary_template.format(num_mesh, num_key))

---