# Summarize reports related to public health

## Import libraries

In [1]:
import json
import requests
import pandas as pd
import datetime
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#********************DON'T NEED TO RUN AGAIN: one-time merge, already saved to ANALYSIS-pub-health-reports.csv*********************
#dfmetadata = pd.read_csv('MASTER-pub-health-reports-2022-07-15.csv',encoding='utf8')
#dfalt = pd.read_csv('Altmetric-report-22-08-17.csv',encoding='utf8')
#df_full = dfmetadata.merge(dfalt, how='left', on='doi').reset_index(drop=True)
#Save a file of combined information
#save_file = df_full.to_csv('ANALYSIS-pub-health-reports.csv')

# Analysis

In [2]:
#Load the combined analysis file into df_full and show how many rows it has
analysis_csv = 'ANALYSIS-pub-health-reports.csv'
df_full = pd.read_csv(analysis_csv, encoding='utf8')
len(df_full)

226

In [3]:
#Sort the 'posted' values and report the two ends of the range
post_dates = sorted(df_full['posted'].tolist())
first_date, last_date = post_dates[0], post_dates[-1]
print(f'Earliest date: {first_date} ; Most recent date: {last_date}')

Earliest date: 2016-10-04T01:14:48 ; Most recent date: 2022-07-12T00:23:48


In [13]:
#Sort the 'firstOnline' values and report the two ends of the range
online_dates = sorted(df_full['firstOnline'].tolist())
first_date, last_date = online_dates[0], online_dates[-1]
print(f'Earliest date: {first_date} ; Most recent date: {last_date}')

Earliest date: 2003-01-01T00:00:00 ; Most recent date: 2022-07-12T00:23:48


In [14]:
#Sort the 'published_date' values and report the two ends of the range
pub_dates = sorted(df_full['published_date'].tolist())
first_date, last_date = pub_dates[0], pub_dates[-1]
print(f'Earliest date: {first_date} ; Most recent date: {last_date}')

Earliest date: 2016-10-04T01:14:48Z ; Most recent date: 2022-07-12T00:23:48Z


In [19]:
#Look at item type list: count how many records carry each defined_type_name value
df_full['defined_type_name'].value_counts()

report    226
Name: defined_type_name, dtype: int64

In [4]:
#Look at license list: count how many records carry each license value
df_full['license'].value_counts()

In Copyright                    59
CC BY 4.0                       56
CC BY-NC-SA 4.0                 37
CQUniversity General 1.0        25
CC BY-NC-ND 4.0                 20
CC BY-NC 4.0                     8
CC BY-ND 4.0                     8
CQUniversity Open Access 1.0     7
CC BY-SA 4.0                     3
All Rights Reserved 1.0          1
Public Domain                    1
CC BY 3.0                        1
Name: license, dtype: int64

In [5]:
#Replace the [] with NaN in the funders col so that count() only sees records
#that list a funder. This just has linked funder names.
#The result is assigned back to the column instead of calling replace(..., inplace=True)
#on the selected column: that form is chained assignment and stops updating df_full
#once pandas Copy-on-Write is active. np.nan is used because the np.NaN alias was
#removed in NumPy 2.0.
df_full['funders'] = df_full['funders'].replace('[]', np.nan)


In [6]:
#Count records that reference at least one funder (non-null 'funders' values).
#I manually counted links to Dimensions = 29
funded_records = df_full['funders'].notna().sum()
print(f'{funded_records} records list at least one funder')

142 records list at least one funder


In [7]:
#Average of the count_files column across all records
files_per_record = df_full['count_files']
mean_files = files_per_record.mean()
mean_files

1.261061946902655

In [8]:
#Break the records down by count_files: at most one file, no files, more than one file
file_counts = df_full['count_files']
one_or_fewer = (file_counts <= 1).sum()
zero_files = (file_counts == 0).sum()
filesets = (file_counts > 1).sum()
print(f'{one_or_fewer} records with one or fewer files; '
      f'{zero_files} with zero files; '
      f'{filesets} filesets')

      

183 records with one or fewer files; 22 with zero files; 43 filesets


In [27]:
#Pull the tags column out into its own one-column frame
tags = df_full['tags'].to_frame()

#Clean up the strings so each value is words separated by commas and a space
#(square brackets and single quotes are removed as literal characters).
#NOTE: a tag that itself contains ', ', a bracket or an apostrophe is altered by this cleanup.
tags['tags_clean'] = (
    tags['tags']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.replace("'", "", regex=False)
)

#Create one list of tags per row, splitting on ', '.
#Missing values are treated as an empty string so .split never hits a NaN float,
#and empty pieces are dropped so a row with no tags (e.g. '[]') yields [] rather
#than a blank '' tag that would be counted. One sub-list is kept per row.
tags_clean = [
    [word for word in row.split(', ') if word]
    for row in tags['tags_clean'].fillna('')
]

#Make the list of lists to one list:
all_tags = [word for tag in tags_clean for word in tag]
len(all_tags)

1581

In [28]:
#Put every tag occurrence in a one-column frame and count how often each tag appears
all_tags_df = pd.DataFrame({'tag': all_tags}, columns=['tag'])
tops = all_tags_df.value_counts()

In [29]:
#Show the first 20 entries of the tag counts
tops.iloc[:20]

tag                                                       
Public Health and Health Services not elsewhere classified    34
Epidemiology                                                  28
Built Environment and Design not elsewhere classified         26
Urban Policy                                                  25
Urban Analysis and Development                                25
Urban Design                                                  25
Human Geography not elsewhere classified                      25
Health and Community Services                                 17
Health Care                                                   14
Health Information Systems (incl. Surveillance)               13
Health Care Administration                                    13
10,000 steps                                                  11
Physical activity                                             11
Community Child Health                                        11
Primary Health Care            

In [30]:
#How many records are on figshare.com, that is, don't have a group_id
missing_group = df_full['group_id'].isnull()
test = missing_group.sum()
test

0

In [9]:
#Print a summary of df_full: each column with its non-null count and dtype
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 62 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      226 non-null    int64  
 1   index                           226 non-null    int64  
 2   project_id                      66 non-null     float64
 3   id                              226 non-null    int64  
 4   title                           226 non-null    object 
 5   doiVersioned                    195 non-null    object 
 6   doi                             195 non-null    object 
 7   handle                          31 non-null     object 
 8   url                             226 non-null    object 
 9   published_date                  226 non-null    object 
 10  thumb                           202 non-null    object 
 11  defined_type                    226 non-null    int64  
 12  defined_type_name               226 

In [11]:
#Total the views and downloads columns across all records
views, downloads = df_full['views'].sum(), df_full['downloads'].sum()
print(f'{views} views and {downloads} downloads')

120761 views and 50830 downloads
