<h1><center>GOSS Export Analysis</center></h1>

In [None]:
from IPython.display import HTML
# Inject a small script into the rendered notebook page: once the document is
# ready it hides every code-input cell (`div.input`), and the ":)" link below
# calls the same function to toggle the code cells on and off again.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<b><a href="javascript:code_toggle()">:)</a></b>''')


First, import the required Python packages.

In [None]:
import pandas as pd
import numpy as np
import plotly.offline as pyoff
import plotly.graph_objs as go
from __future__ import division
import json
import datetime
pyoff.init_notebook_mode()

Import the json object as a pandas dataframe.

In [None]:
# Load the export once and build one DataFrame per top-level key.
file_name = 'nhs_useful_notebooks/hscicweb320180214.json'
# Use a context manager so the file handle is closed as soon as parsing is
# done, instead of staying open for the lifetime of the kernel.
with open(file_name) as json_file:
    json_file_complete = json.load(json_file)
# json_normalize flattens nested article fields into dotted column names
# (e.g. 'Extra.COLUMNS', which later cells group by).
articles = pd.io.json.json_normalize(json_file_complete['articles'])
media = pd.DataFrame(json_file_complete['media'])
links = pd.DataFrame(json_file_complete['links'])

####  First 5 rows of the article data.

In [None]:
articles.head()

#### How many pages are in the GOSS export?

In [None]:
print('''There are {} pages in the GOSS export.'''.format(len(articles)))

#### What is the percentage of missing values for each column?

In [None]:
def apply_null(value):
    """Map an empty list to ``None``; return every other value unchanged."""
    return None if value == [] else value
    
# Replace empty lists with None in every column so that the isnull() based
# missing-value counts treat them as missing.
for column in articles.columns:
    articles[column] = articles[column].apply(apply_null)

In [None]:
# Number of missing values per column, most incomplete column first.
null_df = pd.DataFrame(articles.isnull().sum(axis = 0).sort_values(ascending =False), columns = ['missing_values'])
# Express the counts as a percentage of all rows in the article table.
null_df['missing_percentage'] = (null_df['missing_values']/articles.shape[0])*100
# Series.items() exists in both old and new pandas; iteritems() was removed in pandas 2.0.
text_hover = ['{} : {}%'.format(index,round(column,2)) for index, column in null_df['missing_percentage'].items()]
data = go.Bar(
    x=list(null_df.index),
    y=list(null_df['missing_percentage']),
    text = text_hover,
    hoverinfo = 'text'
)
data = [data]
layout_nulls = go.Layout(
    title = 'Percentage of Missing Values by column',
    # The x axis lists column names and the y axis shows percentages, so the
    # axis titles say so (they previously read 'Page Type' and 'Count').
    xaxis = dict(title = 'Column'),
    yaxis = dict(title = 'Missing Values (%)',
                 exponentformat='none',
                 showexponent='all'),
)
fig_nulls = dict(data=data, layout=layout_nulls)
pyoff.iplot(fig_nulls)

#### Number of pages by Template ID

In [None]:
# Bar colours used by the template chart.
green = 'rgb(102,170,85)'
blue = 'rgb(51,102,170)'

# Axis labels for the series that covers every article.
list_of_template_types_1 = [
    'Default', 'Authentication', 'Forms Service', 'List', 'Redirect', 'Home',
    'Blog Entry', 'Landing', 'Blog List', 'Faceted Search', 'Publication',
    'Publication List',
]
# The same labels without 'Faceted Search', used for the live-articles series.
list_of_template_types_2 = list(
    name for name in list_of_template_types_1 if name != 'Faceted Search'
)

# One (identical) colour per label in the first list.
list_of_template_colors = [green] * len(list_of_template_types_1)

In [None]:
# Per-template counts over the whole export (count of non-null ARTICLEDATE values).
groupby_template = articles.groupby('ARTICLETEMPLATEID').count()
# Series.items() exists in both old and new pandas; iteritems() was removed in pandas 2.0.
text_hover = ['Template ID: {}<br>Count: {}'.format(index,column) for index, column in groupby_template['ARTICLEDATE'].items()]
first_data = groupby_template['ARTICLEDATE']

# Parse the display end date after dropping the last six characters of the raw string.
articles['end_date'] = articles['ARTICLEDISPLAYEDATE'].str[:-6].apply(datetime.datetime.strptime, args = ['%B, %d %Y %H:%M:%S'])
# NOTE(review): the "live" subset below depends on the moment this cell is run.
today = datetime.datetime.now()
# Live articles: display switched on, end date still in the future, status 'Live'.
# .copy() yields an independent frame, so later cells can add columns to
# articles_displayed without triggering pandas' SettingWithCopyWarning.
articles_displayed = articles[(articles['ARTICLEDISPLAY'] == 'on')
                       &(articles['end_date'] > today)
                       &(articles['STATUS'] == 'Live')].copy()
second_data = articles_displayed.groupby('ARTICLETEMPLATEID').count()['ARTICLEDATE']
text_hover_2 = ['Template ID: {}<br>Count: {}'.format(index,column) for index, column in second_data.items()]

# Two bar series on shared axes: every article vs. the currently live ones.
# NOTE(review): the x labels are matched to the counts purely by position —
# confirm the label lists follow the order of the grouped template IDs.
all_articles_trace = go.Bar(
    name='All Articles',
    x=list_of_template_types_1,
    y=first_data,
    text=text_hover,
    hoverinfo='text',
    marker={'color': list_of_template_colors},
)
data2 = go.Bar(
    name='Live Articles',
    x=list_of_template_types_2,
    y=second_data,
    text=text_hover_2,
    hoverinfo='text',
    marker={'color': blue},
)
data = [all_articles_trace, data2]

layout_template = go.Layout(
    title='Number of Articles by Template Type',
    xaxis={'title': 'Template Type'},
    yaxis={'title': 'Count', 'exponentformat': 'none', 'showexponent': 'all'},
)
fig_template = {'data': data, 'layout': layout_template}
pyoff.iplot(fig_template)

#### Which articles contain media? 

In [None]:
# Flag each live article by whether it has media attached: 'No' when the
# Media field is missing, 'Yes' when it holds at least one entry.
articles_displayed.loc[articles_displayed['Media'].isnull(), 'medialen'] = 'No'
articles_displayed.loc[articles_displayed['Media'].str.len() > 0, 'medialen'] = 'Yes'
gb_media = articles_displayed.groupby('medialen').count()
# Series.items() exists in both old and new pandas; iteritems() was removed in pandas 2.0.
text_hover = ['Media : {}<br>Count : {}'.format(index,column) for index, column in gb_media['ARTICLEDATE'].items()]
data = go.Bar(
    # Take the labels from the grouped index rather than a hard-coded
    # ['No', 'Yes'], so bars and labels stay aligned even when one of the two
    # categories is absent from the data.
    x=list(gb_media.index),
    y=list(gb_media['ARTICLEDATE']),
    text = text_hover,
    hoverinfo = 'text'
)
data = [data]
layout_media = go.Layout(
    title = 'Number of Articles that Contain Media',
    xaxis = dict(title = 'Does article contain media?'),
    yaxis = dict(title = 'Count',
                 exponentformat='none',
                 showexponent='all'),
)
fig_media = dict(data=data, layout=layout_media)
pyoff.iplot(fig_media)

#### Media Data Analysis. 

In [None]:
media.head()

In [None]:
def get_media_len(files):
    """Return how many entries ``files`` holds (anything that supports ``len``)."""
    num_entries = len(files)
    return num_entries

In [None]:
media['num_files'] = media['Files'].apply(get_media_len)

#### How many media items does each row have in the 'Files' column?

In [None]:
pd.DataFrame(media.groupby('num_files').count()['DESCRIPTION'])

Below are some examples of rows where the 'Files' column holds more than one file variant.

In [None]:
media[media['num_files'] >1].head()

In [None]:
def get_media_file(files):
    """Return the value stored under the first key of the ``files`` dict.

    "First" means first in the dict's own iteration order. The values are
    presumably file names, one per variant — TODO confirm against the export.

    Raises IndexError when ``files`` is empty.
    """
    # list(files) works on Python 2 and 3 alike; files.keys()[0] fails on
    # Python 3 because dict views cannot be indexed.
    first_key = list(files)[0]
    return files[first_key]
media['file_name'] = media['Files'].apply(get_media_file)
media[media['num_files']>1][['MEDIAID','Files','file_name']].head()

####  What are the media types in the GOSS Export?

In [None]:
def get_media_type(files):
    """Return the text after the last '.' of the first value in ``files``.

    "First" means first in the dict's own iteration order. A value without
    any '.' is returned whole.

    Raises IndexError when ``files`` is empty.
    """
    # list(files) works on Python 2 and 3 alike; files.keys()[0] fails on
    # Python 3 because dict views cannot be indexed.
    first_key = list(files)[0]
    return files[first_key].split('.')[-1]
media['media_type'] = media['Files'].apply(get_media_type)

In [None]:
# Number of media rows per file extension (count of non-null TITLE values).
gb_media_type = media.groupby('media_type').count()
# Series.items() exists in both old and new pandas; iteritems() was removed in pandas 2.0.
text_hover = ['Media : {}<br>Count : {}'.format(index,column) for index, column in gb_media_type['TITLE'].items()]
data = go.Bar(
    x=list(gb_media_type.index),
    y=list(gb_media_type['TITLE']),
    text = text_hover,
    hoverinfo = 'text'
)
data = [data]
# Note: this rebinds layout_media / fig_media from the earlier media chart.
layout_media = go.Layout(
    title = 'Media Types',
    xaxis = dict(title = 'Media Type'),
    yaxis = dict(title = 'Count',
                 exponentformat='none',
                 showexponent='all'),
)
fig_media = dict(data=data, layout=layout_media)
pyoff.iplot(fig_media)

### Examining Topics and Sub Topics - Metadata

As shown below, 2866 of the articles contain metadata. These were suspected to be mostly publications, and the counts below confirm it: 2611 of them are publications.

In [None]:
def extract_topics(metadata, metadata_type):
    """Collect the ``VALUE`` of every metadata entry whose ``GROUP`` matches.

    Parameters
    ----------
    metadata : iterable of dict, None or NaN
        Entries carrying ``GROUP`` and ``VALUE`` keys. ``None`` and float
        ``NaN`` (pandas' missing-value marker) both mean "no metadata".
    metadata_type : str
        The ``GROUP`` value to select, e.g. ``'Topic'``.

    Returns
    -------
    list or None
        The matching values in their original order, or ``None`` when there
        is no metadata or no entry matches.
    """
    # NaN is the only float that differs from itself. Treat it like None so a
    # column whose gaps are NaN does not crash with "float is not iterable".
    if metadata is None or (isinstance(metadata, float) and metadata != metadata):
        return None
    list_of_topics = [item['VALUE'] for item in metadata if item['GROUP'] == metadata_type]
    # An empty result is reported as None, same as missing metadata.
    return list_of_topics or None
        
def get_len(field):
    """Return ``len(field)``, or ``None`` when ``field`` is missing (``None`` or NaN)."""
    # NaN is the only float that differs from itself. Treat it like None so a
    # column whose gaps are NaN does not crash inside len().
    if field is None or (isinstance(field, float) and field != field):
        return None
    return len(field)

Number of Articles with Metadata

In [None]:
# Live articles whose Metadata field is present.
len(articles_displayed[articles_displayed['Metadata'].notnull()])

Number of Publications with Metadata

In [None]:
# Live articles that have metadata and use template ID 23.
has_metadata = articles_displayed['Metadata'].notnull()
uses_template_23 = articles_displayed['ARTICLETEMPLATEID'] == 23
len(articles_displayed[has_metadata & uses_template_23])

In [None]:
# Keep only the articles that carry metadata, then flatten their per-article
# metadata lists into one frame with a row per metadata entry.
metadata = articles[articles['Metadata'].notnull()]
flattened_entries = []
for sublist in metadata['Metadata']:
    flattened_entries.extend(sublist)
metadata_group = pd.DataFrame(flattened_entries)

The counts of each value of the GROUP field in the metadata are shown below. There are 6038 Topics and 3587 Sub-Topics, meaning that some articles must contain more than one of these fields.

In [None]:
metadata_group.groupby('GROUP').count()['VALUE']

In [None]:
# Per-article lists of 'Topic' and 'Sub-Topic' metadata values; extract_topics
# returns None where an article has no matching entries.
articles['topics'] = articles['Metadata'].apply(extract_topics, args = ['Topic'])
articles['sub-topics'] = articles['Metadata'].apply(extract_topics, args = ['Sub-Topic'])
# Length of each list; get_len returns None where the list itself is None.
articles['num_topics'] = articles['topics'].apply(get_len)
articles['num_subtopics'] = articles['sub-topics'].apply(get_len)

Below are two sections, showing the counts of the number of topics and subtopics associated with articles.

#### Topics 

In [None]:
print('''Number of Topics: {}'''.format(len(metadata_group[metadata_group['GROUP'] == 'Topic'].groupby('VALUE').count())))

List of the topics and the number of times they appear in the data.

In [None]:
metadata_group[metadata_group['GROUP'] == 'Topic'].groupby('VALUE').count()['GROUP']

#### How many topics per article?

In [None]:
topics = pd.DataFrame(articles.groupby('num_topics').count()['ARCHIVEDATE'])
topics.columns = ['Count']
topics

Here is a list of some of the articles with 48 topics.

In [None]:
articles[articles['num_topics'] == 48.0].head()

#### Sub-Topics

In [None]:
print('There are {} sub topics.'.format(len(metadata_group[metadata_group['GROUP'] == 'Sub-Topic'].groupby('VALUE').count())))

In [None]:
metadata_group[metadata_group['GROUP'] == 'Sub-Topic'].groupby('VALUE').count()

In [None]:
subtopics = pd.DataFrame(articles.groupby('num_subtopics').count()['ARCHIVEDATE'])
subtopics.columns = ['Count']
subtopics

### GOSS Redirects 

This section will examine the redirects in GOSS.

In [None]:
# Live articles that use template ID 13 and have display switched on.
redirects_on_display = articles_displayed[
    (articles_displayed['ARTICLETEMPLATEID'] == 13)
    & (articles_displayed['ARTICLEDISPLAY'] == 'on')
]
print('''Number of redirect articles: {}'''.format(len(redirects_on_display)))

Example of a redirect address.

In [None]:
# Show the Links field of one template-13 article, selected by index label.
# NOTE(review): 121 is a hard-coded index label; the lookup fails if no live
# template-13 article carries that label — confirm it exists in the export in use.
articles_displayed[articles_displayed['ARTICLETEMPLATEID'] == 13].loc[121]['Links']

In [None]:
# na=False treats articles without a heading as non-matches; without it a
# missing heading puts NaN into the mask and the boolean selection fails.
articles[articles['ARTICLEHEADING'].str.contains('Commissioning Data Sets', na=False)]

## Examining the 'Extra' part of the metadata (for list pages). 

In [None]:
list_pages = articles_displayed[articles_displayed['ARTICLETEMPLATEID'] == 7]

In [None]:
print('''Number of List Pages: {}'''.format(len(list_pages)))

In [None]:
list_pages.groupby('Extra.SHOWHIDEANCHORS').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.INCLUDERELATEDFEATURES').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.INCLUDECHILDARTICLES').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.INCLUDERELATEDARTICLES').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.INCLUDEMETADATAARTICLES').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.INCLUDEMETADATAMEDIA').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.INCLUDETEMPLATES').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.INCLUDEMEDIATYPES').count()['ARTICLEDATE']

In [None]:
list_pages.groupby('Extra.COLUMNS').count()['ARTICLEDATE']

### Articles Soon to Expire

In [None]:
# Take four characters of the raw display-end-date string by position
# (offsets -19 to -15 from the end) and store them as 'year_expiry'.
# NOTE(review): presumably these are the year digits; this relies on a
# fixed-width tail in the date string — confirm against the export format.
articles_displayed.loc[:,'year_expiry'] = articles_displayed.loc[:, 'ARTICLEDISPLAYEDATE'].str[-19:-15]

In [None]:
articles_displayed.groupby('year_expiry').count()['ARCHIVEDATE']

In [None]:
articles_displayed[articles_displayed['year_expiry'] == '2020']['ARTICLETEMPLATEID']

In [None]:
articles_displayed[articles_displayed['year_expiry'] == '2020']

### Articles Start Date