In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Untagged data

These data were written out from the clean_content.py script where the taxons column was empty. 
- Here we assume the taxon column was empty because the content item has not been tagged.

In [2]:
#read in untagged content to describe content with no taxons
untagged = pd.read_csv('../../data/untagged_content.csv')

In [3]:
print("There are {} rows in the untagged content data".
      format(untagged.shape[0]))
print("There are {} unique content items in the untagged content data".
      format(untagged.content_id.nunique()))

There are 57337 rows in the untagged content data
There are 57123 unique content items in the untagged content data


In [4]:
untagged.columns

Index(['Unnamed: 0', 'base_path', 'content_id', 'description', 'details',
       'document_type', 'first_published_at', 'locale',
       'primary_publishing_organisation', 'publishing_app', 'taxons', 'title',
       'body'],
      dtype='object')

In [5]:
#convert string dates to timestamp for time series analyses (see below)
print(type(untagged['first_published_at'][0]))
untagged['first_published_at'] = pd.to_datetime(untagged['first_published_at'])
print(type(untagged['first_published_at'][0]))

<class 'str'>
<class 'pandas._libs.tslib.Timestamp'>


In [6]:
#use timestamp as index in untagged data for plots
untagged.index = untagged['first_published_at'] 

# Taxon data

Taxons data is a row for each taxon with columns for the taxon_id/taxon title at each level. So, for example, if an item has only been tagged to level1 then level2 and subsequent levels will be missing. If an item was tagged to level3, the level2 and level1 columns have been filled recursively. 

A taxon in taxons is identified through content_id

In [306]:
#read in taxon file which was cleaned from raw using clean_taxons.py
taxons = pd.read_csv('../../data/clean_taxons.csv')

In [307]:

# Convert nans to none

# taxons['level1taxon'] = taxons['level1taxon'].where(taxons['level1taxon'] != 'nan', None)
# taxons['level2taxon'] = taxons['level2taxon'].where(taxons['level2taxon'].notnull(), None)

# print("Taxons shape after converting nans to Nones: {}".format(taxons.shape))

In [308]:
taxons.columns

Index(['Unnamed: 0', 'base_path', 'content_id', 'taxon_name', 'level1',
       'level2tax_id', 'level3tax_id', 'level4tax_id', 'level1taxon',
       'level2taxon', 'level3taxon', 'level4taxon'],
      dtype='object')

In [309]:
taxons = taxons[['base_path','content_id','taxon_name','level1taxon','level2taxon','level3taxon','level4taxon']].copy()

In [310]:
taxons

Unnamed: 0,base_path,content_id,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon
0,/business/taxon,495afdb6-47be-4df1-8b38-91c8adb1eefc,Business,,,,
1,/corporate-information,a544d48b-1e9e-47fb-b427-7a987c658c14,Corporate information,,,,
2,/crime-justice-and-law,ba951b09-5146-43be-87af-44075eac3ae9,"Crime, justice and law",,,,
3,/defence,e491505c-77ae-45b2-84be-8c94b94f6a2b,Defence,,,,
4,/childcare-parenting/entering-staying-uk,ba3a9702-da22-487f-86c1-8334a730e559,Entering and staying in the UK,,,,
5,/environment,3cf97f69-84de-41ae-bc7b-7e2cc238fa58,Environment,,,,
6,/going-and-being-abroad,9597c30a-605a-4e36-8bc1-47e5cdae41b3,Going and being abroad,,,,
7,/government/taxon,e48ab80a-de80-4e83-bf59-26316856a5f9,Government,,,,
8,/health-and-social-care,8124ead8-8ebc-4faf-88ad-dd5cbcc92ba8,Health and social care,,,,
9,/housing-local-and-community,4794066e-e3cc-425e-8cc4-e7ff3edb4c39,"Housing, local and community",,,,


In [294]:
#For top taxons (level1) ensure that taxon)name is in level1taxon column instead of Nan
taxons['level1taxon'] = taxons['level1taxon'].fillna(taxons['taxon_name'])



In [295]:
import functools
def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

In [298]:
mask = taxons['level1taxon'] != taxons['taxon_name'], taxons['level2taxon'].isna()
taxons['level2taxon'] = taxons['level2taxon'][conjunction(mask[0], mask[1])].fillna(taxons['taxon_name'])
#taxons['level2taxon'] = taxons['level2taxon'][l1_taxnam & l2_miss].fillna(taxons['taxon_name'])
taxons[35:300]

Unnamed: 0,base_path,content_id,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon
35,/imported-topic/topic/environmental-management...,1756186f-3c7a-4112-a7ad-52a369baa01f,Hydropower,Business,Hydropower,,
36,/imported-topic/topic/environmental-management...,75ba781f-1bbb-4b11-89db-aad08e9a9259,Drought and water availability,Business,Drought and water availability,,
37,/imported-topic/topic/environmental-management...,3695b5b0-f11c-49d1-bb56-01c730c86347,Water quality,Business,Water quality,,
38,/imported-topic/topic/environmental-management...,9a53f7cc-effc-4058-829b-6b86ac1650d9,"Registers, maps and data",Business,"Registers, maps and data",,
39,/imported-topic/topic/environmental-management...,25789f03-2cd1-4aad-93ee-23886e70c8d6,Water pollution,Business,Water pollution,,
40,/imported-topic/topic/environmental-management...,42c1a780-a448-4653-9892-68e3981a000f,Impound (store) water,Business,Impound (store) water,,
41,/imported-topic/topic/environmental-management...,080e8f8f-2504-4770-bce8-961b0496aa01,Discharge water,Business,Discharge water,,
42,/imported-topic/topic/environmental-management...,8e16dfc3-74b1-41a1-a9bb-219155bccf78,Abstract (take) water,Business,Abstract (take) water,,
43,/business/export-finance,f0a12803-b660-4843-a2d2-e87d1081c914,Export finance,Business,Export finance,,
44,/business/importing-exporting,598b403a-b16d-470e-acca-74b7e1d2ed3f,Trade restrictions on exports,Business,Trade restrictions on exports,,


In [299]:
mask = taxons['level2taxon'] != taxons['taxon_name'], taxons['level3taxon'].isna(), taxons['level2taxon'].notnull()
taxons['level3taxon'] = taxons['level3taxon'][conjunction(mask[0], mask[1], mask[2])].fillna(taxons['taxon_name'])
#taxons['level3taxon'] = taxons['level3taxon'][l2_fill & l3_miss & l2_taxnam].fillna(taxons['taxon_name'])
taxons[30:200]

Unnamed: 0,base_path,content_id,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon
30,/business/financial-services,d7e02519-b7ba-4297-80a8-01cd0e5f2cb2,Financial services,Business,,,
31,/business/business-and-the-environment,13fb7519-c48e-4577-90f8-aaf483fa0c03,Business and the environment,Business,,,
32,/business/tourism,96c479bc-9b6c-49d3-8e86-3821f43a6fb0,Tourism,Business,,,
33,/business/postal-service-reform,568b53ae-bfb3-4e8b-8f02-cf333af97edd,Postal service reform,Business,,,
34,/business/industrial-strategy,74f7449e-08f8-4325-b8db-3703cb99f4d0,Industrial strategy,Business,,,
35,/imported-topic/topic/environmental-management...,1756186f-3c7a-4112-a7ad-52a369baa01f,Hydropower,Business,Hydropower,,
36,/imported-topic/topic/environmental-management...,75ba781f-1bbb-4b11-89db-aad08e9a9259,Drought and water availability,Business,Drought and water availability,,
37,/imported-topic/topic/environmental-management...,3695b5b0-f11c-49d1-bb56-01c730c86347,Water quality,Business,Water quality,,
38,/imported-topic/topic/environmental-management...,9a53f7cc-effc-4058-829b-6b86ac1650d9,"Registers, maps and data",Business,"Registers, maps and data",,
39,/imported-topic/topic/environmental-management...,25789f03-2cd1-4aad-93ee-23886e70c8d6,Water pollution,Business,Water pollution,,


In [300]:
mask = taxons['level3taxon'] != taxons['taxon_name'], taxons['level2taxon'] != taxons['taxon_name'], taxons['level4taxon'].isna(), taxons['level3taxon'].notnull()
taxons['level4taxon'] = taxons['level4taxon'][conjunction(mask[0], mask[1], mask[2], mask[3])].fillna(taxons['taxon_name'])
#taxons['level4taxon'] = taxons['level4taxon'][l3_fill & l4_miss & l3_taxnam].fillna(taxons['taxon_name'])

In [304]:

mask = taxons['level4taxon'] != taxons['taxon_name'], taxons['level3taxon'] != taxons['taxon_name'], taxons['level2taxon'] != taxons['taxon_name'], taxons['level4taxon'].notnull()
taxons['level5taxon'] = np.nan

taxons['level5taxon'] = taxons['level5taxon'][conjunction(mask[0], mask[1], mask[2], mask[3])].fillna(taxons['taxon_name'])



In [305]:
taxons


Unnamed: 0,base_path,content_id,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon,level5taxon
0,/business/taxon,495afdb6-47be-4df1-8b38-91c8adb1eefc,Business,Business,,,,
1,/corporate-information,a544d48b-1e9e-47fb-b427-7a987c658c14,Corporate information,Corporate information,,,,
2,/crime-justice-and-law,ba951b09-5146-43be-87af-44075eac3ae9,"Crime, justice and law","Crime, justice and law",,,,
3,/defence,e491505c-77ae-45b2-84be-8c94b94f6a2b,Defence,Defence,,,,
4,/childcare-parenting/entering-staying-uk,ba3a9702-da22-487f-86c1-8334a730e559,Entering and staying in the UK,Entering and staying in the UK,,,,
5,/environment,3cf97f69-84de-41ae-bc7b-7e2cc238fa58,Environment,Environment,,,,
6,/going-and-being-abroad,9597c30a-605a-4e36-8bc1-47e5cdae41b3,Going and being abroad,Going and being abroad,,,,
7,/government/taxon,e48ab80a-de80-4e83-bf59-26316856a5f9,Government,Government,,,,
8,/health-and-social-care,8124ead8-8ebc-4faf-88ad-dd5cbcc92ba8,Health and social care,Health and social care,,,,
9,/housing-local-and-community,4794066e-e3cc-425e-8cc4-e7ff3edb4c39,"Housing, local and community","Housing, local and community",,,,


# Content data

These data were created in clean_content.py so that each row represents a single content-taxon pair. There can be multiple rows for a content item (content_id) if it has been tagged to multiple taxons (taxon_id).  

In [None]:
#read in content items file which was cleaned from raw using clean_content.py

content = pd.read_csv('../../data/clean_content.csv.gz', compression='gzip')

In [None]:
content.columns

## All content with no filtering by taxon

In [None]:
allcontent_taxons = pd.merge(
    left=content, 
    right=taxons, 
    left_on='taxon_id', #which taxon is the content item tagged to
    right_on='content_id', #what is the id of that taxon
    how='outer', #keep everything for checking merge
    indicator=True #so we can filter by match type
)

In [None]:
allcontent_taxons.shape

In [None]:
#tidy column names
allcontent_taxons.rename(columns={'base_path_x': 'base_path', 
                               'content_id_x': 'content_id'}, inplace=True)

#### All content with no filtering by taxon

In [None]:
#intersection of join only
allcontent_taxons = allcontent_taxons[allcontent_taxons._merge == 'both']

In [None]:
allcontent_taxons.shape

In [None]:
#remove duplicates if both content_id and taxon_id are the same. 
allcontent_taxons = allcontent_taxons.drop_duplicates(subset = ['content_id', 'taxon_id'])

In [None]:
allcontent_taxons.shape

## Filter by taxon to exclude specific taxons from prediction activities

Current approach: Take out World and Corporate top taxons   
Must consider that the data which we will predict on needs to come from the same population as training data and it is hard to filter the unlabelled data to remove World & Corporate (unless they are perfectly predicted by a meta var such as documnet type). It may be safer to keep them in the training data, predict on all data and act differently if World/Corporate is predicted?

In [None]:
taxons.shape

In [None]:
taxons = taxons[taxons.level1taxon != 'World']
print("Taxons shape after deleting 'World' top taxons: {}".format(taxons.shape))

In [None]:
taxons = taxons[taxons.level1taxon != 'Corporate information']
print("Taxons shape after deleting 'corporate information' top taxons: {}".format(taxons.shape))

Is it possible that corporate information has already been excluded from the taxons file? Need to re-consider this approach

In [None]:
content_taxons = pd.merge(
    left=content, 
    right=taxons, 
    left_on='taxon_id', 
    right_on='content_id', 
    how='outer', 
    indicator=True
)

In [None]:
print(content_taxons['_merge'].value_counts())
print("There are {} tagged content items/taxon combinations with a matching taxon"
      .format(content_taxons['_merge'].value_counts()[2]))
print("There are {} content items/taxon combinations without a matching taxon. Are these untagged content?"
      .format(content_taxons['_merge'].value_counts()[0]))
print("There are {} /taxons with nothing tagged to them"
      .format(content_taxons['_merge'].value_counts()[1]))


### Explore the left_only content

In [None]:
content_only_taxons = content_taxons[['base_path_x','combined_text','details','taxon_id']][content_taxons._merge == 'left_only']

In [None]:
print("There are {} taxons represented in the {} content item/taxon combinations which have no corresponding taxon in the taxon data"
      .format(content_only_taxons.taxon_id.nunique(), content_only_taxons.shape[0]))

In [None]:
print("There are {} content items/taxon combinations with missing taxon because these were removed during taxon_clean.py"
      .format(content_only_taxons[content_only_taxons.taxon_id.isnull()].shape[0]))

In [None]:
#save out for devs to check reason these do not have a match in taxon file
#content_only_taxons.to_csv('../../data/content_with_taxonid_no_matching_taxon.csv', 
                           #index=False)

In [None]:
# np.savetxt('../../data/taxonid_no_matching_taxon.gz', 
#            content_only_taxons.taxon_id.unique(),fmt='%5s', delimiter=',')

Devs did some spot checks on these and some of these taxons were not part of the topic taxonomy so did not have a match in the topic taxonomy file. Others are in the World branch of the taxonomy.

need to add these to untaggedd population


## Tidy the content-taxon df

In [None]:
content_taxons.shape

In [None]:
content_taxons.head()

#### tidy columns

In [None]:
content_taxons.columns

In [None]:
#drop some cols
content_taxons = content_taxons.drop(['Unnamed: 0', 'variable', 'base_path_y', 
                                      'content_id_y'], axis=1)

In [None]:
#rename some cols
content_taxons.rename(columns={'base_path_x': 'base_path', 
                               'content_id_x': 'content_id'}, inplace=True)

## Filter content-taxons data:
- remove duplicates
- remove mismatches from merge

In [None]:
#count duplicates
print("There are {} rows in the data before filtering".
      format(content_taxons.shape[0]))
print("There are {} unique content items in the data before filtering".
      format(content_taxons.content_id.nunique()))


In [None]:
# Drop any rows which were not perfectly matched in taxons and content

content_taxons_filtered = content_taxons[content_taxons._merge == 'both']

print("There are {} rows in the taxon-level data after filtering out mismatches".
      format(content_taxons_filtered.shape[0]))
print("There are {} unique content items in the taxon-level data after filtering out mismatches".
      format(content_taxons_filtered.content_id.nunique()))
print("There were {} rows dropped because of mismatching"
      .format(content_taxons.shape[0] - content_taxons_filtered.shape[0]))
print("There were {} unique content items dropped because of mismatching"
      .format(content_taxons.content_id.nunique() - content_taxons_filtered.content_id.nunique()))

In [None]:
print("Before removing mismatches, there were {} duplicates content items, both with matching content_id "
      "and taxon_id"
      .format(content_taxons[content_taxons.duplicated(['content_id', 'taxon_id'])].shape[0]))
print("After removing mismatches, there were {} duplicates content items, both with matching "
      "content_id and taxon_id"
      .format(content_taxons_filtered[content_taxons_filtered.
                                      duplicated(['content_id', 'taxon_id'])].shape[0]))


In [None]:
#drop duplicates
content_taxons_dedup = content_taxons_filtered.drop_duplicates(subset = ['content_id', 'taxon_id'])

In [None]:
print("There were {} additional rows dropped due to duplicate content_id/taxon_id combination"
      .format(content_taxons_filtered.shape[0] - content_taxons_dedup.shape[0]))
print("There were {} additional content items dropped due to duplicate content_id/taxon_id combination"
      .format(content_taxons_filtered.content_id.nunique() - content_taxons_dedup.content_id.nunique()))

In [None]:
#assert content_taxons_dedup.shape == (2029084, 18)
content_taxons_dedup.shape

In [None]:
#content_taxons_dedup.head()

# Taxons

In [None]:
#Count taxons by level1 and level2
mask = taxons['level1taxon'].notnull() & taxons['level2taxon'].isnull()

print("There are {} out of {} taxons with a level2"
      .format(sum(taxons['level2taxon'].notnull()), taxons.shape[0]))
print("There are {} out of {} taxons with a level1 tag, but no level2 tag"
      .format(sum(mask), taxons.shape[0]))
taxons

OH no! Think that taxon name should be next level for everything....

### How much stuff is only tagged to level 1?

In [None]:
mask = content_taxons_dedup['level1taxon'].notnull() & content_taxons_dedup['level2taxon'].isnull()

print("There are {} out of {} content items with no level1 tag"
      .format(sum(content_taxons_dedup['level1taxon'].isnull()), content_taxons_dedup.shape[0]))
print("There are {} out of {} content items with a level2 tag"
      .format(sum(content_taxons_dedup['level2taxon'].notnull()), content_taxons_dedup.shape[0]))
print("There are {} out of {} content items with a level1 tag, but no level2 tag"
      .format(sum(mask), content_taxons_dedup.shape[0]))

## Level1  taxons

Expecting 19 top taxons (plus `None`)

In [None]:
assert content_taxons_dedup.level1taxon.nunique() == 19

print("There are {} unique taxon names in the level 1 taxon"
      .format(content_taxons_dedup.level1taxon.nunique()))

set(content_taxons_dedup.level1taxon)

In [None]:
topfreq = content_taxons_dedup.groupby('level1taxon').size()
topfreq.sort_values(ascending=False)

In [None]:
topfreq.sort_values().plot(kind = 'barh', figsize=(20, 20))

In [None]:
top_doctype = pd.crosstab(content_taxons_dedup['document_type'], content_taxons_dedup['level1taxon'])
fig, ax = plt.subplots(figsize=(10,10)) 
sns.heatmap(top_doctype, cmap = "YlGnBu")

In [None]:
top_pubapp = pd.crosstab(content_taxons_dedup['publishing_app'], content_taxons_dedup['level1taxon'])
fig, ax = plt.subplots(figsize=(10,5))
sns.heatmap(top_pubapp, cmap = "YlGnBu")

In [None]:
content_taxons_dedup.groupby('level1taxon').size().sort_values(ascending=False)

### Level 2 taxons

In [None]:
#assert content_taxons_dedup.level2taxon.nunique() == 103

print("There are {} unique taxon names in the level 2 taxon"
      .format(content_taxons_dedup.level2taxon.nunique()))

In [None]:
second_freq = content_taxons_dedup.groupby('level2taxon').size().sort_values(ascending=False)

# Print as string to get around truncation

print(second_freq.to_string())



In [None]:
second_freq.sort_values().plot(kind = 'barh', figsize=(10,30))

In [None]:
level1_1 = pd.crosstab(content_taxons_dedup['level1taxon'], content_taxons_dedup['level1taxon'])

In [None]:
# fig, ax = plt.subplots(figsize=(10,5))
# sns.heatmap(level1_1, cmap = "YlGnBu")

### Mutually exclusive taxon combinations (level1 and level2)

In [None]:
#Only keep rows where level1/level2 combination is unique
level2_dedup = content_taxons_dedup.drop_duplicates(subset = ['content_id', 'level1taxon', 'level2taxon']).copy()
#Identify and drop rows where level2 is missing
mask= pd.notnull(level2_dedup['level2taxon'])
level2_tagged = level2_dedup[mask]

In [None]:
#concatenate the name of each level2taxon for a single content item
level2_tagged = level2_tagged.groupby('content_id')['level2taxon'].apply(','.join).reset_index()
print("there are {} mutually exclusive combinations of level2 taxon combinations.".format(level2_tagged.level2taxon.nunique()))

In [None]:
#Get frequency counts for each of the mutually exclusive taxon2 combinations
mutualex_freq = level2_tagged.groupby('level2taxon').size().sort_values(ascending=True)

#Keep those with higher frequency
mutualex_freq_top = mutualex_freq[mutualex_freq > 30]
print("There are {} mutually exclusive combinations of level2 taxon combinations populated with more than 30 content items".format(len(mutualex_freq_top)))

mutualex_freq_top.plot(kind = 'barh', figsize=(10,30))

In [None]:
# Print as string to get around truncation

print(mutualex_freq.to_string())

# Document type distribution

## Compare document type in untagged, raw tagged, deduped-filtered tagged

the untagged content appears to have a different distribution of document type compared to the tagged content, both before and after filtering the World/Corporate categories and deduplication. 

For example, relatively few guidance , policy paper and research documents are untagged. World_news_story, foi releases are over-represented in untagged data compared to tagged data.

This is likely to result in a drop inaccuracy when moving from modelling data to predicting for untagged data.

In [None]:
untagged.groupby('document_type').size().sort_values(ascending=True).plot(kind = 'barh', figsize=(20, 20))

In [None]:
allcontent_taxons.groupby('document_type').size().sort_values(ascending=True).plot(kind = 'barh', figsize=(20, 20))

In [None]:
content.groupby('document_type').size().sort_values(ascending=True).plot(kind = 'barh', figsize=(20, 20))

## Time series analyses of content type

What's the frequency of publications over time?

In [None]:
#untagged['first_published_at'].resample('Y').count().plot()

In [None]:
singlelabel = content_taxons_dedup.drop_duplicates('content_id').reset_index(drop=True)
print(singlelabel['first_published_at'][singlelabel['first_published_at'].str.contains('0001-01-01', na=False)])

singlelabel.drop(singlelabel.index[[58843]], inplace=True)
singlelabel['first_published_at'] = pd.to_datetime(singlelabel['first_published_at'])
singlelabel.index = singlelabel['first_published_at'] 

Focus: since 2000

In [None]:
fig = plt.figure(figsize=(15, 7))
plt.subplot(2, 1, 1)
lab=singlelabel['first_published_at'].resample('Y').count().plot()
lab.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2017-12-31'))
plt.title('Labelled data')
plt.ylabel('Count')
lab.set_ylim([0, 30000])
lab.axes.get_xaxis().set_ticklabels([])
lab.set_xlabel('')

plt.subplot(2, 1, 2)
plt.title('Unlabelled data')
unlab = untagged['first_published_at'].resample('Y').count().plot()
unlab.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2017-12-31'))
unlab.set_ylim([0, 30000])

fig.tight_layout()


### What's the publication frequency over time by document type?
### Unlabelled

In [None]:
grouped = untagged.groupby(['document_type', pd.Grouper(freq='Y')])['first_published_at'].count()
count_by_year = grouped.unstack('document_type', fill_value=0)

In [None]:
#There are too many document types to plot on one chart so select the types with highest maximum
top_count = count_by_year.loc[:,count_by_year.max() > 500]
ax = top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
#ax = grouped.unstack('document_type', fill_value=0).plot()
#ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

in order to compare distribution of document types over time with the labelled data, this needs to be relative not absolute. So change into percent of documents published that year

In [None]:
df = count_by_year.stack().reset_index()
df.columns = ['date', 'document_type', 'percent']
bydoctype_year = df.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
bydoctype_pcts = bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

bydoctype_pcts.columns = bydoctype_pcts.columns.droplevel(0)
bydoctype_pcts.columns

In [None]:

top_pct = bydoctype_pcts.loc[:,bydoctype_pcts.max() > 15]
print(top_pct.columns.values)

# ax = top_pct.plot(kind='area', stacked=True)

# ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
# ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# plt.title("Labelled data")

### Labelled

In [None]:
l_grouped = singlelabel.groupby(['document_type', pd.Grouper(freq='Y')])['first_published_at'].count()
l_count_by_year = l_grouped.unstack('document_type', fill_value=0)

In [None]:
#There are too many document types to plot on one chart so select the types with highest maximum
l_top_count = l_count_by_year.loc[:,l_count_by_year.max() > 1000]
ax = l_top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
df_lab = l_count_by_year.stack().reset_index()
df_lab.columns = ['date', 'document_type', 'percent']
l_bydoctype_year = df_lab.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
l_bydoctype_pcts = l_bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

l_bydoctype_pcts.columns = l_bydoctype_pcts.columns.droplevel(0)

In [None]:
l_top_pct = l_bydoctype_pcts[['correspondence', 'decision'
 , 'fatality_notice',  'foi_release',
  'guidance', 'independent_report', 'international_treaty', 'news_story', 'research', 'world_news_story']]

# ax = utop_pct.plot(kind='area', stacked=True)
# plt.title('Unlabelled')
# ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
# ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
type(l_bydoctype_pcts)

#### level1tagged compared to level2tagged

In [None]:
print(level2_dedup['first_published_at'][level2_dedup['first_published_at'].str.contains('0001-01-01', na=False)])
level2_dedup['first_published_at'] = level2_dedup['first_published_at'].str.replace('0001-01-01', '2001-01-01')

print("There were {} content item/taxons before removing duplicates".format(content_taxons_dedup.shape[0]))
print("There were {} content items, unique level2 taxon pairs after removing duplicates by content_id, level1taxon and level2taxon".format(level2_dedup.shape[0]))
mask= pd.notnull(level2_dedup['level2taxon'])
level1_tagged = level2_dedup[~mask].copy()
print("There are {} content items only tagged to level1".format(level1_tagged.shape[0]))
level2_tagged = level2_dedup[mask].copy()
print("There are {} content items tagged to level2 or lower".format(level2_tagged.shape[0]))

print("{} + {} = {}".format(level1_tagged.shape[0], level2_tagged.shape[0], (level1_tagged.shape[0] + level2_tagged.shape[0])) )

In [None]:
#level1
level1_tagged['first_published_at'] = pd.to_datetime(level1_tagged['first_published_at'])
level1_tagged.index = level1_tagged['first_published_at']


l1_grouped = level1_tagged.groupby(['document_type', pd.Grouper(freq='Y')])['first_published_at'].count()
l1_count_by_year = l1_grouped.unstack('document_type', fill_value=0)
#There are too many document types to plot on one chart so select the types with highest maximum
l1_top_count = l1_count_by_year.loc[:,l1_count_by_year.max() > 1000]
ax = l1_top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
df_l1 = l1_count_by_year.stack().reset_index()
df_l1.columns = ['date', 'document_type', 'percent']
l1_bydoctype_year = df_l1.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
l1_bydoctype_pcts = l1_bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

l1_bydoctype_pcts.columns = l1_bydoctype_pcts.columns.droplevel(0)
l1_top_pct = l1_bydoctype_pcts.loc[:,l1_bydoctype_pcts.max() > 10]
l1_top_pct.columns.values

In [None]:
#level2
level2_tagged['first_published_at'] = pd.to_datetime(level2_tagged['first_published_at'])
level2_tagged.index = level2_tagged['first_published_at']


l2_grouped = level2_tagged.groupby(['document_type', pd.Grouper(freq='Y')])['first_published_at'].count()
l2_count_by_year = l2_grouped.unstack('document_type', fill_value=0)
#There are too many document types to plot on one chart so select the types with highest maximum
l2_top_count = l2_count_by_year.loc[:,l2_count_by_year.max() > 1000]

#Graph (absolute numbers)
ax = l2_top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

#Relative
df_l2 = l2_count_by_year.stack().reset_index()
df_l2.columns = ['date', 'document_type', 'percent']
l2_bydoctype_year = df_l2.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
l2_bydoctype_pcts = l2_bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

l2_bydoctype_pcts.columns = l2_bydoctype_pcts.columns.droplevel(0)
l2_top_pct = l2_bydoctype_pcts[['decision', 'guidance', 'independent_report',
       'international_treaty', 'national_statistics', 'news_story',
       'policy_paper', 'press_release', 'research']]

In [None]:
#Compare document type distribution over time beween content items tagged to level1 only and those tagged to level2 or lower
lev1 = l1_top_pct.plot(kind='area', stacked=True)
lev1.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
lev1.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title("Percent level1")

lev2 = l2_top_pct.plot(kind='area', stacked=True)
lev2.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
lev2.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title("Percent level2")

## Compare labelled/unlabelled

In [None]:
#TO DO: SORT OUT SUBPLOTS
lab = l_top_pct.plot(kind='area', stacked=True)
lab.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
lab.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title("Percent unlabelled")

unlab = top_pct.plot(kind='area', stacked=True)
unlab.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2017-12-31'))
unlab.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title("Percent labelled")

To DO: 
- work out subplots
- Look at differences in publishin body/publishing app/ locale/tokenised sequence length over time  
- Tidy up notebook
