In [2]:
import numpy as np
import pandas as pd
import os
import re
import yaml
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

In [3]:
DATADIR = os.getenv('DATADIR')
DATADIR

'/data/2018-03-27'

In [4]:
print("model v2.0.0 was run on data dated {}".format(DATADIR))

model v2.0.0 was run on data dated /data/2018-03-27


## 1. Read in data

In [5]:
full_content = pd.read_csv(
    os.path.join(DATADIR, 'full_content.csv.gz'),
    dtype=object,
    compression='gzip', 
    usecols=['content_id']
)

In [6]:
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level2.csv.gz'),
    dtype=object,
    compression='gzip', 
    usecols=['content_id']
)

In [7]:
unlabelled = pd.read_csv(
    os.path.join(DATADIR, 'unlabelled_predictions_meta.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [8]:
unlabelled_unique_content = unlabelled.drop_duplicates('content_id').copy()

In [9]:
prim_pub_map = pd.read_csv(
    os.path.join(DATADIR, 'bulkmap_pub_org_to_taxon.csv'),
    dtype=object,
    skipfooter=1,
    engine='python'
)

In [10]:
taxons = pd.read_csv(
    os.path.join(DATADIR, 'clean_taxons.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [11]:
taxon_names = taxons['taxon_name'].unique()

In [12]:
taxon_names.shape

(2471,)

### Primary publishing org

In [13]:
# sort out the fuzzy matching for taxon name
prim_pub_map['taxon_name'] = ''

for taxon_name in taxon_names:
    for index, row in prim_pub_map.iterrows():
        if row['Bulk Tag to']:
            fuzzy = row['Bulk Tag to']
        
            score = fuzz.token_sort_ratio(fuzzy, taxon_name)
        
            if score > 90:
                
                row['taxon_name'] = taxon_name
            

In [14]:
taxon_name_id_dict = dict(zip((taxons['taxon_name']),
                        taxons['content_id']))

In [15]:
prim_pub_map['taxon_id'] = prim_pub_map['taxon_name'].map(taxon_name_id_dict).copy()

In [16]:
def strip_prim_org(x):
    if x=='':
        return ''
    else:
        return re.sub('\"|}',"",x.split("=>")[1])

In [17]:
prim_pub_map['primary_publishing_organisation'] = prim_pub_map['primary_publishing_organisation'].fillna('').copy()

In [18]:
prim_pub_map['primary_publishing_organisation'] = prim_pub_map['primary_publishing_organisation'].\
                map(strip_prim_org)

In [19]:
prim_pub_map.head()

Unnamed: 0,primary_publishing_organisation,COUNTA of primary_publishing_organisation,Bulk Tag to,Level,taxon_name,taxon_id
0,,0,,,,
1,British Cattle Movement Service,1,,,,
2,Building Regulations Advisory Committee,1,Business regulation,Level 3 (Business),Business regulation,33bc0eed-62c7-4b0b-9a93-626c9e10c025
3,Commonwealth Scholarship Commission in the UK,1,,,,
4,Copyright Tribunal,1,Copyright,Level 3 (Crime Justice and Law),Copyright,7b2a45e2-2f20-4331-a2b6-de512b007a52


In [20]:
pub_taxon_dict = dict(zip((prim_pub_map['primary_publishing_organisation']),
                        prim_pub_map['taxon_name']))

In [21]:
unlabelled_unique_content['taxon2label_bulk'] = unlabelled_unique_content['primary_publishing_organisation'].map(pub_taxon_dict)

In [22]:
unlabelled_unique_content[unlabelled_unique_content['taxon2label_bulk'].isnull()].shape

(15161, 18)

In [23]:
unlabelled_unique_content[unlabelled_unique_content['taxon2label_bulk'].notnull()].shape

(70273, 18)

In [24]:
bulk_tags = unlabelled_unique_content[unlabelled_unique_content['taxon2label_bulk'].notnull()].copy()

In [25]:
bulk_tags['taxon2label'] = bulk_tags['taxon2label_bulk']

In [26]:
bulk_tags['prob'] = np.nan
bulk_tags['prob_cat'] = np.nan

In [27]:
bulk_tags['how_tagged'] = 'prim_org_bulk_tag'

In [28]:
bulk_tags.shape

(70273, 19)

### document type

In [29]:
doctype_taxon_dict = {'business_finance_support_scheme':'ccfc50f5-e193-4dac-9d78-50b3a8bb24c5',
'countryside_stewardship_grant':'9129d716-365b-44ba-9856-383423fe1e41',
'drug_safety_update':'51bbdf23-292a-4b74-8a66-f7db6b93b163',
'esi_fund':'2894668d-0c21-491a-9069-a271e67f6025',
'international_development_fund':'9fb30a53-70fb-4f1c-878b-0064b202d1ba',
'medical_safety_alert':'51bbdf23-292a-4b74-8a66-f7db6b93b163',
'residential_property_tribunal_decision':'0c46aba4-1986-4574-b957-734c6d104546',
'residential_property_tribunal_decision':'357110bb-cbc5-4708-9711-1b26e6c63e86'}

In [30]:
unlabelled_unique_content2 = unlabelled.drop_duplicates('content_id').copy()

In [31]:
unlabelled_unique_content2['taxon_id_bulk'] = unlabelled_unique_content2['document_type'].map(doctype_taxon_dict)

In [32]:
name_taxon_id_dict = dict(zip((taxons['content_id']),
                        taxons['taxon_name']))

In [33]:
unlabelled_unique_content2[unlabelled_unique_content2['taxon_id_bulk'].notna()]

Unnamed: 0,content_id,prob,taxon2label,base_path,description,document_type,document_type_gp,first_published_at,level1taxon,locale,primary_publishing_organisation,publishing_app,taxon_id,title,untagged_type,updated_at,prob_cat,taxon_id_bulk
10019,fdaa586a-62c8-4e94-90df-692a992308d9,6.721260203202688e-18,Administrative justice reform,/european-structural-investment-funds/sme-supp...,call to run project's enhancing the competitiv...,esi_fund,org_entities,2018-03-02 09:35:15.000,,en,,specialist-publisher,,sme support (business support): call in cumbri...,untagged,2018-03-26 15:57:52.046,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10020,89f79649-5099-4812-a401-0b2fa76927be,1.478241767199307e-16,Administrative justice reform,/european-structural-investment-funds/sme-supp...,call to run project's enhancing the competitiv...,esi_fund,org_entities,2018-03-02 09:41:11.000,,en,,specialist-publisher,,sme support: call in stoke-on-trent and staffo...,untagged,2018-03-26 15:57:31.956,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10021,2fd4d095-25de-41e9-82fa-6d40d93e91d5,1.6279733355772076e-21,Administrative justice reform,/european-structural-investment-funds/research...,call for project’s providing research and inno...,esi_fund,org_entities,2018-03-02 10:23:39.000,,en,,specialist-publisher,,research and innovation: call in new anglia (o...,untagged,2018-03-26 15:56:43.875,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10022,4f798819-f2e0-468b-b73e-caa3aad4c8a1,4.050076611864226e-21,Administrative justice reform,/european-structural-investment-funds/leeds-ci...,call to run a project to be delivered within t...,esi_fund,org_entities,2018-03-26 15:20:18.000,,en,,specialist-publisher,,leeds city region - neet programme (oc20s18p1047),untagged,2018-03-26 15:56:56.207,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10023,2b295c16-6631-41e4-ad4e-9877c6e10086,1.4117464372578674e-19,Administrative justice reform,/european-structural-investment-funds/research...,call for project’s providing research and inno...,esi_fund,org_entities,2018-03-02 10:16:27.000,,en,,specialist-publisher,,research and innovation: call in the south eas...,untagged,2018-03-26 15:56:42.068,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10024,5685c19d-9c00-4e24-80e4-77ab226cff73,1.1927940297111594e-18,Administrative justice reform,/european-structural-investment-funds/low-carb...,call to run project's supporting the shift tow...,esi_fund,org_entities,2018-03-02 08:42:09.000,,en,,specialist-publisher,,low carbon: call in stoke-on-trent and staffor...,untagged,2018-03-26 15:56:59.228,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10025,5cd5eaf4-d636-44a0-8097-7c812fe6c58e,5.661420567708403e-19,Administrative justice reform,/european-structural-investment-funds/sme-supp...,call to run a project enhancing the competitiv...,esi_fund,org_entities,2018-03-02 10:18:00.000,,en,,specialist-publisher,,sme support - access to finance: call in north...,untagged,2018-03-26 15:57:01.370,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10026,941fe7ac-9c1f-4dd3-b4bf-4be0fbf1eca0,4.799148058438732e-17,Administrative justice reform,/european-structural-investment-funds/technica...,call to run a local or national technical assi...,esi_fund,org_entities,2018-03-02 09:15:22.000,,en,,specialist-publisher,,technical assistance: call in england (oc00r18...,untagged,2018-03-26 15:57:32.454,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10027,f8ad9e86-f1f1-42e8-adfe-46ed5888b0eb,1.0482887690983224e-19,Administrative justice reform,/european-structural-investment-funds/sme-supp...,call to run a project to create and extend the...,esi_fund,org_entities,2018-03-02 09:13:07.000,,en,,specialist-publisher,,sme support - manufacturing: call in england (...,untagged,2018-03-26 15:57:50.748,<0.01,2894668d-0c21-491a-9069-a271e67f6025
10028,e55de35c-e471-426e-882a-befb3b35f733,5.371464811687127e-19,Administrative justice reform,/european-structural-investment-funds/sme-supp...,call to run project's enhancing the competitiv...,esi_fund,org_entities,2018-03-02 09:45:44.000,,en,,specialist-publisher,,sme support (high growth smes): call in sheffi...,untagged,2018-03-26 15:57:47.365,<0.01,2894668d-0c21-491a-9069-a271e67f6025


In [34]:
unlabelled_unique_content2['taxon_label_bulk_doctype'] = unlabelled_unique_content2['taxon_id_bulk'].map(name_taxon_id_dict)

In [35]:
unlabelled_unique_content2[unlabelled_unique_content2['taxon_label_bulk_doctype'].isnull()].shape

(85078, 19)

In [36]:
unlabelled_unique_content2[unlabelled_unique_content2['taxon_label_bulk_doctype'].notnull()].shape

(356, 19)

In [37]:
bulk_tags_doctype = unlabelled_unique_content2[unlabelled_unique_content2['taxon_label_bulk_doctype'].notnull()].copy()

In [38]:
bulk_tags_doctype['taxon2label'] = bulk_tags_doctype['taxon_label_bulk_doctype']

In [39]:
bulk_tags_doctype['prob'] = np.nan
bulk_tags_doctype['prob_cat'] = np.nan

In [40]:
bulk_tags_doctype['how_tagged'] = 'doctype_bulk_tag'

In [41]:
bulk_tags_doctype.shape

(356, 20)

### Add to algorithm tags

In [42]:
unlabelled['prob'] = pd.to_numeric(unlabelled['prob'])
predictions = unlabelled[unlabelled['prob'] > 0.5]

In [43]:
no_uk_economy = predictions.loc[predictions['taxon2label'] != 'UK economy'].copy()

In [44]:
no_uk_economy_comms = no_uk_economy.loc[no_uk_economy['taxon2label'] != 'Media and communications'].copy()

In [45]:
no_uk_economy_comms['how_tagged'] = 'algorithm_v2.0.0'

In [46]:
no_uk_economy_comms.shape

(88503, 18)

In [47]:
no_uk_economy_comms.content_id.nunique()

64323

In [48]:
no_uk_economy_comms.drop_duplicates(subset=['content_id', 'taxon2label']).content_id.nunique()

64323

In [50]:
print("the algorithm tagged {}  out of {} ({}%) in-scope items".format(
no_uk_economy_comms.content_id.nunique(), 
unlabelled.content_id.nunique(),
(no_uk_economy_comms.content_id.nunique())/unlabelled.content_id.nunique() * 100
))

the algorithm tagged 64323  out of 85434 (75.28969731020437%) in-scope items


In [51]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + no_uk_economy_comms.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + no_uk_economy_comms.content_id.nunique())/full_content.content_id.nunique() * 100
))

194531 items would be tagged to level 2 or lower out of 210154 (92.56592784339104%) in-scope items


In [52]:
print("There are {} items residul afer algorithm tag".format(
    unlabelled.content_id.nunique() - no_uk_economy_comms.content_id.nunique()))

There are 21111 items residul afer algorithm tag


### concatenate bulk tags to algorithm tags
These will go through rake task to remove superfluous ancesters

In [52]:
bulk_and_algorithm_tags = pd.concat([bulk_tags, bulk_tags_doctype, no_uk_economy_comms])

In [53]:
bulk_and_algorithm_tags.shape

(159132, 21)

In [54]:
bulk_and_algorithm_tags.groupby('how_tagged')['content_id'].nunique()

how_tagged
algorithm_v2.0.0     64323
doctype_bulk_tag       356
prim_org_bulk_tag    70273
Name: content_id, dtype: int64

In [55]:
bulk_and_algorithm_tags = bulk_and_algorithm_tags.drop_duplicates(subset=['content_id', 'taxon2label']).copy()

In [56]:
bulk_and_algorithm_tags.groupby('how_tagged')['content_id'].nunique()

how_tagged
algorithm_v2.0.0     59558
doctype_bulk_tag       356
prim_org_bulk_tag    70273
Name: content_id, dtype: int64

In [89]:
70273+356

70629

In [57]:
print("the algorithm and the bulk tagging made the same predictions on {} items".format(
    no_uk_economy_comms.content_id.nunique()-bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='algorithm_v2.0.0'].content_id.nunique()))

the algorithm and the bulk tagging made the same predictions on 4765 items


In [58]:
bulk_and_algorithm_tags = bulk_and_algorithm_tags.rename(index=str, columns={"taxon2label": "taxon_tag"})

In [59]:
bulk_and_algorithm_tags.shape

(150474, 21)

### get taxon id

In [60]:
bulk_and_algorithm_tags['taxon_id'] = bulk_and_algorithm_tags['taxon_tag'].map(taxon_name_id_dict)

### remove World tags

In [61]:
bulk_and_algorithm_tags = bulk_and_algorithm_tags.loc[bulk_and_algorithm_tags['taxon_tag'] != 'world_level1'].copy()

In [62]:
bulk_and_algorithm_tags.shape

(150316, 21)

In [63]:
bulk_and_algorithm_tags = bulk_and_algorithm_tags[bulk_and_algorithm_tags['taxon_id'].notnull()].copy()

In [64]:
bulk_and_algorithm_tags[['content_id', 'taxon_tag', 'taxon_id']].shape

(119960, 3)

In [65]:
bulk_and_algorithm_tags[['content_id', 'taxon_tag', 'taxon_id', 'how_tagged']].head()

Unnamed: 0,content_id,taxon_tag,taxon_id,how_tagged
5972,a113dedd-4320-4186-b1af-888437c6aedb,Environment,71d37f3a-7c8c-4128-8763-2fd5b831b9b9,prim_org_bulk_tag
5973,5d8d7f5f-7631-11e4-a3cb-005056011aef,International aid and development,9fb30a53-70fb-4f1c-878b-0064b202d1ba,prim_org_bulk_tag
5974,5e2e074d-7631-11e4-a3cb-005056011aef,Health and social care,8124ead8-8ebc-4faf-88ad-dd5cbcc92ba8,prim_org_bulk_tag
5976,5f50cd4e-7631-11e4-a3cb-005056011aef,Maritime,4a9ab4d7-0d03-4c61-9e16-47787cbf53cd,prim_org_bulk_tag
5977,b60d9e65-dc51-4a5b-8fbc-b6b15652b6ba,World,91b8ef20-74e7-4552-880c-50e6d73c2ff9,prim_org_bulk_tag


In [66]:
bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='algorithm_v2.0.0'].head()

Unnamed: 0,base_path,content_id,description,document_type,document_type_gp,first_published_at,how_tagged,level1taxon,locale,primary_publishing_organisation,...,prob_cat,publishing_app,taxon_tag,taxon2label_bulk,taxon_id,taxon_id_bulk,taxon_label_bulk_doctype,title,untagged_type,updated_at
13121,/government/news/transformation-of-birmingham-...,51f13e1a-ca7c-4b5d-9b0c-123a6e0d9480,after an investment of £8.12m by hmcts birming...,press_release,news_and_announcements,2018-03-01 11:00:18.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,...,>=0.8,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,,,transformation of birmingham civil justice cen...,untagged,2018-03-12 16:04:23.928
16867,/government/news/hm-courts-tribunals-service-l...,ad7af755-4ae4-4644-a957-a949bf1cccec,hmcts will today (18 january 2018) launch a te...,press_release,news_and_announcements,2018-01-19 10:47:26.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,...,0.5-0.59,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,,,hm courts & tribunals service launches project...,untagged,2018-03-12 16:09:18.338
27701,/government/news/civilcrime-news-more-dates-fo...,2d6cf199-8bac-4dca-9446-b97f7b5d05e9,new dates announced in hm courts and tribunals...,news_story,news_and_announcements,2018-01-10 15:43:00.000,algorithm_v2.0.0,,en,Legal Aid Agency,...,0.7-0.79,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,,,civil/crime news: more dates for hmcts roadshows,untagged,2018-01-16 13:33:44.968
47881,/government/publications/appeal-a-judicial-rev...,04ff3b09-ce00-48a0-aa05-7c99e03ddfdc,use this index when submitting a 'core bundle'...,form,service,2016-08-31 23:00:00.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,...,>=0.8,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,,,appeal a judicial review from the administrati...,untagged,2018-03-21 15:11:25.946
48071,/government/publications/appeal-an-administrat...,cd6e19e9-f652-49be-b665-43aaeab48cb7,use this index when submitting a 'core bundle'...,form,service,2016-08-31 23:00:00.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,...,0.6-0.69,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,,,appeal an administrative judicial review or fr...,untagged,2018-03-21 15:11:46.329


In [67]:
bulk_and_algorithm_tags.groupby(['how_tagged']).size()

how_tagged
algorithm_v2.0.0     79687
doctype_bulk_tag       356
prim_org_bulk_tag    39917
dtype: int64

In [68]:
bulk_and_algorithm_tags[['content_id', 'taxon_tag', 'taxon_id', 'how_tagged']].to_csv(os.path.join(DATADIR, 'bulk_and_algorithm_tags.csv.gz'),compression='gzip',index=False)

### how much content is tagged now?

In [72]:
print("{} out of {} ({}%) unlabelled content items were tagged either by bulk or by algorithm".format(
    bulk_and_algorithm_tags.content_id.nunique(),
    unlabelled.content_id.nunique(),
    bulk_and_algorithm_tags.content_id.nunique()/unlabelled.content_id.nunique()*100))

70585 out of 85434 (82.61933188192054%) unlabelled content items were tagged either by bulk or by algorithm


In [73]:
print("There would be a residual {} items that are hard-to-tag".format(
    unlabelled.content_id.nunique() - bulk_and_algorithm_tags.content_id.nunique()))

There would be a residual 14849 items that are hard-to-tag


In [74]:
bulk_and_algorithm_tags.groupby('how_tagged')['content_id'].nunique()

how_tagged
algorithm_v2.0.0     59498
doctype_bulk_tag       356
prim_org_bulk_tag    39917
Name: content_id, dtype: int64

In [90]:
39917+356

40273

In [75]:
print("the total number of items tagged to level 2 or lower would be {}".format(
labelled_level2.content_id.nunique() + bulk_and_algorithm_tags.content_id.nunique() ))

the total number of items tagged to level 2 or lower would be 200793


In [76]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + bulk_and_algorithm_tags.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + bulk_and_algorithm_tags.content_id.nunique())/full_content.content_id.nunique() * 100
))

200793 items would be tagged to level 2 or lower out of 210154 (95.54564747756407%) in-scope items


In [77]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='algorithm_v2.0.0'].content_id.nunique(), 
full_content.content_id.nunique(),
bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='algorithm_v2.0.0'].content_id.nunique()/full_content.content_id.nunique() * 100
))

the ML would have tagged 59498  out of 210154 (28.31161909837548%) in-scope items


In [93]:
print("the bulk tagging would have tagged {}  out of {} ({}%) in-scope items".format(
bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='prim_org_bulk_tag'].content_id.nunique() + bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='doctype_bulk_tag'].content_id.nunique(), 
full_content.content_id.nunique(),
(bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='prim_org_bulk_tag'].content_id.nunique() + bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='doctype_bulk_tag'].content_id.nunique())/full_content.content_id.nunique() * 100
))

the bulk tagging would have tagged 40273  out of 210154 (19.16356576605727%) in-scope items


In [94]:
print("the bulk tagging would have tagged {}  out of {} ({}%) in-scope items".format(
bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='prim_org_bulk_tag'].content_id.nunique() + bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='doctype_bulk_tag'].content_id.nunique(), 
unlabelled.content_id.nunique(),
(bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='prim_org_bulk_tag'].content_id.nunique() + bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='doctype_bulk_tag'].content_id.nunique())/unlabelled.content_id.nunique() * 100
))

the bulk tagging would have tagged 40273  out of 85434 (47.139312217618276%) in-scope items


In [97]:
both_methods = no_uk_economy_comms.content_id.nunique()-bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='algorithm_v2.0.0'].content_id.nunique()

In [99]:
print("the algorithm and the bulk tagging made the same predictions on {} items".format(both_methods))

the algorithm and the bulk tagging made the same predictions on 4825 items


In [100]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='algorithm_v2.0.0'].content_id.nunique() + both_methods, 
full_content.content_id.nunique(),
(bulk_and_algorithm_tags[bulk_and_algorithm_tags['how_tagged']=='algorithm_v2.0.0'].content_id.nunique()+ both_methods)/full_content.content_id.nunique() * 100
))

the ML would have tagged 64323  out of 210154 (30.60755446006262%) in-scope items


### hard to tag

In [101]:
bulk_and_algorithm_tags_reduced = bulk_and_algorithm_tags[['content_id', 'base_path']].copy()

In [81]:
hard_to_tag = pd.merge(
    left=bulk_and_algorithm_tags_reduced,
    right=unlabelled,
    on='content_id',
    how='outer',
    indicator=True
)

In [82]:
hard_to_tag.groupby('_merge').size()

_merge
left_only            0
right_only     3379536
both          26590032
dtype: int64

In [102]:
hard_to_tag[hard_to_tag['_merge']=='right_only']

Unnamed: 0,content_id,base_path_x,prob,taxon2label,base_path_y,description,document_type,document_type_gp,first_published_at,level1taxon,locale,primary_publishing_organisation,publishing_app,taxon_id,title,untagged_type,updated_at,prob_cat,_merge
26590032,60320607-7631-11e4-a3cb-005056011aef,,7.039967e-10,Administrative justice reform,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590033,60320607-7631-11e4-a3cb-005056011aef,,1.144122e-08,"Adoption, fostering and surrogacy",/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590034,60320607-7631-11e4-a3cb-005056011aef,,7.909353e-07,Afghanistan,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590035,60320607-7631-11e4-a3cb-005056011aef,,6.987006e-06,Armed Forces Covenant,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590036,60320607-7631-11e4-a3cb-005056011aef,,6.374691e-10,Armed forces,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590037,60320607-7631-11e4-a3cb-005056011aef,,3.227462e-06,Armed forces and Ministry of Defence reform,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590038,60320607-7631-11e4-a3cb-005056011aef,,5.849790e-07,Armed forces support for activities in the UK,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590039,60320607-7631-11e4-a3cb-005056011aef,,5.486633e-02,Arts and culture,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,0.01-0.09,right_only
26590040,60320607-7631-11e4-a3cb-005056011aef,,1.378083e-08,Assessing environmental impact,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only
26590041,60320607-7631-11e4-a3cb-005056011aef,,2.225148e-08,Attorney General guidance to the legal profession,/government/news/capacity-building-for-the-mac...,british embassy launches project support for t...,world_news_story,news_and_announcements,2014-11-17 19:13:56.000,,en,,whitehall,,capacity building for the macedonian security ...,untagged,2017-08-24 16:27:39.284,<0.01,right_only


In [103]:
bulk_and_algorithm_tags[bulk_and_algorithm_tags['document_type']=='guidance'].content_id.nunique()

5671

In [104]:
hard_to_tag = hard_to_tag[hard_to_tag['_merge']=='right_only']

In [105]:
hard_to_tag[hard_to_tag['document_type']=='guidance'].content_id.nunique()

635

In [106]:
hard_to_tag.to_csv(os.path.join(DATADIR, 'hard_to_bulk_algorithm_tag.csv.gz'),compression='gzip',index=False)