In [1]:
import pandas as pd

# custom
# when imported sets a bunch of things in your spacy pipeline
import get_govorg_list
import govorg_matcher

# spacy
import spacy
from spacy_lookup import Entity

# Import the English language class
from spacy.lang.en import English

# Make use of widescreen: inject a CSS override so the notebook container
# spans the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` has been deprecated since
# IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# trial with a manageable number of GOV.UK pages
# (passed as `nrows` to pd.read_csv when the labelled data is loaded,
# so only the first content_n rows of the CSV are read)
content_n = 1000

In [2]:
# Labelled GOV.UK content, available from GOV.UK data scientists.
# The data has been pre-processed for taxonomy work, which loses useful
# information such as capital letters; the pre-processing pipeline will
# need adjusting.
labelled_columns = [
    "base_path",
    "content_id",
    "description",
    "locale",
    "title",
    "body",
    "combined_text",
]

# Only the first content_n rows are read to keep the trial manageable.
df = pd.read_csv(
    "data/11-02-19/labelled.csv",
    usecols=labelled_columns,
    nrows=content_n,
)

In [3]:
# spot-check the matcher on the "body" text of a single page (row label 3);
# the cell output is the list of organisation names it found
govorg_matcher.text_gov_org_match(df.at[3,"body"])

['The Charity Commission']

In [4]:
# https://stackoverflow.com/questions/31674557/how-to-append-rows-in-a-pandas-dataframe-in-a-for-loop
# Collect one [base_path, content_id, matched organisations] record per page
# for the first 10 pages, then build the frame from the records in one go.
cols = ['base_path','content_id', 'gov_org']
first_pages = df.head(10)
lst = [
    [base_path, content_id, govorg_matcher.text_gov_org_match(combined_text)]
    for base_path, content_id, combined_text in zip(
        first_pages['base_path'],
        first_pages['content_id'],
        first_pages['combined_text'],
    )
]
df1 = pd.DataFrame(lst, columns=cols)
df1

Unnamed: 0,base_path,content_id,gov_org
0,/government/publications/list-of-psychologists...,04a0cc0d-0b9f-45ad-bf57-7c54cbab9df9,[]
1,/government/news/charity-commission-names-furt...,5fa49c52-7631-11e4-a3cb-005056011aef,[The Charity Commission]
2,/government/publications/trust-and-confidence-...,d0341424-12a1-4b4c-9045-2e74ba17f2d5,[The Charity Commission]
3,/government/speeches/william-shawcross-speech-...,9245dfca-4210-41d9-9ffd-7fcc35dc1642,[The Charity Commission]
4,/government/statistics/crime-statistics-focus-...,5fec046a-7631-11e4-a3cb-005056011aef,[]
5,/government/news/britain-honours-its-holocaust...,5b12e7a3-3db7-4710-862f-0d54ec6117b6,[Foreign & Commonwealth Office]
6,/government/publications/esf-funding-allocated...,5f5167fc-7631-11e4-a3cb-005056011aef,"[Department for Work and Pensions, Skills Fund..."
7,/government/publications/charities-holding-mov...,5fe33d80-7631-11e4-a3cb-005056011aef,[]
8,/government/statistics/english-indices-of-depr...,e38fc3a7-1b0f-46d8-b19e-69b6a3c38809,[]
9,/government/news/dcms-improves-efficiency-and-...,5d33a69f-7631-11e4-a3cb-005056011aef,"[UK Film Council, British Film Institute, UK S..."


In [5]:
# https://stackoverflow.com/questions/27263805/pandas-when-cell-contents-are-lists-create-a-row-for-each-element-in-the-list
# We want one row per (page, organisation) pair, as each organisation needs
# its own unique id later on.
#
# DataFrame.explode does this directly: every element of the 'gov_org' list
# gets its own row, the original index label is repeated for pages that match
# several organisations, and a page with an empty list is kept as a single
# row holding NaN. This replaces building a pd.Series per row with apply and
# then stack/join, which is slower and, on pandas 1.x, emits an empty-Series
# dtype DeprecationWarning for every page without a match.
df1 = (
    df1
    .explode('gov_org')
    .rename(columns={'gov_org': 'gov_org_name'})
)

df1

Unnamed: 0,base_path,content_id,gov_org_name
0,/government/publications/list-of-psychologists...,04a0cc0d-0b9f-45ad-bf57-7c54cbab9df9,
1,/government/news/charity-commission-names-furt...,5fa49c52-7631-11e4-a3cb-005056011aef,The Charity Commission
2,/government/publications/trust-and-confidence-...,d0341424-12a1-4b4c-9045-2e74ba17f2d5,The Charity Commission
3,/government/speeches/william-shawcross-speech-...,9245dfca-4210-41d9-9ffd-7fcc35dc1642,The Charity Commission
4,/government/statistics/crime-statistics-focus-...,5fec046a-7631-11e4-a3cb-005056011aef,
5,/government/news/britain-honours-its-holocaust...,5b12e7a3-3db7-4710-862f-0d54ec6117b6,Foreign & Commonwealth Office
6,/government/publications/esf-funding-allocated...,5f5167fc-7631-11e4-a3cb-005056011aef,Department for Work and Pensions
6,/government/publications/esf-funding-allocated...,5f5167fc-7631-11e4-a3cb-005056011aef,Skills Funding Agency
6,/government/publications/esf-funding-allocated...,5f5167fc-7631-11e4-a3cb-005056011aef,National Offender Management Service
7,/government/publications/charities-holding-mov...,5fe33d80-7631-11e4-a3cb-005056011aef,


In [6]:
# drop empty edges: remove every row that holds a missing value
# (pages with no matched organisation have NaN in gov_org_name);
# df1 is modified in place
df1.dropna(axis="index", how="any", inplace=True)

# need id for gov_org
# call api, get dictionary to lookup and add id to it

In [8]:
# lookup used to translate gov_org_name values into org_id values
# NOTE(review): expected to be fetched from an API by get_govorg_list — confirm
# in that module, and whether the result should be cached for re-runs
govorg_dict = get_govorg_list.get_orgid_dict()

In [9]:
# add an org_id column by looking each gov_org_name up in govorg_dict
# (names that are not keys of the dict come back as NaN)
df1 = df1.assign(org_id=df1['gov_org_name'].map(govorg_dict))

df1.head()

Unnamed: 0,base_path,content_id,gov_org_name,org_id
1,/government/news/charity-commission-names-furt...,5fa49c52-7631-11e4-a3cb-005056011aef,The Charity Commission,D98
2,/government/publications/trust-and-confidence-...,d0341424-12a1-4b4c-9045-2e74ba17f2d5,The Charity Commission,D98
3,/government/speeches/william-shawcross-speech-...,9245dfca-4210-41d9-9ffd-7fcc35dc1642,The Charity Commission,D98
5,/government/news/britain-honours-its-holocaust...,5b12e7a3-3db7-4710-862f-0d54ec6117b6,Foreign & Commonwealth Office,D13
6,/government/publications/esf-funding-allocated...,5f5167fc-7631-11e4-a3cb-005056011aef,Department for Work and Pensions,D10
6,/government/publications/esf-funding-allocated...,5f5167fc-7631-11e4-a3cb-005056011aef,Skills Funding Agency,EA86
6,/government/publications/esf-funding-allocated...,5f5167fc-7631-11e4-a3cb-005056011aef,National Offender Management Service,EA70
9,/government/news/dcms-improves-efficiency-and-...,5d33a69f-7631-11e4-a3cb-005056011aef,UK Film Council,OT850
9,/government/news/dcms-improves-efficiency-and-...,5d33a69f-7631-11e4-a3cb-005056011aef,British Film Institute,PB189
9,/government/news/dcms-improves-efficiency-and-...,5d33a69f-7631-11e4-a3cb-005056011aef,UK Sport,PB185


In [None]:
# create edge list