In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import functools
%matplotlib inline

# Untagged data

These data were written out by the clean_content.py script for the rows where the taxons column was empty. 
- Here we assume the taxon column was empty because the content item has not been tagged.

In [2]:
# Read in the untagged content (content items with no taxons) so it can be described below
untagged = pd.read_csv('../../data/untagged_content.csv')

In [3]:
# Report the total number of rows and the number of distinct content_id values
print("There are {} rows in the untagged content data".format(len(untagged)))
print("There are {} unique content items in the untagged content data".format(
    untagged['content_id'].nunique()))

There are 57337 rows in the untagged content data
There are 57123 unique content items in the untagged content data


In [4]:
# Show the column labels of the untagged data
untagged.columns

Index(['Unnamed: 0', 'base_path', 'content_id', 'description', 'details',
       'document_type', 'first_published_at', 'locale',
       'primary_publishing_organisation', 'publishing_app', 'taxons', 'title',
       'body'],
      dtype='object')

In [5]:
# Parse the first_published_at strings into timestamps for the time series analyses,
# printing the type of the first element before and after the conversion.
print(type(untagged['first_published_at'].iloc[0]))
untagged['first_published_at'] = pd.to_datetime(untagged.first_published_at)
print(type(untagged['first_published_at'].iloc[0]))

<class 'str'>
<class 'pandas._libs.tslib.Timestamp'>


In [6]:
# Use the publication timestamp as the row index of the untagged data (for plots).
# The first_published_at column itself is kept; only the index is replaced.
untagged.index = untagged['first_published_at'] 

# Taxon data

The taxons data has one row for each taxon, with columns for the taxon_id/taxon title at each level. So, for example, if an item has only been tagged to level1 then level2 and subsequent levels will be missing. If an item was tagged to level3, the level2 and level1 columns have been filled recursively. 

A taxon in taxons is identified through content_id

In [7]:
# Read in the taxon file, which was cleaned from the raw data by clean_taxons.py
taxons = pd.read_csv('../../data/clean_taxons.csv')

In [8]:
# Show the column labels of the taxon data
taxons.columns

Index(['Unnamed: 0', 'base_path', 'content_id', 'taxon_name', 'level1',
       'level2tax_id', 'level3tax_id', 'level4tax_id', 'level1taxon',
       'level2taxon', 'level3taxon', 'level4taxon'],
      dtype='object')

In [9]:
# Keep the identifying columns plus the taxon title at each level; copy so that
# later column assignments act on an independent frame.
taxons = taxons.loc[:, [
    'base_path',
    'content_id',
    'taxon_name',
    'level1taxon',
    'level2taxon',
    'level3taxon',
    'level4taxon',
]].copy()

#### This section needs to be moved to clean_taxons.py

In [10]:
# For top taxons (level1), make sure taxon_name appears in the level1taxon column
# instead of NaN: keep existing values and take taxon_name only where one is missing.
taxons['level1taxon'] = taxons['level1taxon'].where(
    taxons['level1taxon'].notnull(),
    taxons['taxon_name'],
)

In [11]:
def conjunction(*conditions):
    """Combine several boolean series into one with an element-wise logical AND.

    The conditions are folded left to right with ``np.logical_and``. Passing a
    single condition returns it unchanged; passing none raises ``TypeError``
    (from ``functools.reduce`` on an empty sequence).
    """
    def _both(left, right):
        # Element-wise AND of the running result and the next condition
        return np.logical_and(left, right)

    return functools.reduce(_both, conditions)

In [12]:
# Work on a copy of taxons while the level columns are filled in
taxonslevels = taxons.copy()
# Condition: the taxon's name differs from its level1 title and no level2 title is recorded
cond = conjunction(
    taxonslevels['level1taxon'] != taxonslevels['taxon_name'],
    taxonslevels['level2taxon'].isna(),
)
# Where the condition holds, level2taxon becomes the taxon's own name;
# everywhere else the existing value is kept.
taxonslevels['level2taxon'] = np.where(
    cond,
    taxonslevels['taxon_name'],
    taxonslevels['level2taxon'],
)

In [13]:
# Set level3taxon to the taxon's own name where a level2 title is recorded, no level3
# title is recorded, and the taxon's name differs from its level2 title.
# NOTE(review): the condition reads `taxons` while the result is written to
# `taxonslevels`, whose level2taxon was already modified -- confirm this mix is intended.
cond = conjunction(
    taxons['level2taxon'].notnull(),
    taxons['level3taxon'].isna(),
    taxons['level2taxon'] != taxons['taxon_name'],
)
taxonslevels['level3taxon'] = np.where(
    cond,
    taxonslevels['taxon_name'],
    taxonslevels['level3taxon'],
)

In [14]:
# Set level4taxon to the taxon's own name where a level3 title is recorded, no level4
# title is recorded, and the taxon's name differs from both its level2 and level3 titles.
# NOTE(review): the condition reads `taxons` while the result is written to
# `taxonslevels` -- confirm this mix is intended.
cond = conjunction(
    taxons['level3taxon'].notnull(),
    taxons['level4taxon'].isna(),
    taxons['level2taxon'] != taxons['taxon_name'],
    taxons['level3taxon'] != taxons['taxon_name'],
)
taxonslevels['level4taxon'] = np.where(
    cond,
    taxonslevels['taxon_name'],
    taxonslevels['level4taxon'],
)

In [15]:
# Create a new, initially empty column for the last taxon level
taxonslevels['level5taxon'] = np.nan
# Set level5taxon to the taxon's own name where a level4 title is recorded and the
# taxon's name differs from its level2, level3 and level4 titles.
# NOTE(review): the condition reads `taxons` while the result is written to
# `taxonslevels` -- confirm this mix is intended.
cond = conjunction(
    taxons['level4taxon'].notnull(),
    taxons['level2taxon'] != taxons['taxon_name'],
    taxons['level3taxon'] != taxons['taxon_name'],
    taxons['level4taxon'] != taxons['taxon_name'],
)
taxonslevels['level5taxon'] = np.where(
    cond,
    taxonslevels['taxon_name'],
    taxonslevels['level5taxon'],
)

In [16]:
# Copy the working frame back to taxons, so taxons now carries the filled
# level2-level4 columns and the new level5taxon column
taxons = taxonslevels.copy()


# Content data

These data were created in clean_content.py so that each row represents a single content-taxon pair. There can be multiple rows for a content item (content_id) if it has been tagged to multiple taxons (taxon_id).  

In [17]:
# Read in the gzip-compressed content items file, which was cleaned from the raw data
# by clean_content.py

content = pd.read_csv('../../data/clean_content.csv.gz', compression='gzip')

In [18]:
# Show the column labels of the content data
content.columns

Index(['Unnamed: 0', 'base_path', 'content_id', 'description', 'details',
       'document_type', 'first_published_at', 'locale',
       'primary_publishing_organisation', 'publishing_app', 'title', 'body',
       'combined_text', 'variable', 'taxon_id'],
      dtype='object')

In [19]:
# (rows, columns) of the content data
content.shape

(335615, 15)

In [20]:
# Number of distinct content_id values in the content data
content.content_id.nunique()

140103

## All content with no filtering by taxon

In [21]:
# Join each content row to the taxon it is tagged to. The content's taxon_id is matched
# against the taxon's own content_id. An outer join keeps unmatched rows from both
# sides, and the _merge indicator column records the match type for later filtering.
allcontent_taxons = content.merge(
    taxons,
    how='outer',
    left_on='taxon_id',
    right_on='content_id',
    indicator=True,
)

In [22]:
# (rows, columns) of the outer-joined content/taxon data
allcontent_taxons.shape

(336967, 24)

In [23]:
# Tidy column names: drop the merge suffix from the content-side base_path and content_id
allcontent_taxons = allcontent_taxons.rename(
    columns={
        'base_path_x': 'base_path',
        'content_id_x': 'content_id',
    }
)

In [24]:
# Tally the rows by merge outcome using the indicator column added by the merge
print(allcontent_taxons['_merge'].value_counts())

both          232526
left_only     103089
right_only      1352
Name: _merge, dtype: int64


In [25]:
# Number of distinct content items among the rows that found no matching taxon
allcontent_taxons.loc[allcontent_taxons['_merge'] == 'left_only', 'content_id'].nunique()

54370

In [26]:
# Rows that exist only on the taxon side of the merge, i.e. no content row matched them
empty_taxons = allcontent_taxons.loc[allcontent_taxons['_merge'] == 'right_only']
empty_taxons.shape

(1352, 24)

#### All content with no filtering by taxon

In [27]:
# Keep only the intersection of the join: rows matched on both sides
allcontent_taxons = allcontent_taxons.loc[allcontent_taxons['_merge'] == 'both']

In [28]:
# (rows, columns) after keeping only the matched rows
allcontent_taxons.shape

(232526, 24)

In [29]:
# Remove rows that repeat an earlier content_id/taxon_id pair (the first occurrence is kept)
allcontent_taxons = allcontent_taxons.drop_duplicates(['content_id', 'taxon_id'])

In [30]:
# (rows, columns) after dropping duplicate content_id/taxon_id pairs
allcontent_taxons.shape

(232149, 24)

In [31]:
# Number of distinct content_id values left in the matched, deduplicated data
allcontent_taxons.content_id.nunique()

128577

## Filter by taxon to exclude specific taxons from prediction activities

Current approach: Take out World and Corporate top taxons   
Must consider that the data which we will predict on needs to come from the same population as the training data, and it is hard to filter the unlabelled data to remove World & Corporate (unless they are perfectly predicted by a meta variable such as document type). It may be safer to keep them in the training data, predict on all data and act differently if World/Corporate is predicted?

In [32]:
# (rows, columns) of the taxon data before any top-level taxons are excluded
taxons.shape

(4530, 8)

In [33]:
# Exclude every taxon whose level1 title is 'World', then report the remaining shape
taxons = taxons.loc[taxons['level1taxon'] != 'World']
print("Taxons shape after deleting 'World' top taxons: {}".format(taxons.shape))

Taxons shape after deleting 'World' top taxons: (2028, 8)


In [34]:
# Exclude every taxon whose level1 title is 'Corporate information', then report the shape
taxons = taxons.loc[taxons['level1taxon'] != 'Corporate information']
print("Taxons shape after deleting 'corporate information' top taxons: {}".format(taxons.shape))

Taxons shape after deleting 'corporate information' top taxons: (2027, 8)


Is it possible that corporate information has already been excluded from the taxons file? Need to re-consider this approach

In [61]:
# Join content to the filtered taxons: the content's taxon_id is matched against the
# taxon's own content_id. The outer join keeps unmatched rows from both sides and the
# _merge indicator column records which side(s) each row came from.
content_taxons = content.merge(
    taxons,
    how='outer',
    left_on='taxon_id',
    right_on='content_id',
    indicator=True,
)

In [62]:
# Tally the merge outcomes once, then report each one by its label.
# The counts are looked up with .loc['both' / 'left_only' / 'right_only'] rather than
# integer keys: the Series is labelled by merge category, so what an integer key
# resolves to depends on the pandas version and on the sort order of the counts.
merge_counts = content_taxons['_merge'].value_counts()
print(merge_counts)
print("There are {} tagged content items/taxon combinations with a matching taxon"
      .format(merge_counts.loc['both']))
print("There are {} content items/taxon combinations without a matching taxon. Are these untagged content?"
      .format(merge_counts.loc['left_only']))
print("There are {} /taxons with nothing tagged to them"
      .format(merge_counts.loc['right_only']))


both          229461
left_only     106154
right_only       105
Name: _merge, dtype: int64
There are 229461 tagged content items/taxon combinations with a matching taxon
There are 106154 content items/taxon combinations without a matching taxon. Are these untagged content?
There are 105 /taxons with nothing tagged to them


In [63]:
# Taxons (after the World/Corporate filter) that no content row matched: keep only the
# taxon-side columns of the right_only rows. Rows and columns are selected in a single
# .loc call and copied so the result is an independent frame. The stray
# `content_old_taxons` alias from the original chained assignment is dropped here; that
# name is assigned to the left_only rows in the following cell.
empty_taxons_notworld = content_taxons.loc[
    content_taxons['_merge'] == 'right_only',
    ['base_path_y', 'content_id_y',
     'taxon_name', 'level1taxon', 'level2taxon', 'level3taxon',
     'level4taxon', 'level5taxon']
].copy()

### Explore the left_only content

Need to add this to untagged data...

In [80]:
# Content rows whose taxon_id found no match in the filtered taxon data: keep the
# content-side descriptive columns of the left_only rows. Rows and columns are selected
# in a single .loc call and copied, so later in-place changes to this frame act on an
# independent object rather than on a slice of content_taxons.
content_old_taxons = content_taxons.loc[
    content_taxons['_merge'] == 'left_only',
    ['base_path_x', 'content_id_x', 'document_type', 'first_published_at', 'locale',
     'primary_publishing_organisation', 'publishing_app', 'title', 'taxon_id']
].copy()

In [81]:
# Display the left_only content rows for inspection
content_old_taxons

Unnamed: 0,base_path_x,content_id_x,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,taxon_id
0,/vehicle-tax,fa748fae-3de4-4266-ae85-0797ada3f40c,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,tax your vehicle,948b6dd4-45b3-45ab-a5c6-5dbce75542a6
1,/check-vehicle-tax,0889f128-e479-465f-b3e1-a3db6a3879cf,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,check if a vehicle is taxed,948b6dd4-45b3-45ab-a5c6-5dbce75542a6
2,/check-mot-history,ad5110e0-fa62-49d3-923f-d50101f12014,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,check the mot history of a vehicle,948b6dd4-45b3-45ab-a5c6-5dbce75542a6
3,/check-mot-status,dc57162b-59f4-4d0f-9b83-a67f74ffccf5,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,check the mot status of a vehicle,948b6dd4-45b3-45ab-a5c6-5dbce75542a6
4,/vehicle-tax-refund,fff88e3b-ae66-43e4-afd0-6fc1f227b452,answer,2016-02-29T09:24:10.000+00:00,en,,publisher,cancel your vehicle tax and get a refund,948b6dd4-45b3-45ab-a5c6-5dbce75542a6
135,/view-driving-licence,bee455d5-5a4f-440a-88be-eb65ae8fde7d,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,view or share your driving licence information,3d3e32a7-b847-48d4-a694-04092971544a
136,/dvla-change-address,d857dd97-fd22-483d-b595-bcb740a5f95e,answer,2016-02-29T09:24:10.000+00:00,en,,publisher,tell dvla you’ve changed address,3d3e32a7-b847-48d4-a694-04092971544a
137,/renew-driving-licence,da48bba8-7358-4e54-a89e-c618a5265448,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,renew your driving licence,3d3e32a7-b847-48d4-a694-04092971544a
138,/apply-online-to-replace-a-driving-licence,cfd02357-baea-4006-b537-408220c5a699,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,replace a lost stolen damaged or destroyed dri...,3d3e32a7-b847-48d4-a694-04092971544a
139,/apply-first-provisional-driving-licence,f725a60e-a666-4269-82b0-946ecfb84b7c,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,apply for your first provisional driving licence,3d3e32a7-b847-48d4-a694-04092971544a


In [82]:
# Rename the content-side columns to clearer names. The result is reassigned instead of
# renaming in place, so the frame is not mutated through a filtered selection. The
# former 'base_path_y' mapping is omitted because that column is not among the columns
# selected into content_old_taxons, so it never had any effect.
content_old_taxons = content_old_taxons.rename(
    columns={
        'base_path_x': 'content_base_path',
        'content_id_x': 'content_id',
    }
)

In [40]:
# Report how many distinct taxon_ids occur in the unmatched rows and how many rows there are
print(
    "There are {} taxons represented in the {} content item/taxon combinations which have no corresponding taxon in the taxon data".format(
        content_old_taxons['taxon_id'].nunique(),
        len(content_old_taxons),
    )
)

There are 2010 taxons represented in the 106154 content item/taxon combinations which have no corresponding taxon in the taxon data


In [41]:
# Count the unmatched rows whose taxon_id is itself missing
print(
    "There are {} content items/taxon combinations with missing taxon because these were removed during taxon_clean.py".format(
        content_old_taxons['taxon_id'].isnull().sum()
    )
)

There are 0 content items/taxon combinations with missing taxon because these were removed during taxon_clean.py


In [42]:
#save out for devs to check reason these do not have a match in taxon file
# NOTE(review): this disabled export refers to `content_only_taxons`, which is not defined
# in the visible notebook; the left_only rows are held in `content_old_taxons`.
#content_only_taxons.to_csv('../../data/content_with_taxonid_no_matching_taxon.csv', 
                           #index=False)

In [43]:
# Disabled export of the distinct unmatched taxon_ids.
# NOTE(review): `content_only_taxons` is not defined in the visible notebook; the
# left_only rows are held in `content_old_taxons`.
# np.savetxt('../../data/taxonid_no_matching_taxon.gz', 
#            content_only_taxons.taxon_id.unique(),fmt='%5s', delimiter=',')

Devs did some spot checks on these and some of these taxons were not part of the topic taxonomy so did not have a match in the topic taxonomy file. Others are in the World branch of the taxonomy.

Need to add these to the untagged population.


## Tidy the content-taxon df

In [44]:
# (rows, columns) of the outer-joined content/filtered-taxon data
content_taxons.shape

(335720, 24)

#### tidy columns

In [45]:
# Show the column labels after the merge (overlapping names carry _x / _y suffixes)
content_taxons.columns

Index(['Unnamed: 0', 'base_path_x', 'content_id_x', 'description', 'details',
       'document_type', 'first_published_at', 'locale',
       'primary_publishing_organisation', 'publishing_app', 'title', 'body',
       'combined_text', 'variable', 'taxon_id', 'base_path_y', 'content_id_y',
       'taxon_name', 'level1taxon', 'level2taxon', 'level3taxon',
       'level4taxon', 'level5taxon', '_merge'],
      dtype='object')

In [46]:
# Drop the leftover CSV index column, the 'variable' column and the taxon-side
# base_path/content_id columns
content_taxons = content_taxons.drop(
    columns=['Unnamed: 0', 'variable', 'base_path_y', 'content_id_y']
)

In [47]:
# Drop the merge suffix from the content-side base_path and content_id columns
content_taxons = content_taxons.rename(
    columns={
        'base_path_x': 'base_path',
        'content_id_x': 'content_id',
    }
)

## Filter content-taxons data:
- remove duplicates
- remove mismatches from merge

In [48]:
# Baseline counts before filtering: total rows and distinct content_id values
print("There are {} rows in the data before filtering".format(len(content_taxons)))
print("There are {} unique content items in the data before filtering".format(
    content_taxons['content_id'].nunique()))


There are 335720 rows in the data before filtering
There are 140103 unique content items in the data before filtering


In [49]:
# Keep only the rows that matched on both the content and the taxon side of the merge
content_taxons_filtered = content_taxons.loc[content_taxons['_merge'] == 'both']

# Report what remains and what was dropped, in rows and in distinct content items
print("There are {} rows in the taxon-level data after filtering out mismatches".format(
    len(content_taxons_filtered)))
print("There are {} unique content items in the taxon-level data after filtering out mismatches".format(
    content_taxons_filtered['content_id'].nunique()))
print("There were {} rows dropped because of mismatching".format(
    len(content_taxons) - len(content_taxons_filtered)))
print("There were {} unique content items dropped because of mismatching".format(
    content_taxons['content_id'].nunique() - content_taxons_filtered['content_id'].nunique()))

There are 229461 rows in the taxon-level data after filtering out mismatches
There are 127320 unique content items in the taxon-level data after filtering out mismatches
There were 106259 rows dropped because of mismatching
There were 12783 unique content items dropped because of mismatching


In [50]:
# Count rows that repeat an earlier content_id/taxon_id pair, before and after the
# mismatch filter (summing the boolean duplicate flags gives the number of such rows)
print(
    "Before removing mismatches, there were {} duplicates content items, both with matching content_id "
    "and taxon_id".format(
        content_taxons.duplicated(['content_id', 'taxon_id']).sum()
    )
)
print(
    "After removing mismatches, there were {} duplicates content items, both with matching "
    "content_id and taxon_id".format(
        content_taxons_filtered.duplicated(['content_id', 'taxon_id']).sum()
    )
)


Before removing mismatches, there were 956 duplicates content items, both with matching content_id and taxon_id
After removing mismatches, there were 377 duplicates content items, both with matching content_id and taxon_id


In [51]:
# Drop rows that repeat an earlier content_id/taxon_id pair (the first occurrence is kept)
content_taxons_dedup = content_taxons_filtered.drop_duplicates(['content_id', 'taxon_id'])

In [52]:
# Report how many rows and distinct content items the de-duplication removed
print("There were {} additional rows dropped due to duplicate content_id/taxon_id combination".format(
    len(content_taxons_filtered) - len(content_taxons_dedup)))
print("There were {} additional content items dropped due to duplicate content_id/taxon_id combination".format(
    content_taxons_filtered['content_id'].nunique() - content_taxons_dedup['content_id'].nunique()))

There were 377 additional rows dropped due to duplicate content_id/taxon_id combination
There were 0 additional content items dropped due to duplicate content_id/taxon_id combination


In [53]:
# NOTE(review): the disabled assertion below expects (2029084, 18), which does not match
# the shape recorded in this cell's output, (229084, 20) -- update or remove it.
#assert content_taxons_dedup.shape == (2029084, 18)
content_taxons_dedup.shape

(229084, 20)

In [54]:
#content_taxons_dedup.head()

## Write out dataframes for analyses
- allcontent_taxons: all deduplicated labelled content with no filtering by taxon
- content_taxons_dedup: Deduplicated labelled content without World/Corporate taxons
- taxons: cleaner taxons data from that produced by clean_taxons.py
- content_old_taxons: content items with a taxon id which is not contained within the topic taxonomy taxons
- empty_taxons_notworld (written to empty_taxons.csv): Taxons with no content tagged to them, which are not World taxons

In [55]:
# Write the analysis frames to CSV without the row index, in this order:
# all labelled content, the taxon-filtered de-duplicated content, and the cleaned taxons.
for _frame, _csv_path in (
        (allcontent_taxons, '../../data/labelled.csv'),
        (content_taxons_dedup, '../../data/filtered.csv'),
        (taxons, '../../data/taxons_cleaner.csv'),
):
    _frame.to_csv(_csv_path, index=False)
                            

In [68]:
# Write the content rows whose taxon_id had no match in the taxon data (no row index)
content_old_taxons.to_csv('../../data/old_tags.csv', index=False)

In [57]:
# Write the taxons that no content row matched after the World/Corporate filter (no row index)
empty_taxons_notworld.to_csv('../../data/empty_taxons.csv', index=False)