##  xgboost to classify govuk content to level2 taxons

### Load requirements and data

In [61]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

from keras.utils import to_categorical, layer_utils, plot_model
import keras.backend as K

from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.utils import class_weight
from sklearn.multiclass import OneVsRestClassifier

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

### Environmental vars

In [2]:
DATADIR=os.getenv('DATADIR')
#DATADIR='/data' #this was put in for AWS run but doesn't work locally...

## Hyperparameters

### Read in data
Content items tagged to level 2 taxons or lower in the topic taxonomy

In [3]:
labelled_level2 = pd.read_csv(os.path.join(DATADIR, 'labelled_level2.csv.gz'), dtype=object, compression='gzip')

In [4]:
labelled_level2.shape

(173560, 23)

In [5]:
labelled_level2['content_id'].nunique()

114048

#### clean up any World taxons leftover despite dropping relevant doctypes

In [6]:
#COLLAPSE World level2taxons
labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

#creating categorical variable for level2taxons from values
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

In [7]:
#count the number of content items per taxon into new column
labelled_level2['num_content_per_taxon'] = labelled_level2.groupby(["level2taxon"])['level2taxon'].transform("count")

In [8]:
labelled_level2['num_content_per_taxon'].describe()

count    173560.000000
mean       4574.207145
std        3682.635048
min           1.000000
25%        1500.000000
50%        3780.000000
75%        6156.000000
max       11717.000000
Name: num_content_per_taxon, dtype: float64

In [9]:
#number of rows in biggest level2 taxon -this is the target size for all other level2 taxons in resampling
max_content_freq = max(labelled_level2['num_content_per_taxon'])
max_content_freq

11717

### drop news

In [10]:
labelled_level2.shape

(173560, 24)

In [11]:
labelled_level2[(labelled_level2['document_type'] == 'world_news_story')].shape

(3927, 24)

In [12]:
labelled_level2[(labelled_level2['document_type'] == 'news_story')].shape

(33214, 24)

In [13]:
nonews = labelled_level2[(labelled_level2['document_type'] != 'news_story') &
                         (labelled_level2['document_type'] != 'world_news_story')]

In [14]:
nonews.shape

(136419, 24)

### Create dictionary mapping taxon codes to string labels

In [15]:
#Get the category numeric values (codes) and avoid zero-indexing
labels = nonews['level2taxon'].cat.codes + 1

#create dictionary of taxon category code to string label for use in model evaluation
labels_index = dict(zip((labels), nonews['level2taxon']))

In [16]:
#labels_index

In [17]:
print(len(labels_index))

210


### Create target/Y 

Note: when using the categorical_crossentropy loss, your targets should be in categorical format (e.g. if you have 10 classes, the target for each sample should be a 10-dimensional vector that is all-zeros expect for a 1 at the index corresponding to the class of the sample).

In multilabel learning, the joint set of binary classification tasks is expressed with label binary indicator array: each sample is one row of a 2d array of shape (n_samples, n_classes) with binary values:  
the one, i.e. the non zero elements, corresponds to the subset of labels.  
An array such as np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]]) represents label 0 in the first sample, labels 1 and 2 in the second sample, and no labels in the third sample.  
Producing multilabel data as a list of sets of labels may be more intuitive.

####  First reshape wide to get columns for each level2taxon and row number = number unique urls

In [18]:
#get a smaller copy of data for pivoting ease (think you can work from full data actually and other cols get droopedauto)

level2_reduced = nonews[['content_id', 
                         'level2taxon', 
                         'combined_text', 
                         'title', 
                         'description',
                         'document_type', 
                            'first_published_at', 
                            'publishing_app', 
                            'primary_publishing_organisation']].copy()

#how many level2taxons are there?
print('Number of unique level2taxons: {}'.format(level2_reduced.level2taxon.nunique()))

#count the number of taxons per content item into new column
level2_reduced['num_taxon_per_content'] = level2_reduced.groupby(["content_id"])['content_id'].transform("count")

#Add 1 because of zero-indexing to get 1-number of level2taxons as numerical targets
level2_reduced['level2taxon_code'] = level2_reduced.level2taxon.astype('category').cat.codes + 1

Number of unique level2taxons: 210


In [19]:
#how many level2taxons are there?
print('Number of unique level2taxons: {}'.format(labelled_level2.level2taxon.nunique()))

#count the number of taxons per content item into new column
labelled_level2['num_taxon_per_content'] = labelled_level2.groupby(["content_id"])['content_id'].transform("count")

#Add 1 because of zero-indexing to get 1-number of level2taxons as numerical targets
labelled_level2['level2taxon_code'] = labelled_level2.level2taxon.astype('category').cat.codes + 1

Number of unique level2taxons: 210


In [20]:
#reshape to wide per taxon and keep the combined text so indexing is consistent when splitting X from Y

multilabel = (level2_reduced.pivot_table(index=['content_id', 
                                                'combined_text', 
                                                'title', 
                                                'description' 
                                                ] , columns='level2taxon_code', values='num_taxon_per_content'))
print('level2reduced shape: {}'.format(level2_reduced.shape))
print('pivot table shape (no duplicates): {} '.format(multilabel.shape))


level2reduced shape: (136419, 11)
pivot table shape (no duplicates): (92338, 210) 


In [21]:
multilabel.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            201, 202, 203, 204, 205, 206, 207, 208, 209, 210],
           dtype='int64', name='level2taxon_code', length=210)

In [22]:
multilabel.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,level2taxon_code,1,2,3,4,5,6,7,8,9,10,...,201,202,203,204,205,206,207,208,209,210
content_id,combined_text,title,description,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
00029fa4-9b60-4285-898c-85ae8a6367f5,emma jones - small business crown representative as small business crown representative emma is keen to help uk smes win government business. emma was appointed as small business crown representative in july 2016. she was selected for the role because of her wealth of experience in working with smes. she is the founder of small business support group enterprise nation and the co-founder of startup britain. emma’s work in her role as small business crown representative includes: working with government and the small business panel to identify the remaining barriers to smes doing business with the public sector supporting the launch and delivery of the campaign to help show that government is “open for business” for smes and helping them bid for and win more contracts increasing awareness among smaller businesses of opportunities to deliver on behalf of larger private sector firms who have secured government contracts working with government to identify new opportunities to get best value from smes getting support emma is keen to hear what small business have to say and wants to engage with as many smes as possible. so if you’re thinking about becoming a government supplier take a look at the events and opportunities below for how to get involved and gain support. events the leeds cross government sme roadshow - 24 november 2017 - is a great opportunity for smes to hear directly about the opportunities to sell to the public sector. more information about the event and how to register can be found here. webinars register for free for emma’s half-hour webinars offering advice on how to become a government supplier. a list of webinars coming up is featured below. blogs read emma’s blogs to gain useful insight updates and tips for smes and government buyers. these smes did it and so can you! prompt payment makes for good business 2017 a big year for small businesses calling central government buyers: emma can help you meet your target small business saturday dec 2016: top tips for selling to government selling to the public sector guide in partner with the crown commercial service emma has developed a guide for small businesses with tips on selling to government. read here . government is open for business ‘open for business’ is the government’s campaign to reach more smes as potential suppliers: to help and support them to become suppliers and to listen to how government can improve the process. for more information visit www.gov.uk/openforbusiness register with contracts finder to keep updated on new and upcoming contracts worth over £10 000. for inspiration on how other small business have grown and benefitted from being a supplier government read our case studies . if you would like to help in getting the message out that government is open for business then visit the resources page for ways in which you can support.,emma jones - small business crown representative,as small business crown representative emma is keen to help uk smes win government business.,,,,,,,,,,,...,,,,,,,,,,
00037b70-5b08-44c2-bf0a-fa8eb636a60b,land remediation: bringing brownfield sites back to use brochure showing uk expertise in land remediation outlining technologies systems and ideas used in the regeneration of industrial land. the uk was the first industrialised country in the world. the legacy of the industrial revolution is over 400 000 hectares of contaminated land. uk expertise in land remediation has been borne out of necessity. the department for international trade’s ( dit ) brochure provides an overview of the expertise gained from over 5 decades of experience in land remediation. the brochure includes information on: sector specialists urban regeneration spill response monitoring and validation corporate liability management innovation industry bodies how dit can help this was published originally by uk trade and investment which has since moved to the department for international trade ( dit ).,land remediation: bringing brownfield sites back to use,brochure showing uk expertise in land remediation outlining technologies systems and ideas used in the regeneration of industrial land.,,,,,,,,,,,...,,,,,,,,,,
00037ee5-7b5e-452d-a233-af2c134f5bce,steps 2 success:ni statistics from october 2014 to september 2016 details on the number of referrals and starts on the steps 2 success programme and the number of moves into employment up to 30 sept 2016 statistics presented include details on the number of referrals and starts to the steps 2 success programme up to 30 september 2016.,steps 2 success:ni statistics from october 2014 to september 2016,details on the number of referrals and starts on the steps 2 success programme and the number of moves into employment up to 30 sept 2016,,,,,,,,,,,...,,,,,,,,,,
0004c63d-ae16-432a-bb35-c0f949b1e27c,student support applications for higher education: september 2016 data includes the number of applications received and grants awarded. these monthly statistics present information on applications for student support and tuition fee loans and tuition fee grants which include data for welsh domiciled students (wherever they study) and eu domiciled students studying in wales.,student support applications for higher education: september 2016,data includes the number of applications received and grants awarded.,,,,,,,,,,,...,,,,,,,,,,
0005ac76-50fe-42f1-8168-8b6fc046e40f,advice for building owners: large-scale wall system test 2 advice for building owners on the large-scale wall system test with acm with a polyethylene filler cladding with stone wool insulation. the government is undertaking large scale testing of cladding systems to understand better how 3 different types of aluminium composite material ( acm ) panels behave in combination with 2 different types of insulation in a fire. this note sets out advice to building owners following the results of the large scale test for a wall system including: acm with unmodified polyethylene filler (category 3 in screening tests) stone wool insulation. this should be read alongside the government’s explanatory note on the large scale wall systems testing .,advice for building owners: large-scale wall system test 2,advice for building owners on the large-scale wall system test with acm with a polyethylene filler cladding with stone wool insulation.,,,,,,,,,,,...,,,,,,,,,,


In [23]:
multilabel.columns.astype('str')

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '201', '202', '203', '204', '205', '206', '207', '208', '209', '210'],
      dtype='object', name='level2taxon_code', length=210)

In [24]:
#THIS IS WHY INDEXING IS NOT ZERO-BASED
#convert the number_of_taxons_per_content values to 1, meaning there was an entry for this taxon and this content_id, 0 otherwise
binary_multilabel = multilabel.notnull().astype('int')

## Data Pre-Processing

In [25]:
total_size = binary_multilabel.shape[0]
total_size

92338

In [26]:
nb_test_samples = int(0.1 * total_size) #test split
print('nb_test samples:', nb_test_samples)

nb_dev_samples = int(0.2 * total_size) #dev split
print('nb_dev samples:', nb_dev_samples)

nb_training_samples = int(0.8 * total_size) #train split
print('nb_training samples:', nb_training_samples)

nb_test samples: 9233
nb_dev samples: 18467
nb_training samples: 73870


### Shuffle

In [27]:
for i in range(0,10):
    print(binary_multilabel.index[i][0])

00029fa4-9b60-4285-898c-85ae8a6367f5
00037b70-5b08-44c2-bf0a-fa8eb636a60b
00037ee5-7b5e-452d-a233-af2c134f5bce
0004c63d-ae16-432a-bb35-c0f949b1e27c
0005ac76-50fe-42f1-8168-8b6fc046e40f
0006811c-ad80-4cd0-a732-04cc983ec8c2
0008f82f-9713-4074-8793-0d266d53930c
000aa34d-c3c0-4176-ad8a-50e801056df1
000b6a38-c69a-4ac9-918b-717a79cbdad2
000b8c7e-4671-4586-9eff-97c0c374126b


00029fa4-9b60-4285-898c-85ae8a6367f5
00037b70-5b08-44c2-bf0a-fa8eb636a60b
00037ee5-7b5e-452d-a233-af2c134f5bce
0004c63d-ae16-432a-bb35-c0f949b1e27c
0005ac76-50fe-42f1-8168-8b6fc046e40f
0006811c-ad80-4cd0-a732-04cc983ec8c2
0008f82f-9713-4074-8793-0d266d53930c
000aa34d-c3c0-4176-ad8a-50e801056df1
000b6a38-c69a-4ac9-918b-717a79cbdad2
000b8c7e-4671-4586-9eff-97c0c374126b

In [28]:
from sklearn.utils import shuffle

In [29]:
binary_multilabel = shuffle(binary_multilabel,random_state=0)

In [30]:
for i in range(0,10):
    print(binary_multilabel.index[i][0])

1372d620-2c7b-4c06-a6e7-c60cacdb4d58
d8132578-37d9-4f47-a438-86530b0ff259
5c8fa231-7631-11e4-a3cb-005056011aef
5f2bc84b-6fb0-48f7-9fab-41387b1d72fe
5df423c9-7631-11e4-a3cb-005056011aef
a1c32b79-e6b9-40b1-8095-150727418ea1
5bfd33c7-63bb-42d0-a6c7-9c2c03739165
5dc916ad-7631-11e4-a3cb-005056011aef
dabfc6b3-d88c-458f-a9fb-f286b987509b
5c71ba56-7631-11e4-a3cb-005056011aef


df76ffdf-70d6-4a38-9d60-a1765c18914e
dca1f897-c8bd-4e35-a839-5953ee94d54e
3bec5cd0-76bd-48b1-924a-567bd3361ec0
5eb7cd3c-7631-11e4-a3cb-005056011aef
a67385c3-8562-4dc1-96ba-d96ff215943b
5e35118a-7631-11e4-a3cb-005056011aef
5feb658b-7631-11e4-a3cb-005056011aef
144a86f9-6902-444c-87bc-b389a6f3b275
5e139390-7631-11e4-a3cb-005056011aef
e5741923-bc21-46bd-8832-886706f59e81

### Upsample minority classes to address imbalance leading to ~2, 465, 570 rows of data!

Access taxon columns with indexing. 

In [31]:
print("[ENCODING] Taxon min indx:",binary_multilabel.columns[0],"Taxon max indx:",
      binary_multilabel.columns[len(binary_multilabel.columns)-1])

[ENCODING] Taxon min indx: 1 Taxon max indx: 210


In [32]:
binary_multilabel[1].shape

(92338,)

In [33]:
type(binary_multilabel.columns[0])

numpy.int64

In [34]:
### Array with indices to upsample

In [35]:
index = [binary_multilabel.index[i][0] for i in range(0,nb_training_samples)]
print(len(index))

73870


In [36]:
binary_multilabel[binary_multilabel[1]==1].loc[index].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,level2taxon_code,1,2,3,4,5,6,7,8,9,10,...,201,202,203,204,205,206,207,208,209,210
content_id,combined_text,title,description,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
714b7c4c-269a-40fd-b3d8-41eda3d5517a,merger of local justice areas in greater manchester seeks views on merging 8 local justice areas (ljas) into a single lja to be known as the greater manchester lja. there are 3 key reasons for considering a merger of the current 8 ljas: to improve the effectiveness of the delivery of justice by improving flexibility in dealing with cases to make better use of reduced resources to increase the opportunities for magistrates to retain experience and thus competence the judicial business group (jbg) must address the question of magistrates’ sittings against the background of falling court sittings in criminal jurisdiction. the jbg must also consider the resources available to hmcts and criminal justice agencies to ensure that justice can be delivered as effectively as possible with reduced resources. staffing within hmcts and other organisations is determined by the workload and has therefore reduced over recent years.,merger of local justice areas in greater manchester,seeks views on merging 8 local justice areas (ljas) into a single lja to be known as the greater manchester lja.,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5f617c08-7631-11e4-a3cb-005056011aef,gwent magistrates' courts: proposals for the future this is a consultation on a proposal to close abergavenny magistrates' court and caerphilly magistrates' court. both abergavenny magistrates’ court and caerphilly magistrates’ court are in need of restoration and hm courts and tribunals service would incur considerable costs in making necessary repairs. it is proposed that both courts close and the workload be absorbed by the other 2 magistrates’ courts in gwent - newport and cwmbran. this proposal aims to ensure our court estate is used more efficiently and the closure of the courts would offer hm courts and tribunals service savings of around £80 000 a year. this consultation seeks the views of local users judiciary magistracy staff criminal justice agency practitioners and elected representatives to better understand the impact that this proposal would have on the gwent community.,gwent magistrates' courts: proposals for the future,this is a consultation on a proposal to close abergavenny magistrates' court and caerphilly magistrates' court.,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5f641586-7631-11e4-a3cb-005056011aef,two appointments for the judicial appointments commission judge phillip sycamore and judge usha karu appointed as commissioners of the judicial appointments commission. the lord chancellor today announced the appointment by her majesty the queen of his honour judge phillip sycamore as a senior judicial commissioner and her honour judge usha karu as the circuit judge member of the judicial appointments commission (jac). the appointments of his and her honour are for a 3 year period commencing 9 june 2014 with the possibility subject to the provisions of the constitutional reform act 2005 of renewal for a total period not exceeding 10 years. as serving salaried judges these posts are unpaid. the jac is an independent commission that selects candidates for judicial office in courts and tribunals in england and wales and for some tribunals whose jurisdiction extends to scotland or northern ireland. the jac selects candidates for judicial office on merit through fair and open competition from the widest range of eligible candidates. the jac comprises a chairman and 14 commissioners. the jac has a senior management team comprising the chief executive and a director and is supported by 67 staff. biographies his honour judge phillip sycamore judge sycamore was appointed as a circuit judge in 2001 and is currently the chamber president of the first tier tribunal (health education and social care chamber) a judge of the upper tribunal (administrative appeals chamber) and a deputy high court judge. he was previously the liaison judge for the mental health review tribunal the president of the law society of england and wales from 1997 to 1998 and a recorder from 1999 to 2001. this appointment was made on the recommendation of the tribunal judges council unlike the other jac commissioners who are nominated by judges’ councils. her honour judge usha karu judge karu was appointed as a circuit judge in 2005 and currently sits at the inner london crown court. prior to her appointment to the full time judiciary she was a barrister by profession and is one of the two diversity and community relations judges for inner london crown court. she is also the coordinating judge for judicial mentors for recorders for the london area as well as the new recorders training judge at the inner london crown court and a judicial member of the mental health review tribunal (restricted patients panel). this appointment was made through open competition. in accordance with the original nolan recommendations there is a requirement for appointees’ political activity (if any is declared) to be made public. her honour judge karu has declared that she has not been involved in any political activity.,two appointments for the judicial appointments commission,judge phillip sycamore and judge usha karu appointed as commissioners of the judicial appointments commission.,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8ea3bdc9-05ff-4416-8874-248a13bc7610,merger of north sussex and west sussex local justice areas seeks views on merger of 2 local justice areas (ljas) of sussex northern and sussex western into 1 lja to be known as the west sussex lja. the aim of this consultation is to find views on proposals to merge the local justice areas of sussex (northern) and sussex (western) into one new west sussex local justice area. this will give greater flexibility in managing the caseload across west sussex whilst increasing the opportunities for magistrates to sit on a broader range of cases on a regular basis and maintain experience. it aims to reduce delays and provide a more consistent service to court users. there will also be no reduction in access to justice for court users who have to attend hearings. this will also enable more effective management of the business of the bench reducing the number of meetings that magistrates and support staff must attend.,merger of north sussex and west sussex local justice areas,seeks views on merger of 2 local justice areas (ljas) of sussex northern and sussex western into 1 lja to be known as the west sussex lja.,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5fa8e157-7631-11e4-a3cb-005056011aef,proposals to reform judicial review the government response to the joint committee on human rights’ (jchr) 13th report of the 2013 to 2014 session. the government has set out its view on the committee’s recommendations in respect of its proposed reforms to judicial review many of which are being taken forward through the criminal justice and courts bill. the government’s view is that the reforms being taken forward are a proportionate response to the concerns raised in the consultation judicial review – proposals for further reform. the government is clear that judicial review is and must remain a crucial check on the power of the state and should continue to be readily available where it’s necessary in the interests of justice. the reforms the government is pursuing are aimed at speeding up the process for people who have arguable grounds and a genuine case to put. further information government response to the consultation judicial review: proposals for further reform,proposals to reform judicial review,the government response to the joint committee on human rights’ (jchr) 13th report of the 2013 to 2014 session.,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Why are we deleting this?
del binary_multilabel.columns.name

In [38]:
#TAKES FOREVER TO RUN!
from sklearn.utils import resample

In [39]:
upsampled_training = pd.DataFrame()
upper = len(binary_multilabel.columns)+1

for taxon in range(1, upper):
    num_samples = binary_multilabel[binary_multilabel[taxon]==1].shape[0] 
    if num_samples<500:
        print("Taxon code:",taxon,"Taxon name:",labels_index[taxon])
        print("SMALL SUPPORT:",num_samples)
        df_minority = binary_multilabel[binary_multilabel[taxon]==1].loc[index]
        if not df_minority.empty:
        # Upsample minority class
            print(df_minority.shape)
            df_minority_upsampled = resample(df_minority, 
                                                 replace=True,     # sample with replacement
                                                 n_samples=(500),    # to match majority class, switch to max_content_freq if works
                                                 random_state=123) # reproducible results
            
            print("FIRST 5 IDs:",[df_minority_upsampled.index[i][0] for i in range(0,5)])

            # Combine majority class with upsampled minority class
            upsampled_training = pd.concat([upsampled_training, df_minority_upsampled])

            # Display new shape
            print("UPSAMPLING:",upsampled_training.shape)

upsampled_training = shuffle(upsampled_training,random_state=0)

Taxon code: 1 Taxon name: Administrative justice reform
SMALL SUPPORT: 11
(10, 210)
FIRST 5 IDs: ['5f641586-7631-11e4-a3cb-005056011aef', '5f641586-7631-11e4-a3cb-005056011aef', 'fd0b66df-bab6-4e8a-bd7b-bb12a8ca63ca', '5f617c08-7631-11e4-a3cb-005056011aef', '8ea3bdc9-05ff-4416-8874-248a13bc7610']
UPSAMPLING: (500, 210)
Taxon code: 2 Taxon name: Adoption, fostering and surrogacy
SMALL SUPPORT: 69
(63, 210)
FIRST 5 IDs: ['0bf1fd2f-1798-421c-b9d3-36ced36d075b', '12eb9924-2e33-4222-b823-3e0e6ce2924d', '0bf1fd2f-1798-421c-b9d3-36ced36d075b', '5f4b08f5-7631-11e4-a3cb-005056011aef', '668f7b53-8546-49be-80de-7685c5350653']
UPSAMPLING: (1000, 210)
Taxon code: 3 Taxon name: Afghanistan
SMALL SUPPORT: 81
(64, 210)
FIRST 5 IDs: ['5e9c7226-7631-11e4-a3cb-005056011aef', '5ebdf67c-7631-11e4-a3cb-005056011aef', '5e9c7226-7631-11e4-a3cb-005056011aef', '5e94adff-7631-11e4-a3cb-005056011aef', '5e29a0b2-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (1500, 210)
Taxon code: 4 Taxon name: Armed Forces Covenant
S

(41, 210)
FIRST 5 IDs: ['77a514da-8fa5-405b-b0aa-cd16020d6edd', '5e966702-7631-11e4-a3cb-005056011aef', '5e5d6180-7631-11e4-a3cb-005056011aef', 'f917915c-c52a-484c-b343-726c49327e3f', '5e135611-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (13500, 210)
Taxon code: 33 Taxon name: Civil justice reform
SMALL SUPPORT: 3
(3, 210)
FIRST 5 IDs: ['c2734a07-31c8-4306-bb1c-d8038dba326e', 'ce9ceea5-e8b5-497d-ae73-e8290dcb5a5d', 'c2734a07-31c8-4306-bb1c-d8038dba326e', 'c2734a07-31c8-4306-bb1c-d8038dba326e', 'c4445136-1bae-46e6-b8a1-55640dc14f80']
UPSAMPLING: (14000, 210)
Taxon code: 34 Taxon name: Civil service reform
SMALL SUPPORT: 358
(277, 210)
FIRST 5 IDs: ['ed6ef793-b9e8-47f6-b757-6714f9559f2f', 'ad5d0235-24e2-49c6-93ed-0952520f369d', 'b10785ef-5454-4648-9e5e-861fec0ccf52', '5fd93319-7631-11e4-a3cb-005056011aef', '74f36b27-bca1-45a6-a894-f602c1281de1']
UPSAMPLING: (14500, 210)
Taxon code: 36 Taxon name: Commercial fishing and fisheries
SMALL SUPPORT: 263
(215, 210)
FIRST 5 IDs: ['5f1f85f6-7631-11

(8, 210)
FIRST 5 IDs: ['5c7166fe-7631-11e4-a3cb-005056011aef', '5c838a0c-7631-11e4-a3cb-005056011aef', '5c7166fe-7631-11e4-a3cb-005056011aef', '5c838905-7631-11e4-a3cb-005056011aef', '5f1ad877-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (26500, 210)
Taxon code: 68 Taxon name: European funds
SMALL SUPPORT: 82
(61, 210)
FIRST 5 IDs: ['ed031242-6195-4d23-9588-9fb1d341db48', 'b99b2e66-9f5b-4325-b1ac-e46b13e8cdac', '5c98c569-7631-11e4-a3cb-005056011aef', 'fb09c7a2-9e74-4ff8-94f9-79ce8598170a', '6028fa6a-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (27000, 210)
Taxon code: 69 Taxon name: European single market
SMALL SUPPORT: 142
(105, 210)
FIRST 5 IDs: ['9f3d84a1-7243-46ae-8dcf-36abdf3bacb1', '5d60dfcf-7631-11e4-a3cb-005056011aef', 'ec8125be-2409-46a3-98cc-2664a81131e5', '9fea336a-e759-4301-8fa1-fa2e4ab499c6', '5c993e48-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (27500, 210)
Taxon code: 70 Taxon name: Expenses and employee benefits
SMALL SUPPORT: 8
(6, 210)
FIRST 5 IDs: ['f6046a19-8312-412c-aeaa-9

UPSAMPLING: (39000, 210)
Taxon code: 104 Taxon name: Land registration
SMALL SUPPORT: 140
(112, 210)
FIRST 5 IDs: ['5f6499f6-7631-11e4-a3cb-005056011aef', '5f50d72a-7631-11e4-a3cb-005056011aef', '305e0f09-a541-41b0-b317-5f8ba7148647', '5f66e6be-7631-11e4-a3cb-005056011aef', '5f4fe1c5-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (39500, 210)
Taxon code: 105 Taxon name: Lasting power of attorney, being in care and your financial affairs
SMALL SUPPORT: 21
(15, 210)
FIRST 5 IDs: ['ef1207f4-4cec-4448-988d-997d795793af', 'faa1492b-e7ad-42de-80e3-c5a2c9043972', 'ef1207f4-4cec-4448-988d-997d795793af', '136bfef2-354c-474b-bf97-a6d39ec4248b', 'f508898d-1ba0-46f7-b150-828166886d97']
UPSAMPLING: (40000, 210)
Taxon code: 106 Taxon name: Law and practice
SMALL SUPPORT: 55
(48, 210)
FIRST 5 IDs: ['875fbcdb-5d8e-4310-a281-2597ec8b9c40', 'd6ef1190-e58c-42b1-98f6-6629b16d25d6', '5fee6f5a-7631-11e4-a3cb-005056011aef', '5f65f4e4-7631-11e4-a3cb-005056011aef', '0b1014ce-b0ad-46c1-9753-ec9a8f21cb87']
UPSAMPLING

(28, 210)
FIRST 5 IDs: ['dfa28d6c-c181-4b96-af95-30f24d66ff28', '5d81b47b-7631-11e4-a3cb-005056011aef', '5d81b47b-7631-11e4-a3cb-005056011aef', '6bcb87c7-0d26-4fb0-8079-c2dfa09220f3', '52d01c9e-fa5c-4e26-b309-bae92dc1c4bc']
UPSAMPLING: (52000, 210)
Taxon code: 137 Taxon name: Payroll
SMALL SUPPORT: 40
(33, 210)
FIRST 5 IDs: ['eed1c39e-523c-4e54-8025-eed301b61e62', '5dd79407-0213-4675-9754-d6cfd1118f87', 'a4fa57a8-8a70-4b66-a7ba-22f5103b6378', 'a0a16869-906d-4b7e-8e43-bee9a60cc853', 'b8b5b409-a2cc-4b6c-972d-eb7072c21626']
UPSAMPLING: (52500, 210)
Taxon code: 140 Taxon name: Permanent stay in the UK
SMALL SUPPORT: 33
(26, 210)
FIRST 5 IDs: ['5ec23dca-7631-11e4-a3cb-005056011aef', '5ee540bf-7631-11e4-a3cb-005056011aef', '5ee540bf-7631-11e4-a3cb-005056011aef', '5ec20675-7631-11e4-a3cb-005056011aef', 'f4c6ac13-1769-4e0a-aba5-46eedaab2bcf']
UPSAMPLING: (53000, 210)
Taxon code: 142 Taxon name: Policing
SMALL SUPPORT: 487
(391, 210)
FIRST 5 IDs: ['5ff07d1d-7631-11e4-a3cb-005056011aef', '5f5e74

UPSAMPLING: (64500, 210)
Taxon code: 177 Taxon name: Tax evasion and avoidance
SMALL SUPPORT: 122
(86, 210)
FIRST 5 IDs: ['5d63d132-7631-11e4-a3cb-005056011aef', '5e5b0b50-7631-11e4-a3cb-005056011aef', 'af5df0b0-e289-4b43-971e-ad210505a148', '5e340b3a-7631-11e4-a3cb-005056011aef', '5ebdf67c-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (65000, 210)
Taxon code: 179 Taxon name: The Commonwealth
SMALL SUPPORT: 50
(42, 210)
FIRST 5 IDs: ['5ebf06c2-7631-11e4-a3cb-005056011aef', 'c34a1171-6177-490b-933c-b78f05336424', 'f2c7b7f3-f712-4b27-ae97-e60e9cdf626b', '5dc62156-7631-11e4-a3cb-005056011aef', '487eefe2-17d8-4b37-a117-93ce05e2469c']
UPSAMPLING: (65500, 210)
Taxon code: 180 Taxon name: Tourism
SMALL SUPPORT: 118
(94, 210)
FIRST 5 IDs: ['5d3c07de-7631-11e4-a3cb-005056011aef', 'cb809fec-0fb5-4146-bf43-9d5272765099', '0cd922e9-1cea-4adf-a7a5-cd3c545f45ae', '5d60e381-7631-11e4-a3cb-005056011aef', '5d339a90-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (66000, 210)
Taxon code: 182 Taxon name: Transport

### Doublecheck dataframe contents before merging.

In [40]:
binary_multilabel.shape

(92338, 210)

In [41]:
binary_multilabel.index[91770][0] # final sample before merging.

'7e4943f3-3acd-4929-bdcd-d6a254d4dec1'

In [42]:
binary_multilabel = pd.concat([binary_multilabel, upsampled_training])

In [43]:
binary_multilabel.index[total_size][0] # first sample of duplicated training data

'5db8ea74-7631-11e4-a3cb-005056011aef'

Do not remove index because the text data lives there.
**TODO** Consider reworking how datasets are set up at some point

In [44]:
binary_multilabel.to_csv(os.path.join(DATADIR, 'balanced_level2_training_set_sampled.csv.gz'), compression='gzip')

### LOAD OVERSAMPLED DATASET

In [45]:
balanced_df = pd.read_csv(os.path.join(DATADIR, 'balanced_level2_training_set_sampled.csv.gz'), dtype=object, compression='gzip')

In [46]:
balanced_df.shape

(169338, 214)

In [47]:
#will convert columns to an array of shape
print('Shape of Y multilabel array before train/val/test split:{}'.format(balanced_df[list(balanced_df.columns)].values.shape))

Shape of Y multilabel array before train/val/test split:(169338, 214)


In [48]:
#dont' overwirte blanced_df as it take sages to read in
balanced_df_taxons = balanced_df.iloc[:,4:215]

In [49]:
balanced_df_taxons.columns = balanced_df_taxons.columns.astype(int)

In [50]:
balanced_df_taxons = balanced_df_taxons.astype(int)

In [51]:
#convert columns to an array. Each row represents a content item, each column an individual taxon
binary_multilabel = balanced_df_taxons[list(balanced_df_taxons.columns)].values
print('Example row of multilabel array {}'.format(binary_multilabel[2]))

Example row of multilabel array [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [52]:
balanced_df.head()

Unnamed: 0,content_id,combined_text,title,description,1,2,3,4,5,6,...,201,202,203,204,205,206,207,208,209,210
0,1372d620-2c7b-4c06-a6e7-c60cacdb4d58,drug strategy 2017 the drug strategy 2017 sets...,drug strategy 2017,the drug strategy 2017 sets out how the govern...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,d8132578-37d9-4f47-a438-86530b0ff259,bees and ants to flourish while the cuckoo flo...,bees and ants to flourish while the cuckoo flo...,climate change research reveals species most a...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5c8fa231-7631-11e4-a3cb-005056011aef,defence training estate warcop: public access ...,defence training estate warcop: public access ...,scheduled public access to the defence trainin...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5f2bc84b-6fb0-48f7-9fab-41387b1d72fe,chevras machzikei mesifta: 2016 inquiry report...,chevras machzikei mesifta: 2016 inquiry report,chevras machzikei mesifta: charity commission ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5df423c9-7631-11e4-a3cb-005056011aef,rail franchising a statement about the refranc...,rail franchising,a statement about the refranchising programme ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Format metadata/X

In [53]:
#extract content_id index to df
meta1 = pd.DataFrame(balanced_df['content_id'])

In [54]:
print(meta1.shape)
meta1.head()

(169338, 1)


Unnamed: 0,content_id
0,1372d620-2c7b-4c06-a6e7-c60cacdb4d58
1,d8132578-37d9-4f47-a438-86530b0ff259
2,5c8fa231-7631-11e4-a3cb-005056011aef
3,5f2bc84b-6fb0-48f7-9fab-41387b1d72fe
4,5df423c9-7631-11e4-a3cb-005056011aef


In [55]:
metas = ['document_type','first_published_at','publishing_app','primary_publishing_organisation']

In [56]:
def build_index(x):
    index_dict = {}
    index_dict['index'] = 0
    for i,elem in enumerate(x):
        index_dict[elem] = i+1
    return index_dict

In [57]:
import time

In [58]:
#IF THIS FUNCTION TURNS OUT FASTER KEEP
#apply meta data to content
print("STARTED:",time.strftime("%H:%M:%S"))
for meta in metas:
    print("WORKON:",meta)
    meta1[meta] = meta1['content_id'].map(dict(zip(labelled_level2['content_id'], labelled_level2[meta])))
print("FINISHED:",time.strftime("%H:%M:%S"))

STARTED: 12:52:31
WORKON: document_type
WORKON: first_published_at
WORKON: publishing_app
WORKON: primary_publishing_organisation
FINISHED: 12:52:31


In [59]:
meta1 = meta1.replace(np.nan, '', regex=True) #conver nans to empty strings for labelencoder types
meta1.head()

Unnamed: 0,content_id,document_type,first_published_at,publishing_app,primary_publishing_organisation
0,1372d620-2c7b-4c06-a6e7-c60cacdb4d58,policy_paper,2017-07-13T23:00:26.000+00:00,whitehall,{'title': 'Home Office'}
1,d8132578-37d9-4f47-a438-86530b0ff259,press_release,2015-07-22T08:10:53.000+00:00,whitehall,{'title': 'Natural England'}
2,5c8fa231-7631-11e4-a3cb-005056011aef,guidance,2012-07-04T00:00:00.000+00:00,whitehall,{'title': 'Ministry of Defence'}
3,5f2bc84b-6fb0-48f7-9fab-41387b1d72fe,decision,2016-07-13T10:03:31.000+00:00,whitehall,{'title': 'The Charity Commission'}
4,5df423c9-7631-11e4-a3cb-005056011aef,oral_statement,2013-03-26T00:00:00.000+00:00,whitehall,{'title': 'Department for Transport'}


In [62]:
def to_cat_to_hot(column):
    doctype_encoder = LabelEncoder()
    new_col = column+"_cat"
    meta1[new_col] = doctype_encoder.fit_transform(meta1[column])
    tf.cast(meta1[new_col], tf.float32)
    return to_categorical(meta1[new_col])

dict_of_encodings = {}
for meta in metas:
    if meta != "first_published_at":
        print(meta)
        dict_of_encodings[meta] = to_cat_to_hot(meta)   

document_type
publishing_app
primary_publishing_organisation


In [63]:
dict_of_encodings

{'document_type': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 'primary_publishing_organisation': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 'publishing_app': array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
        [ 0.,  0.,  0., ...,  0.,  0.,  1.],
        [ 0.,  0.,  0., ...,  0.,  0.,  1.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  1.],
        [ 0.,  0.,  0., ...,  0.,  0.,  1.]])}

In [64]:
type(meta1['first_published_at'][0])

str

In [65]:
meta1['first_published_at'] = pd.to_datetime(meta1['first_published_at'])
print(meta1['first_published_at'].shape)

(169338,)


In [66]:
first_published = np.array(meta1['first_published_at']).reshape(meta1['first_published_at'].shape[0], 1)

In [67]:
print(first_published.dtype,first_published.shape,type(first_published))

datetime64[ns] (169338, 1) <class 'numpy.ndarray'>


In [68]:
first_published[0]

array(['2017-07-13T23:00:26.000000000'], dtype='datetime64[ns]')

In [69]:
meta1['first_published_at'].loc[meta1['first_published_at'] < '1970']

10461   1955-01-01 00:00:00
17755   1969-07-24 00:00:00
50923   1961-02-27 00:00:00
51324   1965-04-05 23:00:00
86399   1963-02-28 00:00:00
Name: first_published_at, dtype: datetime64[ns]

In [70]:
first_published[first_published < np.datetime64('1970')]

  if __name__ == '__main__':


array(['1955-01-01T00:00:00.000000000', '1969-07-24T00:00:00.000000000',
       'NaT', '1961-02-27T00:00:00.000000000',
       '1965-04-05T23:00:00.000000000', '1963-02-28T00:00:00.000000000'], dtype='datetime64[ns]')

In [71]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()

first_published_scaled =scaler.fit_transform(first_published)




In [72]:
from scipy import stats

print(stats.describe(first_published_scaled))


DescribeResult(nobs=169338, minmax=(array([ 0.]), array([ 1.])), mean=array([ 0.98993999]), variance=array([  7.61789979e-05]), skewness=array([-11.94071949]), kurtosis=array([ 1001.36422597]))


In [73]:
first_published[1]

array(['2015-07-22T08:10:53.000000000'], dtype='datetime64[ns]')

In [74]:
(np.datetime64('today', 'ns') - first_published[1]).astype('timedelta64[Y]') < np.timedelta64(1, 'Y')

array([False], dtype=bool)

In [75]:
last_year = np.where((np.datetime64('today', 'D') - first_published).astype('timedelta64[Y]') < np.timedelta64(1, 'Y'), 1, 0)
last_2years = np.where((np.datetime64('today', 'D') - first_published).astype('timedelta64[Y]') < np.timedelta64(2, 'Y'), 1, 0)
last_5years = np.where((np.datetime64('today', 'D') - first_published).astype('timedelta64[Y]') < np.timedelta64(5, 'Y'), 1, 0)
olderthan5 = np.where((np.datetime64('today', 'D') - first_published).astype('timedelta64[Y]') > np.timedelta64(5, 'Y'), 1, 0)

  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [76]:
print(last_year.sum())
len(last_year)

13659


169338

In [77]:
dict_of_encodings.keys()

dict_keys(['document_type', 'publishing_app', 'primary_publishing_organisation'])

In [78]:
meta = np.concatenate((dict_of_encodings['document_type'], 
                               dict_of_encodings['primary_publishing_organisation'], 
                               dict_of_encodings['publishing_app'], 
                      first_published_scaled,
                       last_year,
                       last_2years,
                       last_5years, 
                      olderthan5), 
                              axis=1)

In [79]:
meta.shape

(169338, 436)

In [80]:
nb_metavars = meta.shape[1]
print(nb_metavars)
print(meta.shape)

436
(169338, 436)


### Data split
- Training data = 80%
- Development data = 10%
- Test data = 10%

#### Original sizes, keep for reference.
    nb_test samples: 9177
    nb_dev samples: 18354
    nb_training samples: 73416

In [81]:
print(nb_training_samples,nb_dev_samples,nb_test_samples)

73870 18467 9233


In [82]:
def split(data,splits):
    l = []
    for (start,end) in splits:
        l.append(data[start:end])
    return tuple([x for x in l])

In [84]:
diff = len(meta1)-total_size+1
diff

77001

In [86]:
splits = [(0,-(nb_dev_samples+diff)),(-(nb_dev_samples+diff),-(nb_test_samples+diff)),(-(nb_test_samples+diff),total_size)]
re_split = [(total_size,len(meta1))]

In [87]:
meta_train, meta_dev, meta_test = split(meta,splits)
meta_resampled = split(meta,re_split)[0]
meta_train = np.concatenate([meta_train,meta_resampled],axis=0)
                                                                
                                                                  
y_train, y_dev, y_test = split(binary_multilabel,splits)
y_resampled = split(binary_multilabel,re_split)[0]
y_train = np.concatenate([y_train,y_resampled],axis=0)                                                             

In [88]:

print('Shape of metax_train:', meta_train.shape)

print('Shape of y_train:', y_train.shape)

Shape of metax_train: (150870, 436)
Shape of y_train: (150870, 210)


In [89]:

print('Shape of meta_dev:', meta_dev.shape)

print('Shape of y_dev:', y_dev.shape)

Shape of meta_dev: (9234, 436)
Shape of y_dev: (9234, 210)


In [90]:

print('Shape of metax_test:', meta_test.shape)

print('Shape of y_test:', y_test.shape)

Shape of metax_test: (9234, 436)
Shape of y_test: (9234, 210)


In [91]:
def f1(y_true, y_pred):
    """Use Recall  and precision metrics to calculate harmonic mean (F1 score).

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1 = 2*((precision*recall)/(precision+recall))
    
    return f1

### Create & fit model

In [127]:
ratios = np.empty((y_train.shape[1]))
for i in range(0, y_train.shape[1]):
    if np.sum(y_train[:, i]==1) > 0:
        ratio = float(np.sum(y_train[:, i]==0)/np.sum(y_train[:, i]==1))
    else:
        ratio = 1
    ratios = np.append(ratios, [ratio], axis=0)

In [133]:
#ratios = np.empty((y_train.shape[1]))
np.mean(ratios)

93.104897198227476

In [None]:
classif = OneVsRestClassifier(XGBClassifier(scale_pos_weight=10, n_jobs=32), n_jobs=-1)
classif.fit(meta_train, y_train) #, eval_set=[(meta_train, y_train), (meta_dev, y_dev)], eval_metric='error', verbose=True)

### Save results arrays

In [None]:
def to_file(array,name):
    df = pd.DataFrame(data = array.tolist(),columns=[i for i in range(1,211)])
    df.to_csv(os.path.join(DATADIR, name+'_results.csv.gz'),compression='gzip',index=False)

In [None]:
date_run = time.strftime("_%H%M_%d%m_")
date_run

In [None]:
y_prob = classif.predict(meta_train)

In [None]:
to_file(y_prob,"train"+date_run)

In [None]:
y_pred = y_prob.copy()
y_pred[y_pred>=0.5] = 1
y_pred[y_pred<0.5] = 0

In [None]:
print('micro: {}'.format(precision_recall_fscore_support(y_train, y_pred, average='micro', sample_weight=None)))
print('macro: {}'.format(precision_recall_fscore_support(y_train, y_pred, average='macro', sample_weight=None)))
print('weightedmacro: {}'.format(precision_recall_fscore_support(y_train, y_pred, average='weighted', sample_weight=None)))

In [None]:
y_prob_dev = classif.predict(meta_dev)

In [None]:
to_file(y_prob_dev,"dev"+date_run)

In [None]:
y_pred_dev = y_prob_dev.copy()
y_pred_dev[y_pred_dev>=0.5] = 1
y_pred_dev[y_pred_dev<0.5] = 0

In [None]:
print('micro: {}'.format(precision_recall_fscore_support(y_dev, y_pred_dev, average='micro', sample_weight=None)))
print('macro: {}'.format(precision_recall_fscore_support(y_dev, y_pred_dev, average='macro', sample_weight=None)))
print('weightedmacro: {}'.format(precision_recall_fscore_support(y_dev, y_pred_dev, average='weighted', sample_weight=None)))

In [None]:
print('weightedmacro: {}'.format(precision_recall_fscore_support(y_dev, y_pred_dev, average=None, sample_weight=None)))

In [None]:
to_file(y_train,"true_train"+date_run)

In [None]:
to_file(y_dev,"true_dev"+date_run)