### Load requirements and data

In [5]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical, layer_utils, plot_model

from keras.layers import (Embedding, Input, Dense, Dropout, 
                          Activation, Conv1D, MaxPooling1D, Flatten, concatenate, Reshape)
from keras.models import Model, Sequential
from keras.optimizers import rmsprop
from keras.callbacks import TensorBoard, Callback, ModelCheckpoint
import keras.backend as K
from keras.losses import binary_crossentropy

from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.utils import class_weight
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py


### Environmental vars

In [6]:
DATADIR=os.getenv('DATADIR')
#DATADIR='/data' #this was put in for AWS run but doesn't work locally...

### Read in data
Content items tagged to level 2 taxons or lower in the topic taxonomy

In [7]:
labelled_level2 = pd.read_csv(os.path.join(DATADIR, 'labelled_level2.csv.gz'), dtype=object, compression='gzip')

#### clean up any World taxons leftover despite dropping relevant doctypes

In [8]:
#COLLAPSE World level2taxons
labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

#creating categorical variable for level2taxons from values
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

In [9]:
labelled_level2.shape

(173560, 23)

### drop news

In [10]:
labelled_level2[(labelled_level2['document_type'] == 'world_news_story')].shape

(3927, 23)

In [11]:
labelled_level2[(labelled_level2['document_type'] == 'news_story')].shape

(33214, 23)

In [12]:
# Remove both news document types in a single pass; rows with any other
# (or missing) document_type are kept, in their original order.
nonews = labelled_level2[
    ~labelled_level2['document_type'].isin(['news_story', 'world_news_story'])
]

In [13]:
nonews.shape

(136419, 23)

### Create dictionary mapping taxon codes to string labels

In [14]:
#Get the category numeric values (codes) and avoid zero-indexing
labels = nonews['level2taxon'].cat.codes + 1

#create dictionary of taxon category code to string label for use in model evaluation
labels_index = dict(zip((labels), nonews['level2taxon']))
labels_index

{1: 'Administrative justice reform',
 2: 'Adoption, fostering and surrogacy',
 3: 'Afghanistan',
 4: 'Armed Forces Covenant',
 5: 'Armed forces',
 6: 'Armed forces and Ministry of Defence reform',
 7: 'Armed forces support for activities in the UK',
 8: 'Arts and culture',
 9: 'Assessing environmental impact',
 10: 'Asylum',
 11: 'Attorney General guidance to the legal profession',
 12: 'Aviation',
 13: 'Benefits entitlement',
 14: 'Benefits for families',
 15: 'Biodiversity and ecosystems',
 16: 'Boating and inland waterways',
 17: 'Brexit',
 18: 'Brexit and the EU',
 19: 'British citizenship ',
 20: 'British nationals overseas',
 21: 'Business and enterprise',
 22: 'Business and the environment',
 23: 'Business tax',
 24: 'Byelaws',
 25: 'Carers and disability benefits',
 26: "Carers' health",
 27: 'Certificates, register offices, changes of name or gender',
 28: 'Charities, volunteering and honours',
 29: 'Child Benefit',
 30: 'Child maintenance reform',
 31: 'Childcare and early ye

In [15]:
print(len(labels_index))

210


### Create target/Y 

Note: when using the categorical_crossentropy loss, your targets should be in categorical format (e.g. if you have 10 classes, the target for each sample should be a 10-dimensional vector that is all zeros except for a 1 at the index corresponding to the class of the sample).

In multilabel learning, the joint set of binary classification tasks is expressed with label binary indicator array: each sample is one row of a 2d array of shape (n_samples, n_classes) with binary values:  
the one, i.e. the non zero elements, corresponds to the subset of labels.  
An array such as np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]]) represents label 0 in the first sample, labels 1 and 2 in the second sample, and no labels in the third sample.  
Producing multilabel data as a list of sets of labels may be more intuitive.

####  First reshape wide to get columns for each level2taxon and row number = number unique urls

In [16]:
# Take a narrow copy of the data to make pivoting easier: only the columns needed for the
# pivot index and the taxon label are kept (working from the full frame may also be possible).

level2_reduced = nonews[['content_id', 'level2taxon', 'combined_text', 'title', 'description']].copy()

# how many level2taxons are there?
print('Number of unique level2taxons: {}'.format(level2_reduced.level2taxon.nunique()))

# count the number of rows (i.e. taxon tags) per content item into a new column
level2_reduced['num_taxon_per_content'] = level2_reduced.groupby(["content_id"])['content_id'].transform("count")

# Category codes are zero-based, so add 1 to get numerical targets running from 1 upwards.
# These 1-based codes become the column labels of the pivot table built in a later cell.
level2_reduced['level2taxon_code'] = level2_reduced.level2taxon.astype('category').cat.codes + 1

Number of unique level2taxons: 210


In [17]:
#how many level2taxons are there?
print('Number of unique level2taxons: {}'.format(labelled_level2.level2taxon.nunique()))

#count the number of taxons per content item into new column
labelled_level2['num_taxon_per_content'] = labelled_level2.groupby(["content_id"])['content_id'].transform("count")

#Add 1 because of zero-indexing to get 1-number of level2taxons as numerical targets
labelled_level2['level2taxon_code'] = labelled_level2.level2taxon.astype('category').cat.codes + 1

Number of unique level2taxons: 210


In [18]:
#reshape to wide per taxon and keep the combined text so indexing is consistent when splitting X from Y

multilabel = (level2_reduced.pivot_table(index=['content_id', 'combined_text', 'title', 'description' ], 
                  columns='level2taxon_code', 
                  values='num_taxon_per_content'))
print('level2reduced shape: {}'.format(level2_reduced.shape))
print('pivot table shape (no duplicates): {} '.format(multilabel.shape))

level2reduced shape: (136419, 7)
pivot table shape (no duplicates): (92338, 210) 


In [19]:
#THIS IS WHY INDEXING IS NOT ZERO-BASED
#convert the number_of_taxons_per_content values to 1, meaning there was an entry for this taxon and this content_id, 0 otherwise
binary_multilabel = multilabel.notnull().astype('int')

In [20]:
#will convert columns to an array of shape
print('Shape of Y multilabel array before train/val/test split:{}'.format(binary_multilabel[list(binary_multilabel.columns)].values.shape))

Shape of Y multilabel array before train/val/test split:(92338, 210)


In [21]:
#convert columns to an array. Each row represents a content item, each column an individual taxon
binary_multilabel = binary_multilabel[list(binary_multilabel.columns)].values
print('Example row of multilabel array {}'.format(binary_multilabel[2]))

Example row of multilabel array [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [22]:
type(binary_multilabel)

numpy.ndarray

In [23]:
binary_multilabel.shape

(92338, 210)

### Create combined_text data/X

In [24]:
# the pivot table has a four-level MultiIndex: content_id, combined_text, title, description
multilabel.index.names

FrozenList(['content_id', 'combined_text', 'title', 'description'])

In [25]:
#extract combined text index to array
texts = multilabel.index.get_level_values('combined_text')
texts.shape

(92338,)

### Tokenize combined text

In [26]:
vectorizer = TfidfVectorizer()

In [27]:
data = vectorizer.fit_transform(texts)

In [28]:
data

<92338x136452 sparse matrix of type '<class 'numpy.float64'>'
	with 11269725 stored elements in Compressed Sparse Row format>

In [29]:
print('Shape of label tensor:', binary_multilabel.shape)
print('Shape of data tensor:', data.shape)

Shape of label tensor: (92338, 210)
Shape of data tensor: (92338, 136452)


### Data split
- Training data = 80%
- Development data = 10%
- Test data = 10%

In [30]:
# shuffle data and standardise indices
indices = np.arange(data.shape[0])
print(indices)
np.random.seed(0)
np.random.shuffle(indices)
print(indices)

[    0     1     2 ..., 92335 92336 92337]
[ 3348 85080 18965 ..., 42613 43567 68268]


In [31]:
data = data[indices]
labels = binary_multilabel[indices]

In [32]:
nb_test_samples = int(0.1 * data.shape[0]) # size of the test split: the last 10% of the shuffled rows
print('nb_test samples:', nb_test_samples)

nb_dev_samples = int(0.2 * data.shape[0]) # NOTE: 20% = dev + test together; the dev split is sliced as [-nb_dev_samples:-nb_test_samples]
print('nb_dev samples:', nb_dev_samples)

nb_training_samples = int(0.8 * data.shape[0]) # nominal 80% training size, printed for information; the training slice itself is taken as [:-nb_dev_samples]
print('nb_training samples:', nb_training_samples)

nb_test samples: 9233
nb_dev samples: 18467
nb_training samples: 73870


In [33]:
x_train = data[:-nb_dev_samples]
print('Shape of x_train:', x_train.shape)


y_train = labels[:-nb_dev_samples]
print('Shape of y_train:', y_train.shape)

Shape of x_train: (73871, 136452)
Shape of y_train: (73871, 210)


In [34]:
# Development split: the slice between the training rows and the final test rows.
x_dev = data[-nb_dev_samples:-nb_test_samples]
print('Shape of x_dev:', x_dev.shape)

# The labels must come from `labels` (binary_multilabel re-ordered with the same shuffled
# indices as `data`). Slicing the unshuffled `binary_multilabel` would pair each dev
# document with another document's taxons.
y_dev = labels[-nb_dev_samples:-nb_test_samples]
print('Shape of y_dev:', y_dev.shape)

assert x_dev.shape[0] == y_dev.shape[0], 'x_dev and y_dev must have the same number of rows'

Shape of x_dev: (9234, 136452)
Shape of y_dev: (9234, 210)


In [35]:
# Test split: the last nb_test_samples rows of the shuffled data.
x_test = data[-nb_test_samples:]
print('Shape of x_test:', x_test.shape)


# Take the labels from `labels` (shuffled with the same indices as `data`), not from the
# unshuffled `binary_multilabel`, so that each test row keeps its own taxons.
y_test = labels[-nb_test_samples:]
print('Shape of y_test:', y_test.shape)

assert x_test.shape[0] == y_test.shape[0], 'x_test and y_test must have the same number of rows'

Shape of x_test: (9233, 136452)
Shape of y_test: (9233, 210)


In [36]:
def loadfile(filename, istrain):
    """Parse a libsvm-style text file into sparse matrices.

    Each line is expected to look like ``<head> <col>:<value> <col>:<value> ...``.
    When ``istrain`` is true the head is a comma-separated list of 1-based class
    ids; otherwise the head is ignored. Column and class ids in the file are
    1-based and are stored 0-based.

    Lines that cannot be parsed (``ValueError``) are reported and skipped as a
    whole: nothing from a malformed line is kept, so it can neither leave partial
    entries behind nor be merged into the following line's row.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    istrain : bool
        Whether to parse the class ids at the start of every line.

    Returns
    -------
    S : scipy.sparse.csc_matrix
        Feature matrix, one row per successfully parsed line.
    C : scipy.sparse.csc_matrix
        Binary label-indicator matrix (only returned when ``istrain`` is true).
    """
    # Local import so the function does not depend on a later notebook cell
    # having already imported scipy.sparse.
    from scipy.sparse import csc_matrix

    i = 0      # number of lines parsed so far == row index of the next line
    A = []     # feature column indices
    B = []     # feature values
    I = []     # feature row indices
    cls = []   # class column indices
    Icls = []  # class row indices

    # `with` guarantees the handle is closed even if parsing raises.
    with open(filename) as ifile:
        for s in ifile:
            try:
                # Parse the whole line into temporaries first; the shared lists are
                # only extended once the line is known to be valid.
                sbeg, send = s.split(" ", 1)
                line_cls = [int(x) - 1 for x in sbeg.split(",")] if istrain else []
                line_cols = []
                line_vals = []
                for ss in send.rstrip().split(" "):
                    a, b = ss.split(":")
                    line_cols.append(int(a) - 1)
                    line_vals.append(float(b))
            except ValueError:
                print('Value Error: ', i)
                continue

            cls.extend(line_cls)
            Icls.extend([i] * len(line_cls))
            A.extend(line_cols)
            B.extend(line_vals)
            I.extend([i] * len(line_cols))

            i += 1
            if i % 10000 == 0:
                print(str(i)+' lines')

    S = csc_matrix((B, (I, A)))
    if istrain:
        C = csc_matrix(([1]*len(cls), (Icls, cls)))
        return S, C
    return S

In [37]:
# to build model in a multi-label problem
def modelfit(S1, C1, S2, model, makeA1=False, verbosity=False):
    """Fit ``model`` once per label column and collect its scores.

    For every column of ``C1`` that contains at least one positive value the
    model is re-fitted on ``(S1, C1[:, j])`` and scored on ``S2`` (and on ``S1``
    when ``makeA1`` is set). Columns without positives are left at zero.

    Models exposing ``predict_proba`` are scored with the second probability
    column; all other models are scored with ``predict``.

    Parameters
    ----------
    S1 : training features.
    C1 : 2-D label-indicator array, one column per label.
    S2 : features to score.
    model : estimator with ``fit`` and ``predict`` (optionally ``predict_proba``).
    makeA1 : also return the scores on the training features.
    verbosity : print the column index after each fit.

    Returns
    -------
    Array of shape ``(S2.shape[0], C1.shape[1])``, or a tuple
    ``(scores_on_S2, scores_on_S1)`` when ``makeA1`` is true.
    """
    n_labels = C1.shape[1]
    scores_new = np.zeros((S2.shape[0], n_labels))
    scores_train = np.zeros((S1.shape[0], n_labels)) if makeA1 else None

    # Choose the scoring call once, up front, instead of duplicating the loop.
    if hasattr(model, 'predict_proba'):
        def score(features):
            return model.predict_proba(features)[:, 1]
    else:
        def score(features):
            return model.predict(features)

    for col in range(n_labels):
        if not np.any(C1[:, col] > 0):
            continue  # no positive example for this label: keep the zeros
        model.fit(S1, C1[:, col])
        scores_new[:, col] = score(S2)
        if makeA1:
            scores_train[:, col] = score(S1)
        if verbosity:
            print(col)

    return (scores_new, scores_train) if makeA1 else scores_new

In [38]:
# knn
def myknn(S, C, S2):
    """Score every row of ``S2`` against the rows of ``S`` and aggregate labels from the best matches.

    For each row ``i`` of ``S2`` the product ``S * S2[i, :].T`` gives one score per
    row of ``S``; the rows of ``S`` are then ranked by descending score.

    NOTE(review): assumes ``S`` and ``S2`` are scipy sparse matrices (``.todense()`` is
    called on their product, and ``*`` is relied on to be a matrix product) and that
    ``C`` is a dense 2-D array with one row per row of ``S`` -- confirm against callers.
    Indexing ranks 0, 1 and 2 presumably requires ``S`` to have at least three rows.

    Returns
    -------
    Aknn, Aknn1, Aknn2, Aknn3 : arrays of shape ``(S2.shape[0], C.shape[1])``
        ``Aknn``  -- score-weighted sum of the ``C`` rows of the 50 top-ranked rows of ``S``.
        ``Aknn1`` -- score-weighted ``C`` row of the top-ranked row.
        ``Aknn2`` -- score-weighted ``C`` row of the second-ranked row.
        ``Aknn3`` -- score-weighted ``C`` row of the third-ranked row.
    """
    Aknn  = np.zeros((S2.shape[0], C.shape[1]))
    Aknn1 = np.zeros((S2.shape[0], C.shape[1]))
    Aknn2 = np.zeros((S2.shape[0], C.shape[1]))
    Aknn3 = np.zeros((S2.shape[0], C.shape[1]))
    for i in range(S2.shape[0]):
        # one score per row of S for query row i, as a dense column
        r = (S * S2[i, :].T).todense()
        # row indices of S ordered from highest to lowest score
        indexessort = np.argsort(-np.array(r)[:, 0])
        Aknn[i, :]  = r[indexessort[0:50], :].T * C[indexessort[0:50], :]  # scores of the top 50 rows times their label rows
        Aknn1[i, :] = r[indexessort[0], :].T * C[indexessort[0], :]  # best match only
        Aknn2[i, :] = r[indexessort[1], :].T * C[indexessort[1], :]  # second-best match only
        Aknn3[i, :] = r[indexessort[2], :].T * C[indexessort[2], :]  # third-best match only
        # progress report every 1000 query rows
        if i % 1000 == 0:
            print(str(i)+' nn')
    return Aknn, Aknn1, Aknn2, Aknn3

In [39]:
# MAIN CODE
import sys
# NOTE(review): wildcard import; loadfile() uses csc_matrix from it.
from scipy.sparse import *

print('the training set: loading')
S, C = loadfile(os.path.join(DATADIR, 'wise2014-train.libsvm'), True)
C = C.todense()
C = np.array(C)  # matrix -> array

# C is the label-indicator (y multilabel) array parsed from the libsvm file
# S is the sparse feature matrix parsed from the same file (presumably TF-IDF -- confirm)

the training set: loading
10000 lines
20000 lines
30000 lines
40000 lines
Value Error:  47368
50000 lines
60000 lines


In [40]:
S.shape

(64856, 301561)

C= y_train
S= x_train

In [41]:
x_train.shape

(73871, 136452)

In [46]:
print('knn - with validation set')
Aknnsmall, Aknn1small, Aknn2small, Aknn3small = myknn(x_train, y_train, x_dev)

knn - with validation set
0 nn
1000 nn
2000 nn
3000 nn
4000 nn
5000 nn
6000 nn
7000 nn
8000 nn
9000 nn


In [43]:
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [44]:
from sklearn import linear_model

# L1-penalised logistic regression at three regularisation strengths, scored on the dev set.
# solver='liblinear' is stated explicitly: the L1 penalty is not supported by the default
# solver of recent scikit-learn releases (lbfgs), and liblinear was the default solver in
# the older releases, so results there are unchanged.
print('logistic regression - with validation set')
AL1small = modelfit(x_train, y_train, x_dev, linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=2.0, tol=0.001), makeA1=False, verbosity=False)
AL2small = modelfit(x_train, y_train, x_dev, linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=6.0, tol=0.001), makeA1=False, verbosity=False)
AL3small = modelfit(x_train, y_train, x_dev, linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=10.0, tol=0.001), makeA1=False, verbosity=False)

logistic regression - the validation set (the last articles in the training set)


In [45]:
print('ridge regression - with validation set')
AR1small = modelfit(x_train, y_train, x_dev, linear_model.Ridge(alpha=0.4), makeA1=False, verbosity=False)
AR2small = modelfit(x_train, y_train, x_dev, linear_model.Ridge(alpha=0.8), makeA1=False, verbosity=False)
AR3small = modelfit(x_train, y_train, x_dev, linear_model.Ridge(alpha=1.2), makeA1=False, verbosity=False)

ridge regression - with validation set


In [None]:
# Score the held-out test split (x_test) with the same models that were scored on the dev set.

print('knn - the test set')
Aknn, Aknn1, Aknn2, Aknn3 = myknn(x_train, y_train, x_test)

# solver='liblinear' is stated explicitly: the L1 penalty is not supported by the default
# solver of recent scikit-learn releases (lbfgs), and liblinear was the default solver in
# the older releases, so results there are unchanged.
print('logistic regression - the test set')
AL1 = modelfit(x_train, y_train, x_test, linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=2.0, tol=0.001), makeA1=False, verbosity=False)
AL2 = modelfit(x_train, y_train, x_test, linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=6.0, tol=0.001), makeA1=False, verbosity=False)
AL3 = modelfit(x_train, y_train, x_test, linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=10.0, tol=0.001), makeA1=False, verbosity=False)

In [None]:
print('ridge regression - the test set')
AR1 = modelfit(x_train, y_train, x_test, linear_model.Ridge(alpha=0.4), makeA1=False, verbosity=False)
AR2 = modelfit(x_train, y_train, x_test, linear_model.Ridge(alpha=0.8), makeA1=False, verbosity=False)
AR3 = modelfit(x_train, y_train, x_test, linear_model.Ridge(alpha=1.2), makeA1=False, verbosity=False)

ridge regression - the test set


In [None]:
# Stack the base models: for every taxon, fit a ridge regression on the dev-set scores of
# all base models (the *small arrays) against the dev-set labels, then apply it to the
# corresponding test-set scores.
print('linear combinations...')
model = linear_model.Ridge(alpha=2.0)
AN = 0.65*AL2 + 0.35*AR2 # default blend, kept for taxons with no positive dev example

# The *small arrays were produced by scoring x_dev, so the stacking targets are y_dev.
# (The previous C[m:, j] used an undefined `m` and the labels of the libsvm dataset.)
for j in range(y_dev.shape[1]):
    if np.sum(y_dev[:, j]) > 0:
        Xsmall = np.vstack([AL1small[:, j], AL2small[:, j], AL3small[:, j],
                            AR1small[:, j], AR2small[:, j], AR3small[:, j],
                            Aknn1small[:, j], Aknn2small[:, j], Aknn3small[:, j],
                            Aknnsmall[:, j] ]).T
        X = np.vstack([AL1[:, j], AL2[:, j], AL3[:, j],
                            AR1[:, j], AR2[:, j], AR3[:, j],
                            Aknn1[:, j], Aknn2[:, j], Aknn3[:, j], Aknn[:, j] ]).T
        model.fit(Xsmall, y_dev[:, j])
        AN[:, j] = model.predict(X)

# Decision rule: scale each row by its (slightly padded) maximum score and keep every
# taxon whose scaled score exceeds 0.55.
print('decision rule')
maxes = np.max(AN, axis=1)+0.000001
A = AN / maxes[:, np.newaxis]
A = 0 + (A > 0.55)

In [None]:
# Write one line per row of A: an article id followed by the space-separated 1-based
# indices of the columns that are non-zero in that row.
print('solution - nonzero elements')
# NOTE(review): the output path is a hard-coded absolute Windows path and the article ids
# start at a fixed offset of 64858 -- confirm both are still intended for this dataset.
# Text mode is required because str objects are written; newline='\n' keeps the line
# endings exactly as written on every platform. `with` closes the file even on error.
with open('D:\\Competitions\\GreekMedia\\big_solution2.csv', 'w', newline='\n') as ofile:
    ofile.write('ArticleId,Labels\n')
    for i in range(A.shape[0]):
        Aclasses = np.nonzero(A[i, :])[0] + 1
        strtowrite = ' '.join(['%g' % num for num in Aclasses])
        ofile.write(str(64858+i) + ',' + strtowrite + '\n')

### Evaluate model

#### Training metrics

In [None]:
# NOTE(review): metax_train, titlex_train and descx_train are not defined anywhere in the
# visible notebook, and the most recent visible assignment to `model` is a single-input
# Ridge regressor -- presumably this cell was carried over from a multi-input model
# notebook; confirm which model and inputs are intended before running it.
y_prob = model.predict([metax_train, titlex_train, descx_train, x_train])

In [None]:
y_prob.shape

In [None]:
y_pred = y_prob.copy()
y_pred[y_pred>=P_THRESHOLD] = 1
y_pred[y_pred<P_THRESHOLD] = 0

In [None]:
f1_score(y_train, y_pred, average='micro')

In [None]:
#average= None, the scores for each class are returned.
precision_recall_fscore_support(y_train, y_pred, average=None, sample_weight=None)

In [None]:
a = precision_recall_fscore_support(y_train, y_pred, average=None, sample_weight=None)

In [None]:
pd.DataFrame(list(a))

In [None]:
f1_byclass = pd.DataFrame((a)[2], columns=['f1'])


In [None]:
support_byclass = pd.DataFrame((a)[3], columns=['support'])

f1_byclass = pd.merge(
    left=f1_byclass, 
    right=support_byclass, 
    left_index=True,
    right_index=True,
    how='outer', 
    validate='one_to_one'
)

f1_byclass['index_col'] = f1_byclass.index

In [None]:
# labels_index is keyed by the 1-based taxon codes (cat.codes + 1), whereas index_col holds
# the 0-based column position of each class in the label array. Shift by one before the
# lookup so that every class gets its own taxon name rather than its neighbour's.
f1_byclass['level2taxon'] = (f1_byclass['index_col'] + 1).map(labels_index)

In [None]:
print("At p_threshold of {}, there were {} out of {} ({})% taxons with auto-tagged content in the training data"
      .format(P_THRESHOLD, 
              f1_byclass.loc[f1_byclass['f1'] > 0].shape[0], 
              y_pred.shape[1], 
              (f1_byclass.loc[f1_byclass['f1'] > 0].shape[0]/y_pred.shape[1])*100 ))

In [None]:
no_auto_content = f1_byclass.loc[f1_byclass['f1'] == 0]
no_auto_content = no_auto_content.set_index('level2taxon')

In [None]:
no_auto_content['support'].sort_values().plot( kind = 'barh', figsize=(20, 20))

In [None]:
classes_predictedto = f1_byclass.loc[f1_byclass['f1'] > 0]
classes_predictedto = classes_predictedto.set_index('level2taxon') 

In [None]:
classes_predictedto.plot.scatter(x='support', y='f1', figsize=(20, 10), xticks=np.arange(0, 9700, 100))

In [None]:
classes_predictedto['f1'].sort_values().plot( kind = 'barh', figsize=(20, 20))

In [None]:
#Calculate globally by counting the total true positives, false negatives and false positives.
precision_recall_fscore_support(y_train, y_pred, average='micro', sample_weight=None) 

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_train, y_pred, average='macro', sample_weight=None)

#### Development set metrics

In [None]:
y_pred_dev = model.predict([metax_dev, titlex_dev, descx_dev, x_dev])

In [None]:
y_pred_dev[y_pred_dev>=P_THRESHOLD] = 1
y_pred_dev[y_pred_dev<P_THRESHOLD] = 0

In [None]:
#average= None, the scores for each class are returned.
precision_recall_fscore_support(y_dev, y_pred_dev, average=None, sample_weight=None)

In [None]:
#Calculate globally by counting the total true positives, false negatives and false positives.
precision_recall_fscore_support(y_dev, y_pred_dev, average='micro', sample_weight=None) 

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_dev, y_pred_dev, average='macro', sample_weight=None)

## Tag unlabelled content

In [None]:
def get_predictions(new_texts, df, level1taxon=False):
    """Predict taxon probabilities for new texts and join them to the content metadata.

    The texts are tokenised with the notebook-level ``tokenizer``, padded to
    ``MAX_SEQUENCE_LENGTH`` and passed to the notebook-level ``model``. The
    predictions are reshaped to long format (one row per content item and taxon),
    labelled with taxon names from ``labels_index`` and merged with descriptive
    columns taken from ``df``.

    Parameters
    ----------
    new_texts : iterable of str
        Texts to score, in the same row order as ``df``.
    df : pandas.DataFrame
        Source frame of ``new_texts``; must contain ``base_path``, ``content_id``,
        ``title``, ``description``, ``document_type``, ``publishing_app`` and
        ``locale`` (plus ``level1taxon`` when ``level1taxon`` is true).
    level1taxon : bool, default False
        Whether to carry the ``level1taxon`` column through to the output.

    Returns
    -------
    pandas.DataFrame
        Long-format predictions with the metadata columns, ``level2taxon_code``
        (1-based, matching the keys of ``labels_index``), ``probability`` and
        ``level2taxon``. No probability threshold is applied; filter downstream
        (e.g. ``probability > P_THRESHOLD``).
    """
    # process data for model input: one integer sequence per input text
    new_sequences = tokenizer.texts_to_sequences(new_texts)

    new_word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(new_word_index))

    x_new = pad_sequences(new_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    print('Shape of untagged tensor:', x_new.shape)

    # predict tags for the untagged data and keep the row position for the later merge
    y_pred_new = pd.DataFrame(model.predict(x_new))
    y_pred_new['index_col'] = y_pred_new.index

    # make long by taxon so it is easier to filter rows and examine the effect of a threshold
    y_pred_new = pd.melt(y_pred_new, id_vars=['index_col'],
                         var_name='level2taxon_code', value_name='probability')

    # After the melt, level2taxon_code holds the 0-based output column position, while
    # labels_index is keyed by 1-based codes (cat.codes + 1). Shift by one so that each
    # probability is labelled with its own taxon rather than the neighbouring one.
    y_pred_new['level2taxon_code'] = y_pred_new['level2taxon_code'].astype(int) + 1
    y_pred_new['level2taxon'] = y_pred_new['level2taxon_code'].map(labels_index)

    # descriptive columns about the content
    info_cols = ['base_path', 'content_id', 'title', 'description',
                 'document_type', 'publishing_app', 'locale']
    if level1taxon:
        info_cols.append('level1taxon')

    # The merge below is positional (prediction row k belongs to the k-th text), so
    # give the metadata a clean 0..n-1 index even if df arrives filtered or re-ordered.
    new_info = df[info_cols].reset_index(drop=True)

    # merge content info with taxon predictions
    pred_new = pd.merge(
        left=new_info,
        right=y_pred_new,
        left_index=True,
        right_on='index_col',
        how='outer'
    )

    # drop the helper column that was only needed for merging
    pred_new = pred_new.drop(['index_col'], axis=1)

    return pred_new
    

### Untagged

In [None]:
#read in untagged content
untagged_raw = pd.read_csv(os.path.join(DATADIR, 'untagged_content.csv.gz'), dtype=object, compression='gzip')

In [None]:
untagged_raw.head()

In [None]:
new_texts = untagged_raw['combined_text']

In [None]:
pred_untagged = get_predictions(new_texts, untagged_raw)

In [None]:
#data is long by taxon
print('Number of unique content items: {}'.format(pred_untagged.content_id.nunique()))
print('Number of content items tagged to taxons with more than p_threshold: {}'.format(pred_untagged.shape))

In [None]:
pred_untagged.loc[(pred_untagged['probability'] > 0.65) & (pred_untagged['probability'] < 0.85)].sort_values(by='probability', ascending=False)

In [None]:
#write to csv
pred_untagged.to_csv(os.path.join(DATADIR, 'predictions_for_untagged_data_trainingdatatok.csv'), index=False)

In [None]:
# apply tokenizer to our text data
tokenizer.fit_on_texts(new_texts)

pred_untagged_refit_tok = get_predictions(new_texts, untagged_raw)


In [None]:
#write to csv
pred_untagged_refit_tok.to_csv(os.path.join(DATADIR, 'predictions_for_untagged_data_refittok.csv'), index=False)

### New data (untagged + old taxons)

old_taxons data has no combined text. This needs fixing in the data pipeline before being able to use these data for predictions

In [None]:
#read in untagged content
new_raw = pd.read_csv(os.path.join(DATADIR, 'new_content.csv'), dtype=object)

In [None]:
new_raw.shape

In [None]:
type(new_raw['combined_text'][0])

In [None]:
new_raw['combined_text'][0]

In [None]:
len(new_raw[new_raw['combined_text'].isna()])

In [None]:
(new_raw.loc[(new_raw['combined_text'].isna()) & (new_raw['untagged_type'] == 'old_taxons')]).shape

In [None]:
new_raw[new_raw.untagged_type == 'old_taxons']

In [None]:
#make a copy so you can edit data without needed to read in each time
new_df = new_raw.copy(deep=True)

In [None]:
# get_predictions needs both the texts and the frame they came from; calling it with the
# frame alone raises a TypeError for the missing `df` argument.
# Rows without combined_text cannot be tokenised, so they are left out here. The index is
# reset because get_predictions joins predictions to metadata by row position.
new_df_with_text = new_df.dropna(subset=['combined_text']).reset_index(drop=True)
pred_new = get_predictions(new_df_with_text['combined_text'], new_df_with_text)

In [None]:
#keep only rows where prob of taxon > 0.5
pred_new = pred_new.loc[pred_new['probability'] > 0.5]

In [None]:
#write to csv
pred_new.to_csv(os.path.join(DATADIR, 'predictions_for_new_data.csv'), index=False)

### Labelled at level1only

In [None]:
labelled_level1 = pd.read_csv(os.path.join(DATADIR, 'labelled_level1.csv'), dtype=object)

In [None]:
level1_texts = labelled_level1['combined_text']

In [None]:
# Re-fit the tokenizer on the training data texts.
# NOTE(review): an earlier cell already called tokenizer.fit_on_texts(new_texts). If
# `tokenizer` is a Keras Tokenizer, fit_on_texts presumably adds to the existing vocabulary
# rather than resetting it -- confirm, and build a fresh tokenizer here if a true reset to
# the training vocabulary is intended.
tokenizer.fit_on_texts(texts)

In [None]:
pred_labelled_level1 = get_predictions(level1_texts, labelled_level1, level1taxon=True)

In [None]:
pred_labelled_level1.sort_values(by='probability', ascending=False)

In [None]:
#write to csv
pred_labelled_level1.to_csv(os.path.join(DATADIR, 'predictions_for_level1only.csv'), index=False)

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
from keras.utils import plot_model
plot_model(model, to_file='cnn.png', show_shapes=True)