In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_excel('./combined.xlsx')
df.head()


Unnamed: 0,info,id
0,286-69-1076,1
1,256-80-1815,1
2,398-21-8862,1
3,725-50-8395,1
4,882-45-5093,1


In [3]:
df = df.sample(frac = 1) #shuffle the data

In [4]:
df

Unnamed: 0,info,id
433052,BY561775B,5
73115,475-94-0575,1
231725,80DA09061,3
422774,BE687589P,5
515307,IOE-17-136-25320,6
...,...,...
117652,229437700155,2
296085,72DE97507,3
619453,239-600-830,7
132482,521019531858,2


In [5]:
df["info"] = df["info"].astype(str)

# Verify that the values in the column are now strings
print(df["info"].dtype)

object


In [6]:
y = pd.get_dummies(df['id'], prefix='label')
y

Unnamed: 0,label_1,label_2,label_3,label_4,label_5,label_6,label_7
433052,0,0,0,0,1,0,0
73115,1,0,0,0,0,0,0
231725,0,0,1,0,0,0,0
422774,0,0,0,0,1,0,0
515307,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...
117652,0,1,0,0,0,0,0
296085,0,0,1,0,0,0,0
619453,0,0,0,0,0,0,1
132482,0,1,0,0,0,0,0


In [7]:
label = list(y.columns)
label

['label_1', 'label_2', 'label_3', 'label_4', 'label_5', 'label_6', 'label_7']

In [8]:
y = y.to_dict('index')
y

{433052: {'label_1': 0,
  'label_2': 0,
  'label_3': 0,
  'label_4': 0,
  'label_5': 1,
  'label_6': 0,
  'label_7': 0},
 73115: {'label_1': 1,
  'label_2': 0,
  'label_3': 0,
  'label_4': 0,
  'label_5': 0,
  'label_6': 0,
  'label_7': 0},
 231725: {'label_1': 0,
  'label_2': 0,
  'label_3': 1,
  'label_4': 0,
  'label_5': 0,
  'label_6': 0,
  'label_7': 0},
 422774: {'label_1': 0,
  'label_2': 0,
  'label_3': 0,
  'label_4': 0,
  'label_5': 1,
  'label_6': 0,
  'label_7': 0},
 515307: {'label_1': 0,
  'label_2': 0,
  'label_3': 0,
  'label_4': 0,
  'label_5': 0,
  'label_6': 1,
  'label_7': 0},
 349243: {'label_1': 0,
  'label_2': 0,
  'label_3': 0,
  'label_4': 1,
  'label_5': 0,
  'label_6': 0,
  'label_7': 0},
 195171: {'label_1': 0,
  'label_2': 1,
  'label_3': 0,
  'label_4': 0,
  'label_5': 0,
  'label_6': 0,
  'label_7': 0},
 334202: {'label_1': 0,
  'label_2': 0,
  'label_3': 0,
  'label_4': 1,
  'label_5': 0,
  'label_6': 0,
  'label_7': 0},
 304839: {'label_1': 0,
  'label_

In [9]:
dataset = list(zip(df['info'],[{'cats': cats} for cats in y.values()]))
print(dataset[1])

('475-94-0575', {'cats': {'label_1': 1, 'label_2': 0, 'label_3': 0, 'label_4': 0, 'label_5': 0, 'label_6': 0, 'label_7': 0}})


In [10]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(dataset, train_size=0.8, random_state=13)

In [11]:
import spacy

nlp = spacy.blank("en")

textcat = nlp.add_pipe("textcat_multilabel")
for i in label:
    textcat.add_label(i)

In [12]:
textcat.labels

('label_1', 'label_2', 'label_3', 'label_4', 'label_5', 'label_6', 'label_7')

In [13]:
nlp.analyze_pipes()

{'summary': {'textcat_multilabel': {'assigns': ['doc.cats'],
   'requires': [],
   'scores': ['cats_score',
    'cats_score_desc',
    'cats_micro_p',
    'cats_micro_r',
    'cats_micro_f',
    'cats_macro_p',
    'cats_macro_r',
    'cats_macro_f',
    'cats_macro_auc',
    'cats_f_per_type'],
   'retokenizes': False}},
 'problems': {'textcat_multilabel': []},
 'attrs': {'doc.cats': {'assigns': ['textcat_multilabel'], 'requires': []}}}

In [14]:
optimizer = nlp.begin_training()
iterations = 2

In [15]:
from spacy.util import minibatch, compounding
from spacy.training import Example
with nlp.select_pipes(enable="textcat_multilabel"):
    for j in range(iterations):
        losses = {}
        k = 0
        batches = minibatch(train_data, size = compounding(4.,32.,1.001))
        for batch in batches:
            text, annotations = zip(*batch)
            example = []
            for i in range(len(text)):
                doc = nlp.make_doc(text[i])
                example.append(Example.from_dict(doc, annotations[i]))
            nlp.update(example, sgd=optimizer, drop=0.2, losses = losses)
            print('Batch No: {} Loss = {}'.format(k, round(losses['textcat_multilabel'])))
            k += 1
        print("\n\n Completed Iterations : {} ".format(j))

Batch No: 0 Loss = 0
Batch No: 1 Loss = 1
Batch No: 2 Loss = 1
Batch No: 3 Loss = 1
Batch No: 4 Loss = 2
Batch No: 5 Loss = 2
Batch No: 6 Loss = 2
Batch No: 7 Loss = 3
Batch No: 8 Loss = 3
Batch No: 9 Loss = 3
Batch No: 10 Loss = 4
Batch No: 11 Loss = 4
Batch No: 12 Loss = 4
Batch No: 13 Loss = 4
Batch No: 14 Loss = 5
Batch No: 15 Loss = 5
Batch No: 16 Loss = 5
Batch No: 17 Loss = 5
Batch No: 18 Loss = 6
Batch No: 19 Loss = 6
Batch No: 20 Loss = 6
Batch No: 21 Loss = 6
Batch No: 22 Loss = 7
Batch No: 23 Loss = 7
Batch No: 24 Loss = 7
Batch No: 25 Loss = 7
Batch No: 26 Loss = 7
Batch No: 27 Loss = 7
Batch No: 28 Loss = 8
Batch No: 29 Loss = 8
Batch No: 30 Loss = 8
Batch No: 31 Loss = 8
Batch No: 32 Loss = 9
Batch No: 33 Loss = 9
Batch No: 34 Loss = 9
Batch No: 35 Loss = 9
Batch No: 36 Loss = 9
Batch No: 37 Loss = 10
Batch No: 38 Loss = 10
Batch No: 39 Loss = 10
Batch No: 40 Loss = 10
Batch No: 41 Loss = 10
Batch No: 42 Loss = 10
Batch No: 43 Loss = 10
Batch No: 44 Loss = 11
Batch No: 45

In [30]:
nlp.to_disk('models/')

NameError: name 'convert' is not defined

In [17]:
test = []
for texts, annotation in test_data:
    doc = nlp.make_doc(texts)
    test.append(Example.from_dict(doc, annotation))   

In [18]:
score = nlp.evaluate(test)
score

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'cats_score': 0.9990752057028921,
 'cats_score_desc': 'macro AUC',
 'cats_micro_p': 0.9905853780492161,
 'cats_micro_r': 0.9905853780492161,
 'cats_micro_f': 0.9905853780492161,
 'cats_macro_p': 0.9911619548307494,
 'cats_macro_r': 0.9905734597834328,
 'cats_macro_f': 0.9905658396309277,
 'cats_macro_auc': 0.9990752057028921,
 'cats_f_per_type': {'label_1': {'p': 1.0, 'r': 1.0, 'f': 1.0},
  'label_2': {'p': 1.0, 'r': 0.9340142184840292, 'f': 0.965881439295884},
  'label_3': {'p': 1.0, 'r': 1.0, 'f': 1.0},
  'label_4': {'p': 0.938133683815246, 'r': 1.0, 'f': 0.9680794381206104},
  'label_5': {'p': 1.0, 'r': 1.0, 'f': 1.0},
  'label_6': {'p': 1.0, 'r': 1.0, 'f': 1.0},
  'label_7': {'p': 1.0, 'r': 1.0, 'f': 1.0}},
 'cats_auc_per_type': {'label_1': 1.0,
  'label_2': 0.9967494413403392,
  'label_3': 1.0,
  'label_4': 0.9967769985799049,
  'label_5': 1.0,
  'label_6': 1.0,
  'label_7': 1.0},
 'speed': 25283.469255483426}

In [19]:
import json
print(json.dumps(score['cats_auc_per_type'], indent=4, sort_keys=True))

{
    "label_1": 1.0,
    "label_2": 0.9967494413403392,
    "label_3": 1.0,
    "label_4": 0.9967769985799049,
    "label_5": 1.0,
    "label_6": 1.0,
    "label_7": 1.0
}


In [38]:
texts = "355-02-0560"
doc = nlp(texts)
doc.cats

{'label_1': 0.9998552799224854,
 'label_2': 0.0002842078101821244,
 'label_3': 9.921845776261762e-05,
 'label_4': 0.0006776155205443501,
 'label_5': 1.7679730035524699e-06,
 'label_6': 1.362837338092504e-06,
 'label_7': 2.2498386897495948e-05}

In [29]:
from spacy.cli.train import train as spacy_train
config_path = "spacy_textcat/config.cfg"
output_model_path = "output/spacy_textcat"
spacy_train(config_path, output_path=output_model_path,
    overrides={
        "paths.train": "train.spacy",
        "paths.dev": "valid.spacy",
    },
)


[38;5;1m✘ Config file not found[0m
spacy_textcat/config.cfg



SystemExit: 1