Skip to content

Textlabelling is a tool built on spaCy for labelling text for NER and training NER models with deep learning.

License

MIT, MIT licenses found

Licenses found

MIT
LICENSE
MIT
LICENSE.txt
Notifications You must be signed in to change notification settings

aakinlalu/textlabelling

Repository files navigation

%load_ext autoreload
%autoreload 2
from src.textlabelling.dataPrep import DataPrep
from src.textlabelling.labelling import Labelling
from src.textlabelling.nerstats import NerStats
from src.textlabelling.csvmodel import CSVModel
from src.textlabelling.tainner import TrainNer, save_model, evaluate
from src.textlabelling.dbconnect import Connect
from src.textlabelling.model import Model

import random

random.seed(0)
!pwd

1. Labelling Process

i. Data Labelling Container

# Container for labelled examples; main() appends one result per labelled text.
TRAIN_DATA = list()

ii. Prepare the dataset for Labelling

# Build a DataPrep helper around the raw CSV file.
Data_prep = DataPrep('xxxxxxxx.csv')
# split_data() returns two values, unpacked here as the train and test portions.
train, test = Data_prep.split_data()
# Notebook-style bare expression: displays the filename attribute of Data_prep.
Data_prep.filename
# IPython shell escape: print the first six lines of the CSV file.
!head -n 6 xxxxxxxx.csv

iii. Label data for Training

# Iterator over the training texts; main() pulls one text per call via next().
trainset = DataPrep.text_generator(train)
# Both running counters start at zero.
number = n_counter = 0
def main():
    """Label the next training text and store the result in ``TRAIN_DATA``.

    Pulls the next text from ``trainset``, lower-cases and prints it, prints
    each item from ``Labelling.token_to_tuple()``, then appends the result of
    ``text_entities_construct()`` to ``TRAIN_DATA``.

    Returns:
        The counter value returned by ``text_entities_construct()``.

    Raises:
        StopIteration: when ``trainset`` has no more texts.
    """
    text = next(trainset).lower()
    print('')
    print(text)
    print('')
    # ``Labelling`` is the name imported at the top of the file; the previous
    # ``nerlabelling.Labelling`` referenced a module that is never imported.
    labeller = Labelling(text)
    for item in labeller.token_to_tuple():
        print(item)

    result, counter = labeller.text_entities_construct()

    TRAIN_DATA.append(result)
    return counter
# Label one text and update the running counters:
#   number    - incremented once per run of this block
#   n_counter - running sum of the counters returned by main()
counter = main()
number = number + 1
n_counter = n_counter + counter
# Render both counters as a small HTML table. The last cell is now closed
# with </td>; it was previously "closed" with a second opening <td>.
# NOTE(review): no import of `display` is visible here; this expects
# `display` to be the IPython.display module - confirm.
display.HTML("<table><tr><th><H1><center>Counter:</center></H1></th></tr><tr><th>Global</th><th>Internal</th></tr><tr><td><H1>"
             +str(number)+
             "</H1></td><td><H1>"
             +str(n_counter)+
             "</H1></td></tr></table>")

iv. Save Labelled data

# Persist the labelled examples collected so far.
Stats = NerStats()
Stats.save_labelled_data(data=TRAIN_DATA, file_name='labelled_data/labelleddata.pkl')
# `data` was previously used here before any assignment (NameError on a fresh
# run); compute the distribution of the data just saved and visualize that.
data = Stats.data_distribution(TRAIN_DATA)
Stats.distribution_visualizer(data)

v. Load Train data

# Load the file written by save_labelled_data(); the file name was previously
# misspelled as 'labelleddat.pkl', which does not match the saved file.
TRAIN_DATA2 = Stats.load_labelled_data('labelled_data/labelleddata.pkl')

vi. Distribution of Labelled data

# Compute the distribution summary of the loaded training data and show its items.
data = Stats.data_distribution(TRAIN_DATA2)
data.items()
#### vii. Fix Training data
# Print the single example at position 1236 for inspection.
for idx, value in enumerate(TRAIN_DATA2):
    if idx != 1236:
        continue
    print(idx, value)
TRAIN_DATA2[114] = ('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', {'entities': [(0, 29, 'Payments'), (116, 128, 'Payments')]})

2. Training Process

i. Train Model

# Name later passed to save_model() for the trained pipeline.
# NOTE(review): the prediction step loads 'model_v1' - confirm whether
# 'model_vi' is a typo.
model = 'model_vi'
n_accuracy = dict()
# Train on the loaded labelled data with n_iter=150.
Trainer = TrainNer(TRAIN_DATA2, n_iter=150)
nlp, losses = Trainer.train_model()
#n_accuracy[model]=100-losses['ner']

ii. Save Model

# Notebook-style bare expression: shows the type of the trained object.
type(nlp)
# Save the trained object under the name held in `model`.
# NOTE(review): called on the NerStats class rather than the `Stats` instance,
# and a separate `save_model` is imported from tainner - confirm which is intended.
NerStats.save_model(nlp, model)
# Display the accuracy dict; it stays empty while the assignment in the
# training step remains commented out.
n_accuracy

3. Evaluate

# Load the labelled test set from its .pkl file and evaluate the trained
# model on it.
TEST_DATA = Stats.load_labelled_data('labelled_data/labelledtest.pkl')
evaluate(nlp, TEST_DATA)

4. TEST MODEL for PREDICTION

# SQL query template passed to Model. The stray trailing quote after `table`
# was removed: it left an unterminated string literal in the SQL statement.
template = """
 select created_date, reference_ticket, nps_verbatim,nps_score from table
"""
# NOTE: this rebinds the name `Model` from the imported class to an instance,
# so this statement cannot be re-run without re-importing the class.
Model = Model('model_v1', template, 'config/config.ini','Redshift_prod')

i. Create table if it exists

# Call create_table() with the target table name 'public.nps_table'.
Model.create_table('public.nps_table')

ii. Update template

# Assign the current `template` string to the model's `template` attribute
# (re-run this after editing `template`).
Model.template = template

iii. Insert the dataset into a table (in Redshift, for instance)

# Call insert_to_redshift() on the model object; it takes no arguments here.
Model.insert_to_redshift()

About

Textlabelling is a tool built on spaCy for labelling text for NER and training NER models with deep learning.

Topics

Resources

License

MIT, MIT licenses found

Licenses found

MIT
LICENSE
MIT
LICENSE.txt

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published