In [None]:
!nvidia-smi

Fri May 21 01:59:11 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive

drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
%cd /content/gdrive/MyDrive/custom-EM-BERT/prof_entity

/content/gdrive/MyDrive/custom-EM-BERT/prof_entity


In [1]:
pip install transformers

# Setup

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             classification_report)

# utils python file for custom data loading
import utils
from utils import load_data, create_sample

# make use of the transformers library from hugging face
# auto models and auto tokenizer allow for easy swap between models
# supports various models like BERT, GPT, ....
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer

In [None]:
# make use of colab's gpu
use_gpu = True
if use_gpu:
    device = 'cuda'
else:
    device = 'cpu'

# for installing cuda-enabled torch version
# pip uninstall torch
# pip cache purge
# pip install torch -f https://download.pytorch.org/whl/torch_stable.html

# DataLoader

In [None]:
# Load data
df, is_entity, not_entity, signal_present = load_data(
    data_fp='prof_entity/data',
    file_path_or_ext='xlsx',
    explore=False,
    sheet_name='News Articles'
)

# Create positive and negative samples
# samples are the the prior 3 sentences, and the current sentence
# ['prior 3 sentences', 'current sentence']
positive_samples = create_sample(df, is_entity, 3)
negative_samples = create_sample(df, not_entity, 3)

# ## Train-test split our data
positive_labels = [1] * len(positive_samples)
negative_labels = [0] * len(negative_samples)
train_labels = positive_labels + negative_labels
train_data = positive_samples + negative_samples
assert len(train_data) == len(train_labels)

train_seq, val_seq, train_labels, val_labels = train_test_split(
    train_data, train_labels, shuffle=True, test_size=0.2
)

# choose the model to use here
model_name = 'bert-base-uncased'

# Create pre-trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
print(positive_samples[0])

['MeMe Live was founded in 2016 to enable users to watch live broadcasting anytime, anywhere and engage with the audience via live sessions The platform gives a chance to live stream the performances, broadcast what you are good at and let people enjoy based on the core values of integrity, innovation, pro-activeness and openness through a mobile broadcasting app Since its India launch, MeMe Live has been available on iOS as well as Android operating systems', 'It was later acquired by 17LIVE Group, the operator of Japan’s No.1 live-streaming platform, which claimed that the deal would consolidate the global live-streaming industry and expand into new markets upon the integration of the two companies’ platform resources, content creators, and users']


In [None]:
signal_present['entity (WIP)'].value_counts()

0.0    2633
1.0    1188
Name: entity (WIP), dtype: int64

In [None]:
print(len(not_entity))

2633


In [None]:
# Use a dataloader to manage and optimize the data-in for training
class EntityDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


# Encode data
train_batch_first, train_batch_second = zip(*train_seq)
train_encodings = tokenizer(
    train_batch_first,
    train_batch_second,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

val_batch_first, val_batch_second = zip(*val_seq)
val_encodings = tokenizer(
    val_batch_first,
    val_batch_second,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

train_dataset = EntityDataset(train_encodings, train_labels)
val_dataset = EntityDataset(val_encodings, val_labels)

# ## Note: See Transformers datasets for instructions on how to local massive
# datasets from local files.

In [None]:
print(train_batch_first[0])

These insights enable providers to generate top-line revenue by identifying policies with potential for upsell or cross-sell, improve retention rates, reassign orphan policies, and optimize their books of business by generating real-time visibility of their risk portfolio Using Atidot, insurance executives are able to get a better understanding of current lapse rates and surrenders, reconfigure pricing and product bundling, and objectively and scientifically determine the accurate reserves and capital requirements for the company “We are honored to be included as one of the world’s most innovative insurtech companies and to be recognized by the industry for our solutions,” said Dror Katzav, CEO and Cofounder of Atidot


In [None]:
print(train_batch_second[0])

“We strive to make it easy for insurers to start generating insights, improve efficiency, and more effectively target existing and potential policyholders with new products and services - driving more revenue.”  Selected from over 1,000 companies by analysts and industry experts at FinTech Global, the finalists were recognized for their innovative use of technology to solve a significant industry problem, or to generate cost savings or efficiency improvements across the insurance value chain


# Training

In [None]:
# Perform training
# ## Additional metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='models/',
    do_train=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.001,
    logging_dir='train/logs',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # fp16=True,
    # sharded_ddp='zero_dp_2'
)

# Set up trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Train
trainer.train()

# Evaluate trained model
trainer.evaluate()

  


Step,Training Loss
500,0.5197
1000,0.3131
1500,0.1413
2000,0.0555
2500,0.0155
3000,0.0082
3500,0.0031


{'epoch': 10.0,
 'eval_loss': 1.3649530410766602,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': -87040,
 'eval_mem_gpu_peaked_delta': 679710720,
 'eval_runtime': 51.1988,
 'eval_samples_per_second': 14.922}

In [None]:
trainer.save_model(output_dir='/content/gdrive/MyDrive/custom-EM-BERT/prof_entity/models')

# Results

In [None]:
# Check classification report
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions[0], axis=1)
actuals = val_labels

print(classification_report(actuals, preds))

  


              precision    recall  f1-score   support

           0       0.90      0.89      0.89       551
           1       0.72      0.73      0.73       213

    accuracy                           0.85       764
   macro avg       0.81      0.81      0.81       764
weighted avg       0.85      0.85      0.85       764

