<a href="https://colab.research.google.com/github/anitha67/100DaysofMLCode/blob/master/textclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 31.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [2]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tokenizers
import torch
import torch.nn as nn
import transformers
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
# Only ERROR and above from the root logger reach the cell output.
logging.basicConfig(level=logging.ERROR)

# A plain Python name called CUDA_LAUNCH_BLOCKING is never seen by CUDA; the
# setting has to be an environment variable. The name is kept for backward
# compatibility and exported here, before any CUDA work is started, so that
# kernel launches run synchronously and device errors point at the real call.
CUDA_LAUNCH_BLOCKING = 1
os.environ["CUDA_LAUNCH_BLOCKING"] = str(CUDA_LAUNCH_BLOCKING)

In [3]:
# Colab-only step: attach Google Drive at /content/gdrive, the root under
# which the BERT_PATH and MODEL_PATH settings of this notebook live.
from google.colab import drive

drive.mount(mountpoint="/content/gdrive")

Mounted at /content/gdrive


In [4]:
class config:
    """Static settings shared by the dataset, model and inference cells.

    The tokenizer is loaded from ``BERT_PATH`` while this class body executes,
    so that path must already be readable when the cell runs.
    """

    # Device string turned into a ``torch.device`` by ``run``.
    DEVICE = "cpu"
    # Token length every text is padded/truncated to by ``BERTDataset``.
    MAX_LEN = 128
    # Location passed to ``from_pretrained`` for both the tokenizer and the model.
    BERT_PATH = "/content/gdrive/My Drive/Colab Notebooks/test_data/bertbaseuncased"
    # Classifier state dict read with ``torch.load`` in ``run``.
    MODEL_PATH = "/content/gdrive/My Drive/Colab Notebooks/test_data/bertbaseuncased/results/model_0.bin"
    # ``truncation`` is an option of the encoding call (``BERTDataset`` passes
    # it to ``encode_plus``), not of the tokenizer constructor, so it is not
    # given here where it would have no effect on encoding.
    TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

In [5]:
class BERTDataset:
    """Indexable collection of texts that yields BERT-ready tensors.

    Each item is a dict with the keys ``"ids"``, ``"mask"`` and
    ``"token_type_ids"``, all ``torch.long`` tensors built from the
    tokenizer's ``encode_plus`` output.
    """

    def __init__(self, text):
        """Store the texts; tokenizer and length limit come from ``config``."""
        self.max_len = config.MAX_LEN
        self.tokenizer = config.TOKENIZER
        self.text = text

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the text at position ``item`` and return its tensors."""
        # Stringify the entry and collapse every whitespace run to one space.
        normalized = " ".join(str(self.text[item]).split())

        encoded = self.tokenizer.encode_plus(
            normalized,
            None,  # no second sequence
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        # (output key, tokenizer key) pairs, in the order they are returned.
        field_map = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        return {
            out_key: torch.tensor(encoded[source_key], dtype=torch.long)
            for out_key, source_key in field_map
        }

In [6]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a two-output linear layer."""

    def __init__(self):
        """Load the encoder from ``config.BERT_PATH`` and build the head."""
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        # Take the input width from the loaded encoder instead of assuming
        # 768, so the head always matches whatever checkpoint BERT_PATH holds.
        self.out = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, ids, mask, token_type_ids):
        """Return the two-column output of the linear layer for a batch.

        Args:
            ids: Token ids, passed to the encoder as its first argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Passed to the encoder as ``token_type_ids``.
        """
        # return_dict=False makes the encoder return a tuple; only its second
        # element is used here, the first is discarded.
        _, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.out(self.bert_drop(pooled))

In [7]:
def inference(data_loader, model, device):
    """Run ``model`` over every batch and collect the predicted class indices.

    Args:
        data_loader: Sized iterable of batches; each batch is a mapping with
            the tensor entries ``"ids"``, ``"mask"`` and ``"token_type_ids"``.
        model: Module called with ``ids``, ``mask`` and ``token_type_ids``
            keyword arguments; its output is arg-maxed over dimension 1.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A flat list of ints, one predicted index per input row, in the order
        the loader produced them.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            # tolist() already yields plain Python ints; no numpy hop needed.
            predictions.extend(outputs.argmax(dim=1).tolist())
    return predictions

In [8]:
def run(dfx):
    """Label every note in ``dfx`` and write the result to ``output.csv``.

    Args:
        dfx: DataFrame with a ``note`` column. A ``label`` column holding
            "Quantitative" or "Qualitative" is added to it in place.

    Side effects:
        Writes ``output.csv`` (without the index) in the working directory
        and prints the first rows of the labelled frame.
    """
    labels = {
        0: "Quantitative",
        1: "Qualitative",
    }

    test_dataset = BERTDataset(text=dfx["note"].values)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=16, num_workers=4
    )

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    # map_location remaps the stored tensors onto the configured device, so a
    # checkpoint saved from a GPU run still loads on a CPU-only runtime.
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))
    model.to(device)

    outputs = inference(test_data_loader, model, device)
    # One label per row, in loader order (the loader does not shuffle). Using
    # only outputs[0] would stamp the first row's label on every row and
    # raise IndexError for an empty frame.
    dfx["label"] = [labels[int(prediction)] for prediction in outputs]
    dfx.to_csv("output.csv", index=False)
    print(dfx.head())

In [9]:
if __name__ == "__main__":
    # Smoke test: classify one hard-coded sample note end to end.
    lst = ["<p>URL input sent to PhishTank</p>"]
    dfx = pd.DataFrame({"note": lst})
    run(dfx)

  cpuset_checked))
Some weights of the model checkpoint at /content/gdrive/My Drive/Colab Notebooks/test_data/bertbaseuncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1/1 [00:00<00:00,  1.32it/s]

                                 note        label
0  <p>URL input sent to PhishTank</p>  Qualitative



