# BERT Text Classification Using Pytorch
## Classify any text using BERT provided by the Huggingface library
https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b
https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca

## Setting up the environment

* Install PyTorch https://pytorch.org/
    * In the anaconda prompt:
    
    ```conda install pytorch torchvision cudatoolkit=10.2 -c pytorch```
    ```pip3 install torchtext```


* Huggingface 'Transformers' library https://huggingface.co/transformers/


* Require Microsoft Visual C++ Redistributable from https://aka.ms/vs/16/release/vc_redist.x64.exe


## Importing Libraries

In [5]:
# https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

# Libraries

import matplotlib.pyplot as plt
import pandas as pd

import torch

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [39]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [40]:
# Model parameter

MAX_SEQ_LEN = 280
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

In [41]:
# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('text', text_field), ('label', label_field)]

In [48]:
# TabularDataset
source_folder="C\\Users\\cmcgu\\Documents\\Git Repos\\msc-dts-se-synoptic-project\\data\\"
train, valid, test = TabularDataset.splits(path=source_folder, train='hateval2019_en_train.csv', validation='hateval2019_en_dev.csv',
                                           test='hateval2019_en_test.csv', format='CSV', fields=fields, skip_header=True)

FileNotFoundError: [Errno 2] No such file or directory: 'C\\Users\\cmcgu\\Documents\\Git Repos\\msc-dts-se-synoptic-project\\data\\hateval2019_en_train.csv'

## Data Preparation

In [30]:
# https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca
    
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

train_df = pd.read_csv("data/train/hateval2019_en_train.csv")
train_df = train_df.drop(columns = ['TR', 'AG'])
train_df.head()

Unnamed: 0,id,text,HS
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1
1,202,Why would young fighting age men be the vast m...,1
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0
4,205,Orban in Brussels: European leaders are ignori...,0


In [31]:
dev_df = pd.read_csv("data/dev/hateval2019_en_dev.csv")
dev_df = dev_df.drop(columns = ['TR', 'AG'])
dev_df.head()

Unnamed: 0,id,text,HS
0,18201,I swear I’m getting to places just in the nick...,0
1,18202,I’m an immigrant — and Trump is right on immig...,0
2,18203,#IllegalImmigrants #IllegalAliens #ElectoralSy...,1
3,18204,@DRUDGE_REPORT We have our own invasion issues...,1
4,18205,Worker Charged With Sexually Molesting Eight C...,0


In [35]:
train_df = pd.DataFrame({
    'id':range(len(train_df)),
    'label':train_df['HS'],
    'alpha':['a']*train_df.shape[0],
    'text': train_df['text'].replace(r'\n', ' ', regex=True)
})

train_df.head()

Unnamed: 0,id,label,alpha,text
0,0,1,a,"Hurray, saving us $$$ in so many ways @potus @..."
1,1,1,a,Why would young fighting age men be the vast m...
2,2,1,a,@KamalaHarris Illegals Dump their Kids at the ...
3,3,0,a,NY Times: 'Nearly All White' States Pose 'an A...
4,4,0,a,Orban in Brussels: European leaders are ignori...


In [36]:
dev_df = pd.DataFrame({
    'id':range(len(dev_df)),
    'label':dev_df['HS'],
    'alpha':['a']*dev_df.shape[0],
    'text': dev_df['text'].replace(r'\n', ' ', regex=True)
})

dev_df.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,I swear I’m getting to places just in the nick...
1,1,0,a,I’m an immigrant — and Trump is right on immig...
2,2,1,a,#IllegalImmigrants #IllegalAliens #ElectoralSy...
3,3,1,a,@DRUDGE_REPORT We have our own invasion issues...
4,4,0,a,Worker Charged With Sexually Molesting Eight C...


In [37]:
train_df.to_csv('data/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv('data/dev.tsv', sep='\t', index=False, header=False)