<a href="https://colab.research.google.com/github/atherfawaz/BERT-Supervised/blob/master/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supervised Learning with BERT

**Question: Supervised Learning with BERT**

**The Astrological department believes that a person's astrological sign can be guessed from their behavior. An organization is collecting blog posts by different people from various sources. You have been tasked with building a deep learning model that uses these individuals' posts to predict which of the 12 star signs each individual belongs to. You also need to predict the gender of that person.**

In [1]:
# Google Drive mount, currently disabled.
# NOTE(review): the commented-out mount point below is '/content/gdrive', while the
# %cd that follows uses the '/content/drive' prefix — confirm the two agree before
# re-enabling these lines.
#from google.colab import drive
#drive.mount('/content/gdrive')
# Switch into the dataset directory; the cells below refer to files by relative name.
%cd /content/drive/My Drive/Ebryx/blogs_train

/content/drive/My Drive/Ebryx/blogs_train


In [None]:
# Unpack the blog archive, which is looked up relative to the current directory.
!tar -xvf  'blogs_train.tar.xz'
# Change to the dataset directory so the parsing cell can enumerate files with os.listdir().
%cd /content/drive/My Drive/Ebryx/blogs_train

# Dataset
Each file name contains the gender, age, occupation, and astrological sign of the blogger. For example, `4115891.male.24.Student.Leo.xml` is one file. A single file contains a set of blog posts separated by date. To illustrate, this is what a sample file looks like:

```
<Blog>
  <date>31,May,2004</date>
    <post>
      Well, everyone got up and going this morning.  It's still raining, but that's okay with me.  Sort of suits my mood.  I could easily have stayed home in bed with my book and the cats.  This has been a lot of rain though!..
    </post>
</Blog>
```



# Parsing the dataset
Parsing the dataset from the separate files into a pandas DataFrame for display and easy access. Some XML files contain encoding issues, and the problematic contents of those files have been replaced by random numbers. While this could impact the accuracy of the model later, the effect should be small.

In [None]:
import os
import pandas as pd
import numpy as np
import codecs
from bs4 import BeautifulSoup
from progressbar import ProgressBar
pbar = ProgressBar()

print('PARSING FILES....')

# Keep only XML files whose name has at least the four metadata fields
# (<gender>.<age>.<occupation>.<sign>) in front of the extension. os.listdir()
# returns everything in the working directory, and any other file name would
# break the negative-index metadata lookup below. Sorting makes the row order
# (and therefore the label codes assigned later) reproducible across runs.
FILES = sorted(
    name for name in os.listdir()
    if name.endswith('.xml') and len(name.split('.')) >= 5
)

posts_arr = []
sign_arr = []
gender_arr = []
age_arr = []
occupation_arr = []

for to_fetch in pbar(FILES):
    # Metadata lives in the file name: split once and take the four fields
    # that precede the extension.
    gender, age, occupation, sign = to_fetch.split('.')[-5:-1]

    # errors='ignore' silently drops bytes that are not valid UTF-8. The handle
    # yields already-decoded text, so BeautifulSoup is not given from_encoding
    # (that argument only applies to byte input and is ignored otherwise).
    with codecs.open(to_fetch, 'r', encoding='utf-8', errors='ignore') as fp:
        soup = BeautifulSoup(fp, 'lxml')

    for post in soup.find_all('post'):
        # Remove line breaks so every post occupies a single line of text.
        clean_str = post.text.replace('\r', '').replace('\n', '')
        posts_arr.append(clean_str)
        sign_arr.append(sign)
        age_arr.append(age)
        occupation_arr.append(occupation)
        gender_arr.append(gender)

# One row per post; the file-level metadata is repeated for each of its posts.
df = pd.DataFrame({'Gender': gender_arr, 'Age': age_arr, 'Occupation': occupation_arr, 'Post': posts_arr, 'Sign': sign_arr})

df.to_csv('/content/drive/My Drive/Ebryx/dataset.csv', encoding='utf-8', index=False)

In [None]:
# Preview the first 100 parsed rows.
df.head(n=100)

# Loading BERT for finetuning

Taking help from the model implementation from huggingface and [this repository](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb#scrollTo=JrBr2YesGdO_).


In [None]:
# Install the Hugging Face `transformers` package imported by the cells below.
!pip install transformers;

In [1]:
# Print the NVIDIA driver and GPU status for this runtime.
!nvidia-smi

Tue Aug 18 07:49:07 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Run from here.

In [1]:
# Importing the libraries needed
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from progressbar import ProgressBar
pbar = ProgressBar()

# Defining some key variables that will be used later on in the training
# MAX_LEN: max_length handed to the tokenizer; every post is padded/truncated to it.
MAX_LEN = 512
# Batch sizes used by the training and test DataLoaders respectively.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of full passes the training loop makes over the training loader.
EPOCHS = 1
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-05
# Tokenizer for the same 'bert-base-uncased' checkpoint the model class loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Setting up the device for GPU usage

from torch import cuda
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
# Reload the parsed posts from disk, keeping only the text and the two label columns.
df = pd.read_csv('/content/drive/My Drive/Ebryx/dataset.csv').loc[:, ['Post', 'Sign', 'Gender']]

# Category -> integer code, filled lazily by encode_cat in order of first appearance.
encode_dict = {}
gender_dict = {}

# Rows of this 13x13 identity matrix are the one-hot templates for the sign code.
# Position 12 of each target vector is later overwritten with the gender bit.
iden = np.identity(13, dtype=int)

def encode_cat(x):
    """Return the integer code for category ``x``.

    The first time a category is seen it receives the next unused code
    (0, 1, 2, ...); later calls return the same code.
    """
    return encode_dict.setdefault(x, len(encode_dict))

def encode_gender(x):
    """Map ``'male'`` to 0 and every other value to 1."""
    return 0 if x == 'male' else 1

def get_onehot(x):
    """Return a fresh 13-element one-hot vector with position ``x`` set.

    A copy is returned on purpose: ``iden[x]`` on its own is a view into the
    shared identity matrix, so writing the gender bit into one row's vector
    would silently modify ``iden`` and every other vector built from the same
    sign code.
    """
    return iden[x].copy()

# Integer-encode both labels.
sign_codes = df['Sign'].apply(encode_cat)
gender_codes = df['Gender'].apply(encode_gender)

# Build all target vectors at once. Indexing `iden` with an array of codes
# produces a brand-new (n_rows, 13) array, so each row owns its data; writing
# the gender bit per row through a view of `iden` would instead overwrite the
# shared template and give every post of a sign the same gender bit.
targets = iden[sign_codes.to_numpy()]
# Last position carries the gender: 1 for gender code 0 ('male'), 0 otherwise.
targets[:, 12] = (gender_codes.to_numpy() == 0)
df['Sign'] = list(targets)

df = df [['Post', 'Sign']]

print('Here sign means sign + gender, last bit is for gender:')
df.head()

Here sign means sign + gender, last bit is for gender:


Unnamed: 0,Post,Sign
0,I just watched Beauty and the beast...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,This picture shows a Vietnamese ...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,So I just used the term “Bad Ass...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,This is a dumb little story I whipp...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"Listen, you fuckers, you screwhe...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [3]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one post per item and pairs it with its target vector.

    Args:
        dataframe: frame with a ``Post`` column (text) and a ``Sign`` column
            (target vector for that post).
        tokenizer: object exposing ``encode_plus``; called once per item.
        max_len: length every encoded post is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.post = dataframe.Post
        self.targets = self.data.Sign
        self.max_len = max_len

    def __len__(self):
        return len(self.post)

    def __getitem__(self, index):
        """Return the encoded post at position ``index`` as a dict of tensors."""
        # Positional lookup: samplers hand out 0..len-1, which only coincides
        # with the frame's labels when its index happens to be a fresh range.
        post = str(self.post.iloc[index])
        # Collapse every run of whitespace into a single space.
        post = " ".join(post.split())

        inputs = self.tokenizer.encode_plus(
            post,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` is the replacement for the deprecated
            # `pad_to_max_length=True`.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

# Split the frame 80/20 into train and test rows; the fixed random_state keeps
# the split repeatable. Both halves get a fresh 0..n-1 index.
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=200)
test_dataset = df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap each split so posts are tokenized lazily, one item at a time.
training_set, testing_set = (
    CustomDataset(split, tokenizer, MAX_LEN)
    for split in (train_dataset, test_dataset)
)

FULL Dataset: (380720, 2)
TRAIN Dataset: (304576, 2)
TEST Dataset: (76144, 2)


In [4]:
# DataLoader settings for each split: both are shuffled and use num_workers=0.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [5]:
# Customized model: pretrained BERT followed by dropout and a dense layer that
# produces the 13 output logits.

class BERTClass(torch.nn.Module):
    """BERT encoder with a dropout + linear head producing 13 logits per input."""

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 13)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) logits for a batch of encoded posts.

        Args:
            ids: token id tensor for the batch.
            mask: attention mask tensor matching ``ids``.
            token_type_ids: segment id tensor matching ``ids``.
        """
        encoder_outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Take the second output by index rather than tuple-unpacking: the
        # encoder may return either a plain tuple or a dict-like output
        # object, and unpacking the latter yields its keys, not its tensors.
        pooled = encoder_outputs[1]
        dropped = self.l2(pooled)
        return self.l3(dropped)

# Instantiate the classifier, move it to the selected device, and display its layout.
model = BERTClass().to(device)
model


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed directly on raw logits.

    Args:
        outputs: model logits.
        targets: float tensor of 0/1 labels with the same shape as ``outputs``.

    Returns:
        Scalar loss tensor (default reduction of ``BCEWithLogitsLoss``).
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

# Adam over every parameter of the model, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Prints the batch loss every fifth step.

    Args:
        epoch: index of the current epoch; accepted for the calling loop but
            not used inside the function.
    """
    model.train()
    for i, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step exactly once,
        # before this step's backward pass accumulates new ones.
        optimizer.zero_grad()

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        if i % 5 == 0:
            print(f' [{i}] Loss:  {loss.item():.3f}')

        loss.backward()
        optimizer.step()

# Fine-tune for the configured number of epochs.
for epoch in range(EPOCHS):
    train(epoch)

 [0] Loss:  0.736
 [5] Loss:  0.700
 [10] Loss:  0.671
 [15] Loss:  0.626
 [20] Loss:  0.605
 [25] Loss:  0.580
 [30] Loss:  0.572
 [35] Loss:  0.571
 [40] Loss:  0.524
 [45] Loss:  0.492
 [50] Loss:  0.465
 [55] Loss:  0.455
 [60] Loss:  0.447
 [65] Loss:  0.415
 [70] Loss:  0.394
