<a href="https://colab.research.google.com/github/ashutoshbaghel/emnlp17-depression/blob/BERT/BERT_for_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Get more RAM: we'll need at least 25 GB

In [0]:

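# Presumably the Colab trick of deliberately exhausting memory so the runtime
# crashes and offers a high-RAM instance -- TODO confirm. Left disabled on purpose:
# uncommenting it crashes the session.
# a = []
# while(1):
#     a.append('1')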

## Get BERT Libs

In [0]:
%%bash
# transformers is pinned to the release the recorded run resolved to (2.8.0), so it
# stays consistent with the already-pinned tokenizers wheel on a fresh runtime.
pip install tqdm boto3 requests regex sentencepiece sacremoses tokenizers==0.5.2 transformers==2.8.0

Collecting sentencepiece
  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
Collecting sacremoses
  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
Collecting tokenizers==0.5.2
  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
Collecting transformers
  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sac

## Code for Mental Health

In [0]:
# Mount Google Drive at /gdrive so the data files can be copied from it.
# This step is interactive: it prompts for an authorization code.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
# Stage the training archive from the mounted Drive into a local working directory
# (rsdd_posts/, created if missing) so the loader reads from local disk.
! mkdir -p rsdd_posts
! cp -r '/gdrive/My Drive/MentalHealthData/data/RSDD/training.gz' rsdd_posts/

In [0]:
# Option A -- use the full validation set (disabled):
# ! cp -r '/gdrive/My Drive/MentalHealthData/data/RSDD/validation.gz' rsdd_posts/
##-------------------------------------##
# Option B -- use a small split (active):
# copy one split file from Drive, gzip it, then rename it to validation.gz so the
# loader sees the same filename it would for the full set.

! cp -r '/gdrive/My Drive/MentalHealthData/valid_split_10000/validation_00' rsdd_posts/
! gzip rsdd_posts/validation_00 

! mv rsdd_posts/validation_00.gz rsdd_posts/validation.gz 

In [0]:
import gzip
import json
import time
import os
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, DistilBertModel, DistilBertTokenizer

class mentalhealth_dataset(Dataset):
    """Per-user dataset of tokenized posts read from a gzipped JSON-lines file.

    Every line of the file is parsed with ``json.loads(line)[0]`` and must provide:

    * ``'label'``: ``'control'`` (mapped to 0), ``'depression'`` (mapped to 1) or
      ``None`` (the user is skipped). Any other value raises ``RuntimeError``.
    * ``'posts'``: a sequence of 2-element entries; the second element is the post
      text, the first element is ignored.

    Users are keyed by the string form of their 0-based line number.

    Args:
        fname: path of the gzip-compressed input file.
        max_posts: number of post rows in every returned item (extra rows are padding).
        max_len: number of token ids per post row (shorter posts are zero-padded).
        randomize: if true, permute the order of the selected posts on every access.
        tokenizer: optional object exposing
            ``encode(text, add_special_tokens=..., max_length=...)``. When omitted, the
            ``distilbert-base-uncased`` DistilBERT tokenizer is loaded, as before.
    """

    def __init__(self, fname, max_posts=400, max_len=100, randomize=False, tokenizer=None):
        start = time.time()

        self.fname = fname
        self.randomize = randomize
        if tokenizer is None:
            # Default kept identical to the previous hard-coded tokenizer.
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.tokenizer = tokenizer
        self.max_posts = max_posts
        self.max_len = max_len

        ## Load data
        print("loading %s posts" % fname)
        labels = {}
        allposts = {}
        ids = []
        # Context manager: the handle is closed even when a line fails to parse
        # or carries an unknown label.
        with gzip.open(fname, 'rt') as f:
            for i, line in enumerate(f):
                user = str(i)
                d = json.loads(line)[0]
                if d['label'] == 'control':
                    labels[user] = 0
                elif d['label'] == 'depression':
                    labels[user] = 1
                elif d['label'] is None:
                    continue
                else:
                    raise RuntimeError("unknown label: %s" % d['label'])
                allposts[user] = [post for dt, post in d['posts']]
                ids.append(user)

        total = (time.time() - start) / 60

        print("loaded %s posts" % fname)
        print("Time taken: %s mins" % total)

        ## Assign data to self
        self.x = allposts   # user id -> list of post texts
        self.y = labels     # user id -> 0 / 1
        self.ids = ids      # user ids in file order; positions are the dataset indices

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for the user at ``index``.

        ``token_ids`` and ``attention_mask`` are int64 tensors of shape
        ``(max_posts, max_len)``; the mask is 1 wherever the token id is non-zero.
        ``label`` is a 0-dim int64 tensor.
        """
        user = self.ids[index]
        uposts = self.x[user]

        if self.randomize:
            # NOTE(review): this permutes the order of the FIRST max_posts posts only;
            # later posts are never selected. Kept as in the original -- confirm intent.
            idxs = np.random.permutation(min(self.max_posts, len(uposts)))
            chosen = [uposts[idx] for idx in idxs]
        else:
            chosen = uposts[:self.max_posts]

        # Zero doubles as the padding id and as the "masked" marker below. Starting from
        # an all-zero grid also covers users with no posts, where stacking an empty
        # list used to raise.
        x_padded = torch.zeros((self.max_posts, self.max_len), dtype=torch.int64)
        row = 0
        for post in chosen:
            # `max_length` is the tokenizer's documented truncation argument (the old
            # `max_len` keyword is not). The slice stays as a hard upper bound in case
            # the tokenizer does not truncate.
            token_ids = self.tokenizer.encode(
                post, add_special_tokens=True, max_length=self.max_len)[:self.max_len]
            if len(token_ids) == 0:
                # Drop posts that produce no tokens.
                continue
            x_padded[row, :len(token_ids)] = torch.tensor(token_ids, dtype=torch.int64)
            row += 1

        attention_mask = (x_padded != 0).long()

        return x_padded, attention_mask, torch.tensor(self.y[user]).long()

    def __len__(self):
        """Number of users with a usable label."""
        return len(self.ids)

In [0]:
# Build the train / validation datasets from the staged archives. Each constructor
# reads its whole file into memory up front.
# NOTE(review): randomize=True is also set for the validation set -- confirm that
# per-access shuffling of posts is wanted there.
train_dataset = mentalhealth_dataset("/content/rsdd_posts/training.gz", randomize=True)
val_dataset = mentalhealth_dataset("/content/rsdd_posts/validation.gz", randomize=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…


loading /content/rsdd_posts/training.gz posts
loaded /content/rsdd_posts/training.gz posts
Time taken: 2.7372454047203063 mins
loading /content/rsdd_posts/validation.gz posts
loaded /content/rsdd_posts/validation.gz posts
Time taken: 1.0456780314445495 mins


In [0]:
# train_dataset[0]

Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors


(tensor([[ 101, 2002, 1005,  ...,    0,    0,    0],
         [ 101, 2008, 1005,  ...,    0,    0,    0],
         [ 101, 2033, 1998,  ...,    0,    0,    0],
         ...,
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor(0))

In [0]:
# batch_size=1: each batch is a single user, i.e. one (max_posts, max_len) grid of
# token ids plus its mask and label. num_workers=0 keeps loading in the main process.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)

# NOTE(review): shuffle=True on the validation loader only changes iteration order.
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=0)

In [0]:
## Simple FC:
class FeedForward(nn.Module):
    """Single linear classification head applied to a fixed-size feature vector.

    Args:
        num_labels: number of output classes (size of the logit vector).
        hidden_size: width of the incoming feature vector. Defaults to 768, the
            value that was previously hard-coded.
    """

    def __init__(self, num_labels=2, hidden_size=768):
        super(FeedForward, self).__init__()
        self.num_labels = num_labels
        self.classifier = nn.Linear(hidden_size, num_labels)
        # Xavier-normal initialisation of the weight matrix; the bias keeps
        # nn.Linear's default initialisation.
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input):
        """Return raw (un-normalised) logits of shape ``(..., num_labels)``."""
        logits = self.classifier(input)
        return logits

In [0]:
# Use the first CUDA device when one is visible, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

num_labels = 2

# Classification head, moved to the selected device (Module.to returns the module itself).
ffmodel = FeedForward(num_labels).to(device)

# Pretrained DistilBERT encoder, placed on the same device as the head.
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Per-phase lookups used by the training loop.
dataloaders_dict = dict(train=train_loader, val=val_loader)
dataset_sizes = {
    phase_name: len(phase_dataset)
    for phase_name, phase_dataset in (('train', train_dataset), ('val', val_dataset))
}

HBox(children=(IntProgress(value=0, description='Downloading', max=442, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=267967963, style=ProgressStyle(description_…




In [0]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` on frozen encoder features and return it with its best weights.

    For every user batch the encoder's first-token vector of each real post is
    averaged into one feature vector, which ``model`` maps to class logits.

    Relies on notebook-level globals: ``bert_model`` (encoder, never updated here),
    ``dataloaders_dict`` and ``dataset_sizes`` (keyed by ``'train'`` / ``'val'``) and
    ``device``.

    Args:
        model: trainable head mapping ``(batch, hidden)`` features to logits.
        criterion: loss called as ``criterion(logits, labels)``. It receives raw
            logits, which is what ``nn.CrossEntropyLoss`` expects.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: LR scheduler; stepped once per epoch, after the training phase.
        num_epochs: number of train+val passes.

    Returns:
        ``model`` loaded with the weights from the epoch with the lowest validation
        loss (or its initial weights if validation loss never improved). The same
        weights are also written to ``bert_model_test.pth`` on each improvement.
    """
    since = time.time()
    print('starting')
    # Initialise from the current weights so the final load_state_dict cannot hit an
    # unbound name when no epoch improves (e.g. a non-finite validation loss).
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')

    # The encoder is used as a fixed feature extractor: keep dropout disabled.
    bert_model.eval()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            sentiment_corrects = 0

            # Iterate over data.
            for input_ids, mask, sentiment in dataloaders_dict[phase]:
                print(".", end="")
                input_ids = input_ids.to(device)
                mask = mask.to(device)
                sentiment = sentiment.to(device)

                batch_size = input_ids.size(0)
                seq_len = input_ids.size(-1)
                # Flatten (batch, posts, seq_len) -> (batch * posts, seq_len) for the encoder.
                flat_ids = input_ids.reshape(-1, seq_len)
                flat_mask = mask.reshape(-1, seq_len)
                # Rows whose mask is all zero are padding posts: they carry no content,
                # and a fully-masked sequence can yield non-finite attention outputs in
                # some encoder implementations. They are neither encoded nor averaged.
                has_tokens = flat_mask.sum(dim=1) > 0

                with torch.no_grad():
                    if has_tokens.any():
                        encoded = bert_model(flat_ids[has_tokens],
                                             attention_mask=flat_mask[has_tokens])
                        cls_real = encoded[0][:, 0, :]  # first-token vector per real post
                        cls_all = cls_real.new_zeros((flat_ids.size(0), cls_real.size(-1)))
                        cls_all[has_tokens] = cls_real
                    else:
                        # No real post in the whole batch: fall back to zero features.
                        cls_all = torch.zeros(
                            (flat_ids.size(0), bert_model.config.hidden_size), device=device)
                    cls_all = cls_all.view(batch_size, -1, cls_all.size(-1))
                    # Mean over real posts only; clamp avoids 0/0 for users without posts.
                    posts_per_user = has_tokens.view(batch_size, -1).sum(
                        dim=1, keepdim=True).clamp(min=1)
                    averaged_output = cls_all.sum(dim=1) / posts_per_user.to(cls_all.dtype)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Raw logits go to the criterion: applying softmax first would
                    # normalise twice and flatten the gradients.
                    logits = model(averaged_output)
                    loss = criterion(logits, sentiment)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics -- weight by the number of users in the batch, not by the
                # flattened post count.
                running_loss += loss.item() * batch_size
                sentiment_corrects += torch.sum(
                    torch.argmax(logits, dim=1) == sentiment).item()

            if phase == 'train':
                # Step once per epoch; stepping per batch decays the learning rate
                # to ~0 after a handful of samples.
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            sentiment_acc = sentiment_corrects / dataset_sizes[phase]

            print('{} total loss: {:.4f} '.format(phase, epoch_loss))
            print('{} sentiment_acc: {:.4f}'.format(
                phase, sentiment_acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # The tracked quantity is the validation loss (the old label said "Acc").
    print('Best val loss: {:4f}'.format(float(best_loss)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [0]:
# NOTE(review): lrlast / lrmain are not referenced anywhere in the visible notebook;
# Adam below is created without an explicit lr, so it uses the library default.
lrlast = .001
lrmain = .00001
optim1 = optim.Adam(ffmodel.parameters())

#optim1 = optim.Adam(model.parameters(), lr=0.001)#,momentum=.9)
# Only the classification head's (ffmodel) parameters are handed to the optimizer.
optimizer_ft = optim1
criterion = nn.CrossEntropyLoss()

# Multiply the LR by 0.1 every 3 calls to scheduler.step()
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)

In [0]:
# Raise the "transformers" logger threshold to ERROR so per-sample tokenizer
# warnings do not flood the training output.
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)  # No warning on sample size
        

In [0]:
# Train the classification head for 10 epochs; train_model returns the same module,
# loaded with its best validation-loss weights.
model_ft1 = train_model(ffmodel, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=10)

starting
Epoch 0/9
----------
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [0]:
# Show GPU utilisation and memory usage (absolute path as installed on this runtime).
!/opt/bin/nvidia-smi

Wed May  6 23:48:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    74W / 149W |  10980MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
# List running Python processes (the grep itself also shows up in the listing).
!ps -aux|grep python

root          24  0.1  0.3 486240 83580 ?        Sl   21:02   0:16 /usr/bin/python2 /usr/local/bin/jupyter-notebook --ip="172.28.0.2" --port=9000 --FileContentsManager.root_dir="/" --MappingKernelManager.root_dir="/content"
root        3641 17.6 49.2 66137004 13176632 ?   Ssl  23:26   3:59 /usr/bin/python3 -m ipykernel_launcher -f /root/.local/share/jupyter/runtime/kernel-216d34b4-5d05-422f-b714-7064d798e0c3.json
root        3817  0.0  0.0  39192  6380 ?        S    23:48   0:00 /bin/bash -c ps -aux|grep python
root        3819  0.0  0.0  38568  4892 ?        S    23:48   0:00 grep python


In [0]:
# Terminate the current kernel process to release its RAM / GPU memory.
# Uses the kernel's own PID instead of a PID copied from an earlier `ps` listing:
# that number differs on every runtime and could belong to an unrelated process.
import os
os.kill(os.getpid(), 9)  # signal 9, the same signal as the original `kill -9`