In [0]:
!pip install pytorch_pretrained_bert

# Data Analysis


In [0]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [0]:
# Read the labelled dataset straight from the project repository.
url = "https://raw.githubusercontent.com/avishreekh/Depression-prediction/master/data.csv"
df = pd.read_csv(url)
print(len(df))

# Hold out 10% of the rows, stratified on the label column, and give both
# partitions a fresh 0..n-1 index.
train_raw_df, test_raw_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])
)
print(len(train_raw_df))
print(len(test_raw_df))

In [0]:
# Preview the first rows of the training split.
train_raw_df.head()

In [0]:
# Summary statistics of the training split, as produced by DataFrame.describe().
train_raw_df.describe()

# Data Cleaning

In [0]:
def remove_pattern(ip, pattern):
  """Return ``ip`` as a string with every match of ``pattern`` removed.

  Args:
    ip: Value to clean; it is converted with ``str()`` first, so non-string
      inputs (numbers, NaN, ...) are accepted.
    pattern: Regular expression whose matches are deleted.

  Returns:
    The cleaned string.
  """
  # Substitute with the pattern itself. Re-using each matched substring as a
  # new regex (the previous approach) let a short match such as "@ab" eat the
  # front of a longer one ("@abc" -> "c") and misread metacharacters in a match.
  return re.sub(pattern, '', str(ip))

In [0]:
def clean_data(df):
  """Normalise the ``text`` column of ``df`` in place.

  Each value is converted with ``str()`` and then:
    1. ``@handle`` mentions are removed,
    2. every character that is not an ASCII letter becomes a space,
    3. words of three characters or fewer are dropped and the rest are
       re-joined with single spaces.

  Args:
    df: DataFrame with a ``text`` column. The column is overwritten.

  Returns:
    None. The frame is modified in place.
  """
  def _clean(value):
    # Explicit regex substitutions: a bare Series.str.replace(pat, repl) is a
    # literal replacement on recent pandas, which left the text uncleaned.
    text = re.sub(r'@\w*', '', str(value))
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return ' '.join(w for w in text.split() if len(w) > 3)

  # Series.apply also copes with an empty column, unlike np.vectorize.
  df['text'] = df['text'].apply(_clean)

In [0]:
# clean_data overwrites the 'text' column of the frame it is given and returns
# nothing, so the cleaned result is inspected on the frame itself afterwards.
clean_data(train_raw_df)
clean_data(test_raw_df)
train_raw_df.head()

In [0]:
def _to_tsv_frame(raw_df):
    """Arrange a cleaned split into the four columns written to the TSV files.

    Column order is id, label, alpha, text; the TSV reader later in the
    notebook picks the label from position 1 and the text from position 3.
    """
    row_count = len(raw_df)
    return pd.DataFrame({
        'id': range(row_count),
        'label': raw_df['label'],
        'alpha': ['a'] * row_count,
        'text': raw_df['text'].replace(r'\n', ' ', regex=True),
    })


train_df = _to_tsv_frame(train_raw_df)
test_df = _to_tsv_frame(test_raw_df)
train_df.head()

In [0]:
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (a fresh runtime has no /content/data).
DATA_DIR = '/content/data'
os.makedirs(DATA_DIR, exist_ok=True)

# Headerless, tab-separated files with the columns id, label, alpha, text.
train_df.to_csv(os.path.join(DATA_DIR, 'train.tsv'), sep='\t', index=False, header=False)
test_df.to_csv(os.path.join(DATA_DIR, 'dev.tsv'), sep='\t', index=False, header=False)

# Data preprocessing

In [0]:
import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm_notebook, trange
import os
import csv 
import sys
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count

# Use the GPU when CUDA is usable, otherwise fall back to the CPU; the final
# expression shows which case applies.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
cuda_available

In [0]:
class InputExample(object):
    """A single raw example: an identifier, its text and an optional label."""

    def __init__(self, guid, text, label=None):
        """Store the three fields unchanged on the instance."""
        self.guid, self.text, self.label = guid, text, label


class DataProcessor(object):
    """Base class for dataset readers; subclasses supply the three accessors."""

    def get_train_examples(self, data_dir):
        """Return the training examples found under ``data_dir`` (abstract)."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return the dev examples found under ``data_dir`` (abstract)."""
        raise NotImplementedError()

    def get_labels(self):
        """Return the list of label values for the dataset (abstract)."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a UTF-8 tab-separated file into a list of rows.

        Args:
            input_file: Path of the TSV file.
            quotechar: Passed straight to ``csv.reader``.

        Returns:
            A list with one list of string cells per line of the file.
        """
        # newline='' hands line-ending handling to the csv module, as its
        # documentation asks. The old Python 2 ``unicode`` re-decoding branch
        # could never run on Python 3, which this notebook requires (f-strings).
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))


class BinaryClassificationProcessor(DataProcessor):
    """Reader for a two-label dataset stored as ``train.tsv`` and ``dev.tsv``."""

    def get_train_examples(self, data_dir):
        """Load ``train.tsv`` from ``data_dir`` as a list of examples."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """Load ``dev.tsv`` from ``data_dir`` as a list of examples."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(rows, "dev")

    def get_labels(self):
        """Return the two label strings used by this dataset."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Turn TSV rows into examples: text from column 3, label from column 1.

        The guid of each example is ``"<set_type>-<row position>"``.
        """
        return [
            InputExample(guid="%s-%s" % (set_type, idx), text=row[3], label=row[1])
            for idx, row in enumerate(lines)
        ]


In [0]:
class InputFeatures(object):
    """Container for the model-ready encoding of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        """Keep the id sequence, mask, segment ids and label id as given."""
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id


def _truncate_seq_pair(tokens, max_length):
    while True:
        if len(tokens) <= max_length:
            break
        else:
            tokens.pop()


def convert_example_to_feature(example_row):
    """Encode one example as fixed-length id, mask and segment sequences.

    Args:
        example_row: Tuple ``(example, label_map, max_seq_length, tokenizer)``.
            Packing the arguments in one tuple lets the function be mapped
            over a list of work items.

    Returns:
        An ``InputFeatures`` whose three sequences each have
        ``max_seq_length`` entries, plus the label id looked up in
        ``label_map``.
    """
    example, label_map, max_seq_length, tokenizer = example_row

    # Two positions are reserved for the [CLS] and [SEP] markers.
    content_budget = max_seq_length - 2
    pieces = tokenizer.tokenize(example.text)
    if len(pieces) > content_budget:
        pieces = pieces[:content_budget]

    tokens = ["[CLS]"] + pieces + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Real positions get 1 in the mask; every segment id is 0 because there is
    # only one sequence per example.
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(tokens)

    # Right-pad all three sequences with zeros up to max_seq_length.
    pad = [0] * (max_seq_length - len(input_ids))
    input_ids += pad
    input_mask += pad
    segment_ids += pad

    for sequence in (input_ids, input_mask, segment_ids):
        assert len(sequence) == max_seq_length

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_map[example.label])

In [0]:
processor = BinaryClassificationProcessor()
# train.tsv / dev.tsv are exported to /content/data by the notebook's export
# cell, so read them back from that directory instead of the working directory.
train_examples = processor.get_train_examples('/content/data')
train_examples_len = len(train_examples)

In [0]:
# Label values exposed by the processor, and how many classes they define.
label_list = processor.get_labels()
num_labels = len(label_list)

In [0]:
# Fixed length (in tokens, including [CLS] and [SEP]) that every encoded
# example is truncated or zero-padded to.
MAX_SEQ_LENGTH = 128

In [0]:
# Tokenizer loaded from the 'bert-base-uncased' pretrained files, with
# lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
# Map each label string to its position in label_list.
label_map = dict((label, position) for position, label in enumerate(label_list))

# One self-contained work item per example, so the converter can be mapped
# over the list without relying on shared globals.
train_examples_for_processing = [
    (train_example, label_map, MAX_SEQ_LENGTH, tokenizer)
    for train_example in train_examples
]

In [0]:
# Leave one core free, but never request zero workers: on a single-core host
# cpu_count() - 1 is 0 and Pool() rejects a process count below 1.
process_count = max(1, cpu_count() - 1)
if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    # imap yields results in input order, so train_features lines up with train_examples.
    with Pool(process_count) as p:
        train_features = list(tqdm_notebook(p.imap(convert_example_to_feature, train_examples_for_processing), total=train_examples_len))

In [0]:
# Persist the converted features to train_features.pkl in the working
# directory; nothing visible in this notebook reads the file back.
with open("train_features.pkl", "wb") as f:
    pickle.dump(train_features, f)

# Creating the model

In [0]:
# Load the 'bert-base-uncased' sequence-classification model; cache_dir and
# num_labels are passed through to from_pretrained.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir="/content/cache", num_labels=num_labels)

In [0]:
# Move the model onto the device selected earlier (GPU when available, else CPU).
model.to(device)

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']


def _is_no_decay(param_name):
    """Return True when the parameter name contains any ``no_decay`` marker."""
    return any(marker in param_name for marker in no_decay)


# Two groups: parameters matching a no_decay marker get weight_decay 0.0,
# everything else gets 0.01.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not _is_no_decay(n)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if _is_no_decay(n)], 'weight_decay': 0.0},
]

In [0]:
TRAIN_BATCH_SIZE = 256
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1

# Apply the seed that was previously declared but never used. This fixes
# NumPy's and PyTorch's global generators from this point onward; the model
# itself was already instantiated in an earlier cell.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# The DataLoader keeps the final partial batch, so an epoch has ceil(n / batch)
# batches, and the training loop steps the optimizer once per
# GRADIENT_ACCUMULATION_STEPS batches. Flooring n / batch undercounted the
# schedule length (and gave 0 for datasets smaller than one batch).
batches_per_epoch = (train_examples_len + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE
num_train_optimization_steps = (batches_per_epoch // GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PROPORTION,
                     t_total=num_train_optimization_steps)

In [0]:
# Counters used by the training loop: optimizer steps taken overall, and the
# per-epoch batch count and summed loss (the loop resets the latter two at the
# start of every epoch).
global_step = 0
nb_tr_steps = 0
tr_loss = 0

In [0]:
def _feature_tensor(field):
    """Collect one attribute from every training feature into a long tensor."""
    return torch.tensor([getattr(feature, field) for feature in train_features],
                        dtype=torch.long)


all_input_ids = _feature_tensor('input_ids')
all_input_mask = _feature_tensor('input_mask')
all_segment_ids = _feature_tensor('segment_ids')
all_label_ids = _feature_tensor('label_id')

In [0]:
# Bundle the four tensors so each dataset item is (input_ids, input_mask,
# segment_ids, label_id), and draw batches of TRAIN_BATCH_SIZE in random order.
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

# Training the model

In [0]:
model.train()
# The loss module holds no per-batch state, so build it once rather than on
# every step.
loss_fct = CrossEntropyLoss()
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    # Per-epoch running totals.
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # The model is called with labels=None and its output is used as the
        # logits; the loss is computed explicitly below.
        logits = model(input_ids, segment_ids, input_mask, labels=None)
        loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

        # Scale so that gradients summed over an accumulation window average out.
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()

        # Read the scalar once and reuse it for both the log line and the total.
        loss_value = loss.item()
        print("\r Loss: %f" % loss_value, end='')

        tr_loss += loss_value
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        # Step the optimizer only at the end of each accumulation window.
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

In [0]:
# Unwrap the model if it has been wrapped in an object exposing `.module`.
model_to_save = model.module if hasattr(model, 'module') else model

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

# Nothing else in the notebook creates the output folder, and writing into a
# missing directory fails, so create it up front.
OUTPUT_DIR = './output'
os.makedirs(OUTPUT_DIR, exist_ok=True)

output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

# Weights, configuration and vocabulary are all written to the same folder.
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(OUTPUT_DIR)