In [None]:
import os

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import DistilBertForSequenceClassification,BertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from transformers import pipeline
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Diagnostic only: print memory figures for CUDA device 0.
#   t = total memory reported for the device
#   r = memory currently reserved, a = memory currently allocated
#   f = reserved minus allocated
t = torch.cuda.get_device_properties(0).total_memory
r, a = torch.cuda.memory_reserved(0), torch.cuda.memory_allocated(0)
f = r - a

print(f"total mem: {t}\nreserved: {r}\nallocated: {a}\nfree in reserve: {f}")


In [None]:
# Confirm that torch is working and that it found a CUDA device: the index of
# the currently selected device is displayed as the cell output.

torch.cuda.current_device()


In [None]:
# Load the labelled reviews. Only the review text and the `negative` flag are
# used; the flag is renamed to `labels` (the target column name used in the
# rest of the notebook) and cast to int.
negative_df = (
    pd.read_csv("data/reviews.csv")[['text', 'negative']]
    .rename(columns={"negative": "labels"})
    .astype({"labels": int})
)

# Drop very short reviews. `.str.len()` yields NaN for missing text and
# `NaN >= n` is False, so rows without text are filtered out here instead of
# raising TypeError the way `map(len)` does on a float NaN.
MIN_TEXT_CHARS = 12
negative_df = negative_df[negative_df.text.str.len() >= MIN_TEXT_CHARS]

# Class balancing is not done here: it happens after the train/test split,
# on the training portion only.



In [None]:
# Train/test split, then rebalance the training portion only.

SEED = 42          # fixed so the split and the subsample are reproducible
NEG_PER_POS = 9    # negatives kept per positive example in the training set
SPLIT_DIR = "data/transformers_test_data"

train_df, test_df = train_test_split(
    negative_df[['text', 'labels']], test_size=.0125, random_state=SEED
)

# Persist the raw (pre-balancing) split as pickles for reference.
os.makedirs(SPLIT_DIR, exist_ok=True)
train_df.to_pickle(f"{SPLIT_DIR}/train.pkl")
test_df.to_pickle(f"{SPLIT_DIR}/test.pkl")

# Subsample the negatives of the TRAINING set to at most NEG_PER_POS per
# positive. The test set is left untouched and keeps its original class mix.
# The cap avoids `sample` raising when there are fewer negatives than
# requested. Assumes labels are 0/1.
positives = train_df[train_df.labels == 1]
negatives = train_df[train_df.labels == 0]
n_negatives = min(len(negatives), len(positives) * NEG_PER_POS)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
train_df = pd.concat([positives, negatives.sample(n_negatives, random_state=SEED)])

sentiment_dataset_train = Dataset.from_pandas(train_df[['labels', 'text']])
sentiment_dataset_test = Dataset.from_pandas(test_df[['labels', 'text']])


In [None]:

# Tokenizer and classification model are both loaded from the same pretrained
# 'bert-base-uncased' checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize with appropriate tokenizer etc.

def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the notebook's tokenizer.

    Every example is padded to, and truncated at, 128 tokens.

    Args:
        batch: mapping with a 'text' entry holding the texts to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(batch['text'], **encoding_options)

# Tokenize both splits in batches of 32 examples, then expose only the
# token ids, the attention mask and the target, formatted as 'torch'.
_map_kwargs = {"batched": True, "batch_size": 32}
_tensor_columns = ['input_ids', 'attention_mask', 'labels']

sentiment_dataset_train = sentiment_dataset_train.map(tokenize, **_map_kwargs)
sentiment_dataset_test = sentiment_dataset_test.map(tokenize, **_map_kwargs)

for _split in (sentiment_dataset_train, sentiment_dataset_test):
    # set_format works in place; each split gets its own copy of the column list.
    _split.set_format('torch', columns=list(_tensor_columns))

In [None]:
def compute_metrics(pred):
    """Summarise an evaluation result as accuracy, F1, precision and recall.

    Args:
        pred: object exposing `label_ids` (the true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 are computed with average='binary'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element returned (support) is not used.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Training configuration: one epoch, batches of 32 per device, evaluation on
# the held-out split every 500 steps.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=5,
    # optimisation schedule
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    # periodic evaluation
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy='steps',
    eval_steps=500,
)

# Wire the model, the two tokenized splits and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sentiment_dataset_train,
    eval_dataset=sentiment_dataset_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Second memory report for CUDA device 0, taken after the Trainer has been
# built and before training starts (same figures as the report near the top).
t = torch.cuda.get_device_properties(0).total_memory
r, a = torch.cuda.memory_reserved(0), torch.cuda.memory_allocated(0)
f = r - a  # reserved minus allocated

print(f"total mem: {t}\nreserved: {r}\nallocated: {a}\nfree in reserve: {f}")


In [None]:
# Ask PyTorch to release cached CUDA memory it is not currently using, so as
# much memory as possible is free before training starts.
torch.cuda.empty_cache()

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate with the Trainer on its eval_dataset; the returned value is shown
# as the cell output.
trainer.evaluate()

In [None]:
# Reload the fine-tuned model from disk and try it on a hand-written example.
#
# The checkpoint this notebook was written against is preferred, but it only
# exists if training ran for at least 1500 steps and the checkpoint has not
# been rotated out by `save_total_limit`. When it is missing, fall back to the
# newest checkpoint under ./results instead of failing with an opaque error.
RESULTS_DIR = "./results"
checkpoint_dir = "./results/checkpoint-1500"
if not os.path.isdir(checkpoint_dir):
    checkpoint_dir = get_last_checkpoint(RESULTS_DIR) if os.path.isdir(RESULTS_DIR) else None
    if checkpoint_dir is None:
        raise FileNotFoundError(
            f"No checkpoint found under {RESULTS_DIR!r}; run trainer.train() first."
        )
    print(f"checkpoint-1500 not found, loading {checkpoint_dir} instead")

model = BertForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
sentiment_classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
testcase = "This is not working. I'm waiting on it for days now"
sentiment_classifier(testcase)