# 1 Convert data to TSV

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

from data_helpers.Data import *

In [3]:
def concat_to_doc(sent_list, sent_count):
    """Group consecutive sentences into documents.

    ``sent_count`` gives, in order, how many entries of ``sent_list`` belong
    to each document. The sentences of one document are joined with
    " xxPERIOD " and the same marker is appended once more at the end, so a
    document with zero sentences becomes just the marker.

    Returns a list with one string per entry of ``sent_count``.
    """
    marker = " xxPERIOD "
    documents = []
    cursor = 0
    for n_sents in sent_count:
        stop = cursor + n_sents
        # Slice out this document's sentences and terminate it with the marker.
        documents.append(marker.join(sent_list[cursor:stop]) + marker)
        cursor = stop
    return documents

In [34]:
# File names per split; index 0 is the training split, index 1 the test split.
sent_num_file = ["train.count", "test.count"]
rating_file = ["train.rating", "test.rating"]
content_file = ["train.txt", "test.txt"]

dataset_dir = "./data/beer_100k/"

TRAIN_DATA = 0
TEST_DATA = 1


def _read_lines(path):
    """Return all lines of the text file at ``path``; the handle is closed on return."""
    with open(path, "r") as handle:
        return handle.readlines()


def load_data(data_index, data_dir=None):
    """Load sentence counts, aspect ratings and sentences for one data split.

    Parameters
    ----------
    data_index : int
        Index into ``sent_num_file`` / ``rating_file`` / ``content_file``
        (``TRAIN_DATA`` or ``TEST_DATA``).
    data_dir : str, optional
        Directory that holds the three files. Defaults to the module-level
        ``dataset_dir``.

    Returns
    -------
    sent_count : list of int
        One integer per non-blank line of the count file.
    aspect_rating : pandas.DataFrame
        One row per non-blank line of the rating file. Every
        whitespace-separated value is parsed as a float, shifted down by one
        and rounded to the nearest integer.
    sents : list of str
        The stripped lines of the content file that end with ".", with that
        final "." removed. Lines that do not end with "." (blank lines
        included) are dropped.

    A warning is issued when the counts do not add up to the number of kept
    sentences, because ``concat_to_doc`` consumes the sentences strictly in
    order and would then build misaligned documents.
    """
    if data_dir is None:
        data_dir = dataset_dir

    # Load Count
    count_lines = _read_lines(os.path.join(data_dir, sent_num_file[data_index]))
    sent_count = [int(line) for line in count_lines if line.strip()]
    print( sent_count[0:5] )

    # Load Ratings
    rating_lines = _read_lines(os.path.join(data_dir, rating_file[data_index]))
    # split() without an argument tolerates trailing newlines and repeated spaces.
    rating_rows = [line.split() for line in rating_lines if line.strip()]
    # The builtin float replaces the np.float alias, which newer NumPy no longer provides.
    aspect_rating = np.array(rating_rows).astype(float) - 1
    aspect_rating = np.rint(aspect_rating).astype(int)  # ROUND TO INTEGER
    aspect_rating = pd.DataFrame(aspect_rating)
    print( aspect_rating.head() )

    # Load Sents
    sents = [line.strip() for line in _read_lines(os.path.join(data_dir, content_file[data_index]))]
    sents = [s[:-1] for s in sents if s.endswith(".")]
    print( sents[0:5] )

    if sum(sent_count) != len(sents):
        warnings.warn(
            "sentence counts sum to {} but {} sentences were kept; documents "
            "built from these counts will be misaligned".format(sum(sent_count), len(sents))
        )

    return sent_count, aspect_rating, sents

## 1.1 Training Data

In [35]:
# Load sentence counts, aspect ratings and sentences for the training split.
sent_count_train, aspect_rating_train, sents_train = load_data(TRAIN_DATA)

[8, 7, 9, 14, 12]
   0  1  2  3  4
0  0  2  0  0  1
1  2  2  2  2  2
2  2  2  2  2  2
3  2  2  2  2  2
4  2  2  3  3  2
['A lot of foam', 'But a lot', 'In the smell some banana, and then lactic and tart', 'Not a good start', 'Quite dark orange in color, with a lively carbonation (now visible, under the foam)']


In [36]:
# Concatenate each review's sentences into one document string and hold the
# documents in a single-column DataFrame (column 0).
docs_train = pd.DataFrame(concat_to_doc(sents_train, sent_count_train))
docs_train.head()

Unnamed: 0,0
0,A lot of foam xxPERIOD But a lot xxPERIOD In t...
1,"Dark red color, light beige foam, average xxPE..."
2,"Almost totally black xxPERIOD Beige foam, quit..."
3,"Golden yellow color xxPERIOD White, compact fo..."
4,"22 oz bottle from ""Lifesource"" Salem xxPERIOD ..."


In [39]:
# Number of training documents (75116 in the recorded run).
len(docs_train)

75116

In [24]:
# Assemble the four TSV columns in order: id, label, alpha, text.
# The label is column 0 of the rating frame, 'alpha' is a constant 'a'
# placeholder, and the text is the concatenated document.
df_train = pd.DataFrame(
    dict(
        id=list(range(len(aspect_rating_train))),
        label=aspect_rating_train[0],
        alpha=['a'] * len(aspect_rating_train),
        text=docs_train[0],
    )
)
df_train.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,A lot of foam xxPERIOD But a lot xxPERIOD In t...
1,1,2,a,"Dark red color, light beige foam, average xxPE..."
2,2,2,a,"Almost totally black xxPERIOD Beige foam, quit..."
3,3,2,a,"Golden yellow color xxPERIOD White, compact fo..."
4,4,2,a,"22 oz bottle from ""Lifesource"" Salem xxPERIOD ..."


In [29]:
# Drop rows whose label is negative. load_data shifts ratings down by one
# before rounding, so a negative label comes from a raw rating below 0.5.
df_train = df_train.loc[df_train["label"].ge(0)]
df_train.shape

(75113, 4)

In [30]:
# Seeded random train/validation split: each row lands in the training part
# when its uniform draw is below 0.85.
# NOTE: despite the "08"/"02" suffixes in the names, the threshold is 0.85
# (63909 vs. 11204 rows in the recorded run).
np.random.seed(42)
msk = np.random.rand(len(df_train)) < 0.85
df_train08, df_valid02 = df_train[msk], df_train[~msk]
(len(df_train08), len(df_valid02))

(63909, 11204)

In [31]:
# Write both splits as tab-separated files without index or header row;
# the columns are id, label, alpha, text in that order.
# NOTE(review): with pandas' default quoting, a text that contains a double
# quote (e.g. the "Lifesource" review shown earlier) is wrapped in quotes and
# its inner quotes are doubled — confirm the downstream reader undoes this.
df_train08.to_csv('data/transformer_beer_train.tsv', sep='\t', index=False, header=False)
df_valid02.to_csv('data/transformer_beer_valid.tsv', sep='\t', index=False, header=False)

## 1.2 Test Data

In [37]:
# Load sentence counts, aspect ratings and sentences for the test split.
sent_count_test, aspect_rating_test, sents_test = load_data(TEST_DATA)

[13, 12, 7, 9, 6]
   0  1  2  3  4
0  3  3  4  3  4
1  2  2  2  2  2
2  3  3  3  2  3
3  3  2  2  2  4
4  2  2  2  2  2
['According to the website, the style for the Caldera Cauldron changes every year', "The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back)", 'In any event', 'at the Horse Brass yesterday', 'The beer pours an orange copper color with good head retention and lacing']


In [38]:
# Concatenate each review's sentences into one document string and hold the
# documents in a single-column DataFrame (column 0).
docs_test = pd.DataFrame(concat_to_doc(sents_test, sent_count_test))
docs_test.head()

Unnamed: 0,0
0,"According to the website, the style for the Ca..."
1,Poured from the bottle into a Chimay goblet xx...
2,Notes from 6/24 xxPERIOD A: Bright golden glow...
3,"22 oz xxPERIOD bomber, xxPERIOD A: Pours a cle..."
4,"Brown in color, somewhere between a porter and..."


In [40]:
# Number of test documents (24884 in the recorded run).
len(docs_test)

24884

In [41]:
# Assemble the four TSV columns in order: id, label, alpha, text.
# The label is column 0 of the rating frame, 'alpha' is a constant 'a'
# placeholder, and the text is the concatenated document.
df_test = pd.DataFrame(
    dict(
        id=list(range(len(aspect_rating_test))),
        label=aspect_rating_test[0],
        alpha=['a'] * len(aspect_rating_test),
        text=docs_test[0],
    )
)
df_test.head()

Unnamed: 0,id,label,alpha,text
0,0,3,a,"According to the website, the style for the Ca..."
1,1,2,a,Poured from the bottle into a Chimay goblet xx...
2,2,3,a,Notes from 6/24 xxPERIOD A: Bright golden glow...
3,3,3,a,"22 oz xxPERIOD bomber, xxPERIOD A: Pours a cle..."
4,4,2,a,"Brown in color, somewhere between a porter and..."


In [42]:
# Row count of the test frame before the negative-label filter.
len(df_test)

24884

In [44]:
# Same negative-label filter as for the training data. In the recorded run it
# removes nothing from the test split (24884 rows before and after).
df_test = df_test.loc[df_test["label"].ge(0)]
df_test.shape

(24884, 4)

In [45]:
# Write the test split as a tab-separated file without index or header row;
# the columns are id, label, alpha, text in that order.
# NOTE(review): pandas' default quoting wraps any text containing a double
# quote and doubles the inner quotes — confirm the downstream reader undoes this.
df_test.to_csv('data/transformer_beer_test.tsv', sep='\t', index=False, header=False)

# 2 Build Data and Train

In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Expose only GPU index 0 to this process. This is set here, before the later
# cell that imports torch.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction

In [3]:
from transformers.data.processors.utils import *

In [4]:
from transformers import Trainer, TrainingArguments

In [5]:
from typing import Dict, Optional

In [6]:
import torch

# Check that torch can see a CUDA device (True in the recorded run).
torch.cuda.is_available()

True

## 2.1 Try to load dataset and convert to features

In [7]:
# Read the TSV files written in section 1. Column 0 is the id, column 1 the
# label and column 3 the text; column 2 is the constant 'a' placeholder and is
# not read.
train_ds = SingleSentenceClassificationProcessor.create_from_csv(
    file_name="./data/transformer_beer_train.tsv",
    split_name="train",
    column_id=0,
    column_label=1,
    column_text=3,
)
# The validation file was previously tagged split_name="train" (copy-paste slip).
# NOTE(review): split_name looks like a label only — confirm nothing keys on it.
valid_ds = SingleSentenceClassificationProcessor.create_from_csv(
    file_name="./data/transformer_beer_valid.tsv",
    split_name="valid",
    column_id=0,
    column_label=1,
    column_text=3,
)

In [8]:
# Load the tokenizer of the "bert-base-cased" checkpoint; the trailing
# expression displays the tokenizer object.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x7f92f7dd5a00>

In [9]:
# File name for the cached training features, built from the split name, the
# tokenizer class name, the literal "512" and the dataset tag "beer".
cache_name = "cached_{}_{}_{}_{}".format(
    "train",
    tokenizer.__class__.__name__,
    str(512),
    "beer",
)
cache_name

'cached_train_BertTokenizer_512_beer'

In [17]:
# Convert the training examples to features with the tokenizer and save them
# under cache_name so they can be reloaded without recomputing.
# NOTE(review): cache_name embeds "512", but no max length is passed to
# get_features here — confirm the default limit matches.
feat_train = train_ds.get_features(tokenizer)
torch.save(feat_train, cache_name)

In [18]:
# File name for the cached validation features; same pattern as the training
# cache with "valid" as the split name. This rebinds cache_name.
cache_name = "cached_{}_{}_{}_{}".format(
    "valid",
    tokenizer.__class__.__name__,
    str(512),
    "beer",
)
cache_name

'cached_valid_BertTokenizer_512_beer'

In [20]:
# Convert the validation examples to features with the tokenizer and save them
# under the current cache_name (the "valid" name when cells run in order).
feat_valid = valid_ds.get_features(tokenizer)
torch.save(feat_valid, cache_name)

## 2.1 (alternative) Load cached features

In [10]:
# Load the "bert-base-cased" tokenizer again so this section can run without
# the feature-building cells; its class name is needed for the cache file names.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x7f939866c220>

In [11]:
# Rebuild the training cache file name and reload the saved features.
# NOTE(review): torch.load unpickles the file — only load caches that this
# notebook produced.
cache_name = "cached_{}_{}_{}_{}".format(
    "train",
    tokenizer.__class__.__name__,
    str(512),
    "beer",
)
feat_train = torch.load(cache_name)

In [12]:
# Rebuild the validation cache file name and reload the saved features.
# NOTE(review): torch.load unpickles the file — only load caches that this
# notebook produced.
cache_name = "cached_{}_{}_{}_{}".format(
    "valid",
    tokenizer.__class__.__name__,
    str(512),
    "beer",
)
feat_valid = torch.load(cache_name)

## 2.2 Build model and train

In [13]:
# Model configuration for "bert-base-cased" with a 5-way classification head.
# NOTE(review): assumes the TSV labels take exactly the values 0-4 — confirm.
config = AutoConfig.from_pretrained("bert-base-cased", num_labels=5)
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

In [14]:
# Instantiate the sequence-classification model from the "bert-base-cased"
# checkpoint using the 5-label config; the trailing expression prints the
# module tree.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", config=config)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
# Imported here because no earlier cell of this section imports numpy explicitly.
import numpy as np


def simple_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    Both arguments are converted to arrays first, so plain Python lists are
    compared element-wise as well.
    """
    return (np.asarray(preds) == np.asarray(labels)).mean()


def compute_metrics(p: "EvalPrediction") -> Dict:
    """Compute accuracy for the Trainer's evaluation loop.

    ``p.predictions`` holds one row of class scores per example; the
    predicted class is the arg-max along axis 1. It is compared against
    ``p.label_ids``.

    Returns a dict with the single key ``"acc"``.
    """
    # The annotation is quoted so that defining this function does not require
    # EvalPrediction to be bound yet.
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": simple_accuracy(preds, p.label_ids)}

In [16]:
# Training configuration passed to the Trainer: 5 epochs, learning rate 2e-5,
# batch size 16 per GPU for both training and evaluation.
# logging_steps = 1000 matches the 1000-step spacing of the loss and eval
# records in the recorded output.
# NOTE(review): evaluate_during_training and per_gpu_*_batch_size are argument
# names of the transformers release used for this run — confirm them against
# the installed version before re-running.
train_args = TrainingArguments(
    output_dir = "./trfm_out/BEERTEST/",
    do_train = True,
    do_eval = True,
    evaluate_during_training = True,
    
    per_gpu_train_batch_size = 16,
    per_gpu_eval_batch_size = 16,
    
    learning_rate = 2e-5,
    num_train_epochs = 5,
    
    logging_dir = "./trfm_out/BEERTEST/tblog/",
    logging_steps = 1000
)

# train_args.device = torch.device("cuda:0")
# Display the device the arguments resolved to (cuda in the recorded run).
train_args.device

device(type='cuda')

In [17]:
# Wire the model, the training arguments, the cached train/validation features
# and the accuracy metric into a Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=feat_train,
    eval_dataset=feat_valid,
    compute_metrics=compute_metrics,
)

In [18]:
# Run fine-tuning. In the recorded run eval_loss is lowest at step 8000
# (about 2 epochs) and rises afterwards while the training loss keeps
# falling; eval_acc stays between roughly 0.60 and 0.62 throughout.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3995.0, style=ProgressStyle(description_w…



{"loss": 1.0045849207639694, "learning_rate": 1.8998748435544432e-05, "epoch": 0.2503128911138924, "step": 1000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.9008339099299041, "eval_acc": 0.5977329525169582, "epoch": 0.2503128911138924, "step": 1000}
{"loss": 0.8908224972188473, "learning_rate": 1.7997496871088863e-05, "epoch": 0.5006257822277848, "step": 2000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8753023909008962, "eval_acc": 0.6136201356658336, "epoch": 0.5006257822277848, "step": 2000}
{"loss": 0.8714881924986839, "learning_rate": 1.699624530663329e-05, "epoch": 0.7509386733416771, "step": 3000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8983432566388357, "eval_acc": 0.5983577293823634, "epoch": 0.7509386733416771, "step": 3000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3995.0, style=ProgressStyle(description_w…

{"loss": 0.8594616976976395, "learning_rate": 1.599499374217772e-05, "epoch": 1.0012515644555695, "step": 4000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8542980836138405, "eval_acc": 0.6167440199928597, "epoch": 1.0012515644555695, "step": 4000}
{"loss": 0.7734845105707645, "learning_rate": 1.4993742177722154e-05, "epoch": 1.2515644555694618, "step": 5000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8591692642376528, "eval_acc": 0.6143341663691538, "epoch": 1.2515644555694618, "step": 5000}
{"loss": 0.7828256739079952, "learning_rate": 1.3992490613266585e-05, "epoch": 1.5018773466833542, "step": 6000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8433297744151019, "eval_acc": 0.620492681185291, "epoch": 1.5018773466833542, "step": 6000}
{"loss": 0.7757697829902173, "learning_rate": 1.2991239048811016e-05, "epoch": 1.7521902377972465, "step": 7000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8553995977709875, "eval_acc": 0.61745805069618, "epoch": 1.7521902377972465, "step": 7000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3995.0, style=ProgressStyle(description_w…

{"loss": 0.7792644290328026, "learning_rate": 1.1989987484355445e-05, "epoch": 2.002503128911139, "step": 8000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8407222926191529, "eval_acc": 0.6240628347018922, "epoch": 2.002503128911139, "step": 8000}
{"loss": 0.650169583261013, "learning_rate": 1.0988735919899876e-05, "epoch": 2.252816020025031, "step": 9000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.9230038125742859, "eval_acc": 0.6102284898250625, "epoch": 2.252816020025031, "step": 9000}
{"loss": 0.6617942819148303, "learning_rate": 9.987484355444305e-06, "epoch": 2.5031289111389237, "step": 10000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.8997887106195156, "eval_acc": 0.6154944662620493, "epoch": 2.5031289111389237, "step": 10000}
{"loss": 0.6565092722475528, "learning_rate": 8.986232790988738e-06, "epoch": 2.7534418022528158, "step": 11000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.9043093104760419, "eval_acc": 0.6140664048554088, "epoch": 2.7534418022528158, "step": 11000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3995.0, style=ProgressStyle(description_w…

{"loss": 0.6523595194965601, "learning_rate": 7.984981226533167e-06, "epoch": 3.0037546933667083, "step": 12000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 0.9123817904401947, "eval_acc": 0.6150481970724742, "epoch": 3.0037546933667083, "step": 12000}
{"loss": 0.5321464370787143, "learning_rate": 6.9837296620775975e-06, "epoch": 3.254067584480601, "step": 13000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 1.0102684590088657, "eval_acc": 0.605855051767226, "epoch": 3.254067584480601, "step": 13000}
{"loss": 0.5229592728167772, "learning_rate": 5.9824780976220275e-06, "epoch": 3.504380475594493, "step": 14000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 1.0599606648066244, "eval_acc": 0.6038914673330953, "epoch": 3.504380475594493, "step": 14000}
{"loss": 0.515512550547719, "learning_rate": 4.981226533166458e-06, "epoch": 3.7546933667083855, "step": 15000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 1.030276155590841, "eval_acc": 0.6026419136022849, "epoch": 3.7546933667083855, "step": 15000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3995.0, style=ProgressStyle(description_w…

{"loss": 0.5199227917194367, "learning_rate": 3.979974968710889e-06, "epoch": 4.005006257822278, "step": 16000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 1.0473916970416584, "eval_acc": 0.6079078900392717, "epoch": 4.005006257822278, "step": 16000}
{"loss": 0.41437500531971455, "learning_rate": 2.978723404255319e-06, "epoch": 4.25531914893617, "step": 17000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 1.1898709144682416, "eval_acc": 0.6038022134951803, "epoch": 4.25531914893617, "step": 17000}
{"loss": 0.42155947017669676, "learning_rate": 1.9774718397997496e-06, "epoch": 4.505632040050062, "step": 18000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 1.1601220290262928, "eval_acc": 0.6034451981435202, "epoch": 4.505632040050062, "step": 18000}
{"loss": 0.4171086375564337, "learning_rate": 9.762202753441802e-07, "epoch": 4.755944931163955, "step": 19000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=701.0, style=ProgressStyle(description_w…


{"eval_loss": 1.172910152994277, "eval_acc": 0.6011245983577294, "epoch": 4.755944931163955, "step": 19000}




TrainOutput(global_step=19975, training_loss=0.6555593380753478)