In [1]:
import nltk
import numpy as np
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import torch
from transformers import BertTokenizer

In [2]:
# Read each split from CSV and keep only its leading rows: the first 2000
# rows of the training file and the first 1000 rows of the test file.
train = pd.read_csv("../data/train.csv").iloc[:2000]
test = pd.read_csv("../data/test.csv").iloc[:1000]

In [3]:
# Regex-based word tokenizer: tokens are the matches of the pattern \w+.
nltk_tokenizer = RegexpTokenizer(r'\w+')

# Number of word tokens in each review, one entry per row of the `review`
# column, computed separately for the train and test splits.
train_word_lengths = [len(words) for words in map(nltk_tokenizer.tokenize, train.review)]
test_word_lengths = [len(words) for words in map(nltk_tokenizer.tokenize, test.review)]

In [5]:
# Report min / max / mean / standard deviation of the per-review word counts
# for each split, followed by the sum of the `sentiment` column (reported as
# the positive-review count; assumes 0/1 labels - TODO confirm).
# The trailing "\n" on the last line of each group leaves a blank line after it.
print(
    f"Min train word length: {min(train_word_lengths)}",
    f"Max train word length: {max(train_word_lengths)}",
    f"Average train word length: {np.mean(train_word_lengths)}",
    f"SD train word length: {np.std(train_word_lengths)}",
    f"Total positive reviews in train: {sum(train.sentiment)}\n",
    sep="\n",
)
print(
    f"Min test word length: {min(test_word_lengths)}",
    f"Max test word length: {max(test_word_lengths)}",
    f"Average test word length: {np.mean(test_word_lengths)}",
    f"SD test word length: {np.std(test_word_lengths)}",
    f"Total positive reviews in test: {sum(test.sentiment)}\n",
    sep="\n",
)

Min train word length: 18
Max train word length: 1532
Average train word length: 230.419
SD train word length: 166.7656662475823
Total positive reviews in train: 1005

Min test word length: 25
Max test word length: 1020
Average test word length: 240.918
SD test word length: 173.3413143944628
Total positive reviews in test: 541



# BERT Tokenizer

In [28]:
# Load the pretrained tokenizer for the vanilla `bert-base-uncased` checkpoint.
# Only the tokenizer is loaded here; no model weights are needed to count tokens.
# This uses the BertTokenizer class imported at the top of the notebook rather
# than torch.hub.load, which resolves and runs hub code from the repository's
# unpinned `main` branch and so is harder to reproduce.
# NOTE(review): the hub entry point may have returned the "fast" tokenizer
# implementation - confirm token counts are unchanged after re-running.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Using cache found in C:\Users\arthu/.cache\torch\hub\huggingface_pytorch-transformers_main


In [31]:
# Number of BERT tokens produced by `bert_tokenizer.tokenize` for each review,
# one entry per row of the `review` column, for the train and test splits.
# NOTE(review): tokenize() presumably does not add special tokens such as
# [CLS]/[SEP] - confirm before comparing these counts to a model input limit.
train_bert_lengths = [len(pieces) for pieces in map(bert_tokenizer.tokenize, train.review)]
test_bert_lengths = [len(pieces) for pieces in map(bert_tokenizer.tokenize, test.review)]

In [32]:
# Report min / max / mean / standard deviation of the untruncated BERT token
# counts for the train split, then for the test split, one statistic per line.
print("\n".join([
    f"Min train token length: {min(train_bert_lengths)}",
    f"Max train token length: {max(train_bert_lengths)}",
    f"Average train token length: {np.mean(train_bert_lengths)}",
    f"SD train token length: {np.std(train_bert_lengths)}",
    f"Min test token length: {min(test_bert_lengths)}",
    f"Max test token length: {max(test_bert_lengths)}",
    f"Average test token length: {np.mean(test_bert_lengths)}",
    f"SD test token length: {np.std(test_bert_lengths)}",
]))

Min train token length: 24
Max train token length: 1797
Average train token length: 285.595
SD train token length: 208.33075139066725
Min test token length: 35
Max test token length: 1323
Average test token length: 298.738
SD test token length: 214.59573005071653


In [38]:
# Count, per split, the reviews whose BERT token count is strictly greater
# than 512 (a review with exactly 512 tokens is not counted).
print(f"Train reviews longer than 512 tokens: {sum(1 for length in train_bert_lengths if length > 512)}")
print(f"Test reviews longer than 512 tokens: {sum(1 for length in test_bert_lengths if length > 512)}")

Train reviews longer than 512 tokens: 240
Test reviews longer than 512 tokens: 133


In [39]:
# Train and test token counts after capping every review at 512 tokens:
# counts above 512 become 512, all others are kept unchanged.
# NOTE(review): if the model input also carries special tokens (e.g. [CLS] and
# [SEP]) the usable budget per review may be smaller than 512 - confirm against
# the encoding step used for training.
trunc_train_bert_lengths = [min(length, 512) for length in train_bert_lengths]
trunc_test_bert_lengths = [min(length, 512) for length in test_bert_lengths]

In [40]:
# Same summary statistics as for the raw BERT token counts, but computed on the
# lengths capped at 512 (the labels are intentionally identical to the
# untruncated report).
print(
    f"Min train token length: {min(trunc_train_bert_lengths)}",
    f"Max train token length: {max(trunc_train_bert_lengths)}",
    f"Average train token length: {np.mean(trunc_train_bert_lengths)}",
    f"SD train token length: {np.std(trunc_train_bert_lengths)}",
    sep="\n",
)
print(
    f"Min test token length: {min(trunc_test_bert_lengths)}",
    f"Max test token length: {max(trunc_test_bert_lengths)}",
    f"Average test token length: {np.mean(trunc_test_bert_lengths)}",
    f"SD test token length: {np.std(trunc_test_bert_lengths)}",
    sep="\n",
)

Min train token length: 24
Max train token length: 512
Average train token length: 258.726
SD train token length: 136.72124898493286
Min test token length: 35
Max test token length: 512
Average test token length: 267.811
SD test token length: 139.32881711620178
