In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, DataCollatorWithPadding

In [2]:
# Load the IMDB dataset from a local on-disk copy (path is relative to the working directory).
imdb = load_from_disk("../data/imdb")

In [3]:
# Name of the base checkpoint; the tokenizer is loaded for this checkpoint and
# reused by the tokenization function and the data collator defined later.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# --- Run configuration -------------------------------------------------------

# Tokenization and batching
max_sequence_length = 128   # truncation length (in tokens) used by tokenize_function
batch_size = 32             # per-device batch size for both training and evaluation

# Optimization / evaluation schedule
learning_rate = 2e-5
num_train_epochs = 5
eval_steps = 100
early_stopping_patience = 10  # NOTE(review): not referenced by any cell visible in this notebook

# Filesystem locations (relative to the working directory)
output_dir = "../output/"
model_dir = "../models/"

In [5]:
def tokenize_function(example):
    """Tokenize the "text" field of an example (or batch of examples).

    Sequences longer than ``max_sequence_length`` tokens are truncated. No padding
    is applied here on purpose: the ``DataCollatorWithPadding`` created right after
    this function pads each batch to its own longest sequence, so padding every
    example to the maximum length at map time would only store and process extra
    pad tokens.

    Args:
        example: Mapping with a "text" entry; a list of strings when called
            through ``map(..., batched=True)``.

    Returns:
        The tokenizer's encoding of the text.
    """
    return tokenizer(example["text"], truncation=True, max_length=max_sequence_length)


# Collator that pads the examples of a batch with the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split of the dataset, feeding the function batches of rows.
tokenized_datasets = imdb.map(function=tokenize_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

In [6]:
# Drop the raw text column and rename "label" to "labels" in one chain, then
# switch the output format to torch tensors and show the resulting train columns.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [7]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    dev: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24000
    })
})

In [8]:
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
# Load the already fine-tuned sequence-classification model identified by
# "artemis13fowl/bert-base-uncased-imdb".
model_finetuned = BertForSequenceClassification.from_pretrained("artemis13fowl/bert-base-uncased-imdb")

In [10]:
model_finetuned

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import numpy as np

In [12]:
def compute_metrics(p):
    """Score a set of predictions against their reference labels.

    Args:
        p: A two-item sequence that unpacks into ``(scores, labels)``. The
            predicted class of each row is the argmax of ``scores`` along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1", each
        computed by the corresponding scikit-learn scorer with default settings.
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=1)
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true=y_true, y_pred=y_pred) for name, scorer in scorers.items()}

In [13]:
# Trainer configuration. Checkpointing is tied explicitly to the evaluation
# cadence so that `load_best_model_at_end` always has a checkpoint for the step
# whose metric was best, instead of relying on the library's default save_steps.
training_args = TrainingArguments(
    output_dir + "bert-base-uncased-imdb",
    evaluation_strategy="steps",
    eval_steps=eval_steps,  # evaluate every `eval_steps` steps
    save_steps=eval_steps,  # save on the same schedule as evaluation
    # Keep at most one rotating checkpoint.
    # NOTE(review): with load_best_model_at_end the best checkpoint may be kept
    # in addition to the most recent one — confirm against the TrainingArguments docs.
    save_total_limit=1,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    metric_for_best_model="f1",
    greater_is_better=True,  # a higher F1 is better; stated explicitly rather than inferred
    load_best_model_at_end=True,
)

In [14]:
# Wrap the fine-tuned model in a Trainer; no train/eval datasets are attached,
# the datasets are supplied at call time.
trainer_eval = Trainer(
    model=model_finetuned,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [15]:
tokenized_datasets["test"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 24000
})

In [16]:
# Evaluate on a random 1000-review subset of the test split. The shuffle is
# seeded so the same reviews are drawn on every run and the report below is
# reproducible after a kernel restart.
test_subset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
predictions = trainer_eval.predict(test_subset)
print(predictions.predictions.shape, predictions.label_ids.shape)

# Predicted class = index of the highest score in each row.
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(predictions.label_ids, preds))

(1000, 2) (1000,)
              precision    recall  f1-score   support

           0       0.91      0.86      0.89       511
           1       0.86      0.92      0.89       489

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000



In [None]:
from huggingface_hub import create_repo
# exist_ok=True makes the cell safe to re-run: an already existing repo is
# accepted instead of raising an error.
create_repo("bert-base-uncased-imdb", exist_ok=True)

In [None]:
from huggingface_hub import upload_file

# Upload the model's config.json to the Hub repo. The local path is passed by
# keyword because newer huggingface_hub releases accept these arguments as
# keyword-only; the keyword form works on older releases as well.
upload_file(
    path_or_fileobj=model_dir + "bert-base-uncased-imdb" + "/config.json",
    path_in_repo="config.json",
    repo_id="artemis13fowl/bert-base-uncased-imdb",
)

In [None]:
from huggingface_hub import Repository

# Create a Repository handle for the local folder "huggingface_repo1", cloned from
# "artemis13fowl/bert-base-uncased-imdb".
repo = Repository("huggingface_repo1", clone_from="artemis13fowl/bert-base-uncased-imdb")

In [None]:
# Pull the latest remote state into the local clone before adding files.
repo.git_pull()

In [None]:
# Write the fine-tuned model's files into the local clone folder.
model_finetuned.save_pretrained("huggingface_repo1")

In [None]:
# Stage, commit and push the saved model files; the three steps must run in this order.
repo.git_add()
repo.git_commit("Add bert-base-uncased-imdb")
repo.git_push()

### Common-sense evaluation: tokenizer and pipeline sanity checks

In [112]:
# Reload the tokenizer for `checkpoint`; this repeats the load done near the top
# of the notebook and rebinds the same name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [116]:
tokenizer("Hello here i am wherre are you")

{'input_ids': [101, 7592, 2182, 1045, 2572, 1059, 5886, 2890, 2024, 2017, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [117]:
tokenizer("Hello here i am wherre are you",truncation=True, padding="max_length", max_length=3 )

{'input_ids': [101, 7592, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [123]:
tokenizer.encode("Hello here i am wherre are you")

[101, 7592, 2182, 1045, 2572, 1059, 5886, 2890, 2024, 2017, 102]

In [124]:
tokenizer.encode_plus("Hello here i am wherre are you")

{'input_ids': [101, 7592, 2182, 1045, 2572, 1059, 5886, 2890, 2024, 2017, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [125]:
tokenizer.decode([101, 7592, 2182, 1045, 2572, 1059, 5886, 2890, 2024, 2017, 102])

'[CLS] hello here i am wherre are you [SEP]'

In [113]:
from transformers import pipeline
# Build a sentiment-analysis pipeline around the in-memory fine-tuned model and the tokenizer.
pipe = pipeline("sentiment-analysis", model=model_finetuned, tokenizer=tokenizer)

In [114]:
# Draw five test reviews for a manual spot check; the shuffle is seeded so the
# same five reviews are selected on every run.
sample = imdb['test'].shuffle(seed=42).select(range(5))

In [115]:

# For each of the five sampled reviews: print the gold label and full text, its
# length in characters and whitespace-separated words, then the pipeline's
# prediction on the first 200 words.
for i in range(5):
    row = sample[i]
    text, label = row['text'], row['label']
    words = text.split()
    print('-'*20)
    print(label, text )
    print(len(text), len(words))
    print(pipe(" ".join(words[:200])))

--------------------
0 I'm a big fan of the "Vacation" franchise, and I love Randy Quaid as Cousin Eddie, and at least a couple of the behind-the-scenes names were involved in this project (most notably Matty Simmons, who produced or executive-produced all 4 of the theatrical releases, as well as "Animal House"). For those reasons I figured this made-for-TV spin off might be worth checking out, even without Chevy Chase.<br /><br />For the record, I did not expect it to be very good; I just thought it might be a slightly amusing diversion. Therefore, my high level of disappointment goes to prove just how bad this utter turd of a movie really was. It was mind-numbingly, jaw-droppingly, heart-stoppingly, head-explodingly terrible. Yet, somehow, I could not stop watching it. It's a sickness I have; I can't seem to walk out on a film or give up on a TV show before it ends. Nothing has ever made me want two hours of my life back more than this movie.
935 165
[{'label': 'LABEL_0', 'score': 0.

In [102]:
# Baseline: classify the unmodified review text.
original_review = "Latest attempt to revive the series actually based on a pretty good idea but without the required gore fx/violence for this type of thriller - and thus... BORING!! Good special fx, sets, costumes, etc. but the film comes of just plain silly and a near-waste of time... hopefully the next installment will correct this problem."
pipe(original_review)

[{'label': 'LABEL_0', 'score': 0.9981714487075806}]

In [103]:
# Perturbed copy of the same review with two words misspelled:
# silly -> sally, waste -> easte
perturbed_review = "Latest attempt to revive the series actually based on a pretty good idea but without the required gore fx/violence for this type of thriller - and thus... BORING!! Good special fx, sets, costumes, etc. but the film comes of just plain sally and a near-easte of time... hopefully the next installment will correct this problem."
pipe(perturbed_review)

[{'label': 'LABEL_1', 'score': 0.9867995381355286}]

In [92]:
sent = """Oh If any day u wanna see a supernatural thriller turning out to be a comedy watch this movie<br /><br />
This film was a shocker as it had so many actors in it but what they do and how they fit in?<br /><br />
The handling of the college scenes is like a school play where each person comes talk and then the next person comes infront<br /><br />
Okay reasons to laugh at the film: 1) Akshay, Suneil, Aditya Panscholi, Sharad Kapoor, Arshad Warsi as college students 
2)Akshay carries a gun in college 3) some pathetic stunts and SFX<br /><br />
there are several more flaws like why doesn't the snake save his lover from being raped and comes in so late? 
also why he doesn't kill all of them together there only?<br /><br />But afterall they have to make a 2 hrs + film 
so hence you have a tortorius movie<br /><br />The movie is painful to watch The film was directed by Rajkumar Kohli who was an expert 
making such films in the past and had a successful record of films like JAANI DUSHMAN(1979) and NAGIN 
Rajkumar Kohli wants to help his son's non existent career Right from VIRODHI(1992),Aulad Ke Dushman (1993) and QAHAR(1996) 
all flops he tried hard to promote his son and he also casts big stars so that his son gets noticed, sadly nothing could help his son's career
<br /><br />The film has several comical scenes like the death scenes, how the actors after being bashed by the snake are so fit to fight him 
again and the climax<br /><br />Direction by Rajkumar Kohli is bad Music is bad<br /><br />That brings us to the cast 
Akshay Kumar - ordinary stuff, he has nothing much to do rather then stunts Suneil Shetty- awful Sonu Nigam- the worst debutante award goes 
to him, he gives cartoon acting a new meaning Aftab- terrible Arshad Warsi- nothing to do Sharad Kapoor- bad Aditya Panscholi- irritates 
Sunny Deol- is comical in the scene when he comes to save Sonu LOL Manisha Koirala- ordinary Rambha- Akshay's pair Kiran Kumar, 
Raza Murad are as usual Raj Babbar- hilarious for wrong reasons the girls are awful Which brings us to Munish Kohli 
This guy has a huge physique, he is even more stronger and taller then Akshay Kumar Sadly he comes across as poor man's Akshay 
His voice is awful, his expressions are painful The only thing he has to do in the movie is wear glasses and make an evil face 
Rajat Bedi is awful
"""

# Preview the first 1500 characters of the review text.
sent[:1500]

"Oh If any day u wanna see a supernatural thriller turning out to be a comedy watch this movie<br /><br />\nThis film was a shocker as it had so many actors in it but what they do and how they fit in?<br /><br />\nThe handling of the college scenes is like a school play where each person comes talk and then the next person comes infront<br /><br />\nOkay reasons to laugh at the film: 1) Akshay, Suneil, Aditya Panscholi, Sharad Kapoor, Arshad Warsi as college students \n2)Akshay carries a gun in college 3) some pathetic stunts and SFX<br /><br />\nthere are several more flaws like why doesn't the snake save his lover from being raped and comes in so late? \nalso why he doesn't kill all of them together there only?<br /><br />But afterall they have to make a 2 hrs + film \nso hence you have a tortorius movie<br /><br />The movie is painful to watch The film was directed by Rajkumar Kohli who was an expert \nmaking such films in the past and had a successful record of films like JAANI DUS

In [94]:
# Switch the train split's output format to pandas.
# NOTE(review): the return value is unused, so this appears to change imdb['train']
# in place — later reads of this split would then also return pandas objects; confirm.
imdb['train'].set_format("pandas")

In [96]:
imdb['train'][:5]

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
