In [2]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from heapq import *
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
from torchtext.datasets import IMDB
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

from BertExtractors import *

%load_ext autoreload
%autoreload 2

In [7]:
# Load the HTML-stripped IMDB reviews, building the cleaned CSV from the raw
# one if it does not exist yet, then carve out the train/test subsets.
# Cleaning approach thanks to
# https://www.kaggle.com/code/rafaeltiedra/step-by-step-imdb-sentiment-analysis
RAW_CSV = "../data/IMDBDataset.csv"
CLEAN_CSV = "../data/IMDBDataset_CleanHTML.csv"


def process(x):
    """Return review text `x` with HTML tags and URLs replaced by spaces.

    Leading/trailing whitespace is stripped from the result.
    """
    # Raw strings: '\S' is not a valid escape in a normal string literal and
    # triggers an invalid-escape warning on recent Python versions.
    x = re.sub(r'<.*?>', ' ', x)
    x = re.sub(r'http\S+', ' ', x)
    return x.strip()


try:
    df = pd.read_csv(CLEAN_CSV)
except FileNotFoundError:
    # First run on this machine: derive the cleaned file from the raw dataset.
    raw_df = pd.read_csv(RAW_CSV)
    # Encode the label as 1 (positive) / 0 (anything else).
    raw_df['sentiment'] = (raw_df['sentiment'] == 'positive').astype('int')
    raw_df['review'] = raw_df['review'].apply(process)
    raw_df.to_csv(CLEAN_CSV, index = False)
    # Re-read so `df` is exactly what later runs will load from disk.
    df = pd.read_csv(CLEAN_CSV)

# First 2000 rows are used for training; rows 25000-25999 for testing.
train = df.iloc[:2000,]
test = df.iloc[25000:26000,]
# Re-base the test index at 0 so it lines up with freshly built feature frames.
test = test.reset_index(drop = True)

In [3]:
# Fetch the stock (not fine-tuned) BERT encoder and its matching tokenizer
# through the Hugging Face torch.hub entry point. Both come from the same
# repo and checkpoint; only the entry-point kind differs.
hub_repo = 'huggingface/pytorch-transformers'
checkpoint = 'bert-base-uncased'
model_bert, tokenizer = (
    torch.hub.load(hub_repo, entry_point, checkpoint)
    for entry_point in ('model', 'tokenizer')
)

Using cache found in C:\Users\arthu/.cache\torch\hub\huggingface_pytorch-transformers_main
Using cache found in C:\Users\arthu/.cache\torch\hub\huggingface_pytorch-transformers_main


# Contiguous 150 Extractor

In [113]:
# Contiguous-150 extractor: reduce each test review to its extracted token
# ids, run those through BERT, and save the pooled features plus the label.
extractor = BertContiguousKExtractor(model_bert, tokenizer, 150)
n_reviews = 1000  # number of test reviews to process (matches the file name)
output = []
for i in range(n_reviews):
    if i % 50 == 0:
        print(f"Location: {i}")
    test_ids = extractor.extract(test.iloc[i].review)
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        output.append(model_bert(test_ids).pooler_output)

# Convert to a NumPy array explicitly before handing the data to pandas.
# Building a DataFrame straight from a torch.Tensor depends on the pandas
# version and can leave tensor objects in the cells instead of floats.
features = torch.cat(output, dim=0).cpu().numpy()
# One row per review, one column per pooled feature.
test_feat_df = pd.DataFrame(features)
# Take exactly as many labels as rows were processed and re-base the index,
# so the column-wise concat cannot introduce NaN-padded rows.
labels = test['sentiment'].iloc[:n_reviews].reset_index(drop = True)
test_feat_df = pd.concat([test_feat_df, labels], axis = 1)
test_feat_df.to_csv("../data/contiguous150_test_1000.csv", index = False)
print("Files saved successfully")

Location: 0
Location: 50
Location: 100
Location: 150
Location: 200
Location: 250
Location: 300
Location: 350
Location: 400
Location: 450
Location: 500
Location: 550
Location: 600
Location: 650
Location: 700
Location: 750
Location: 800
Location: 850
Location: 900
Location: 950
Files saved successfully


## Extract Rationales (Contiguous 150, first 5 training reviews)

In [15]:
# Text rationales picked by the contiguous k=150 extractor for the first
# five *training* reviews, saved alongside their sentiment labels.
extractor = BertContiguousKExtractor(model_bert, tokenizer, 150)
output = []
for i in range(5):
    # Progress marker every 50 reviews (fires only at 0 for this short run).
    if not i % 50:
        print(f"Location: {i}")
    rationale_ids = extractor.extract(train['review'].iloc[i])
    rationale_text = extractor.input_ids_to_text(rationale_ids)
    output.append(rationale_text)

# One row per review: the extracted rationale followed by the label column.
rationale_df = pd.concat(
    [pd.DataFrame(output), train['sentiment'].iloc[:5]],
    axis = 1,
)
rationale_df.to_csv("../data/text_rationales/contiguous150_train_0005.csv", index = False)
print("Files saved successfully")

Location: 0
Files saved successfully


# Contiguous 50 Extractor

In [17]:
# Contiguous-50 extractor: reduce each test review to its extracted token
# ids, run those through BERT, and save the pooled features plus the label.
extractor = BertContiguousKExtractor(model_bert, tokenizer, 50)
n_reviews = 1000  # number of test reviews to process (matches the file name)
output = []
for i in range(n_reviews):
    if i % 50 == 0:
        print(f"Location: {i}")
    input_ids = extractor.extract(test.iloc[i].review)
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        output.append(model_bert(input_ids).pooler_output)

# Convert to a NumPy array explicitly before handing the data to pandas.
# Building a DataFrame straight from a torch.Tensor depends on the pandas
# version and can leave tensor objects in the cells instead of floats.
features = torch.cat(output, dim=0).cpu().numpy()
# One row per review, one column per pooled feature.
output_df = pd.DataFrame(features)
# Take exactly as many labels as rows were processed and re-base the index,
# so the column-wise concat cannot introduce NaN-padded rows.
labels = test['sentiment'].iloc[:n_reviews].reset_index(drop = True)
output_df = pd.concat([output_df, labels], axis = 1)
output_df.to_csv("../data/contiguous050_test_1000.csv", index = False)
print("Files saved successfully")

Location: 0
Location: 50
Location: 100
Location: 150
Location: 200
Location: 250
Location: 300
Location: 350
Location: 400
Location: 450
Location: 500
Location: 550
Location: 600
Location: 650
Location: 700
Location: 750
Location: 800
Location: 850
Location: 900
Location: 950
Files saved successfully


## Extract Rationales (Contiguous 50, first 60 test reviews)

In [10]:
# Text rationales picked by the contiguous k=50 extractor for the first
# sixty test reviews, saved alongside their sentiment labels.
extractor = BertContiguousKExtractor(model_bert, tokenizer, 50)
output = []
for i in range(60):
    # Progress marker every 20 reviews.
    if not i % 20:
        print(f"Location: {i}")
    rationale_ids = extractor.extract(test['review'].iloc[i])
    rationale_text = extractor.input_ids_to_text(rationale_ids)
    output.append(rationale_text)

# One row per review: the extracted rationale followed by the label column.
rationale_df = pd.concat(
    [pd.DataFrame(output), test['sentiment'].iloc[:60]],
    axis = 1,
)
rationale_df.to_csv("../data/text_rationales/contiguous050_test_0060.csv", index = False)
print("Files saved successfully")

Location: 0
Location: 20
Location: 40
Files saved successfully


# Top 150 Extractor

In [101]:
# Top-150 extractor: scratch run that extracts pooled BERT features for the
# first `n_reviews` training reviews and writes them to a throwaway CSV.
extractor = BertTopKExtractor(model_bert, tokenizer, 150)
n_reviews = 1  # smoke-test size; raise this to process more training reviews
output = []
for i in range(n_reviews):
    if i % 50 == 0:
        print(f"Location: {i}")
    train_ids = extractor.extract(train.iloc[i].review)
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        output.append(model_bert(train_ids).pooler_output)

# Convert to a NumPy array explicitly before handing the data to pandas.
# Building a DataFrame straight from a torch.Tensor depends on the pandas
# version and can leave tensor objects in the cells instead of floats.
features = torch.cat(output, dim=0).cpu().numpy()
# One row per processed review, one column per pooled feature.
train_feat_df = pd.DataFrame(features)
# Attach only the labels of the reviews that were actually processed.
# Concatenating the full `train['sentiment']` column would pad the frame with
# NaN feature rows for every review that was not run through the model.
labels = train['sentiment'].iloc[:n_reviews].reset_index(drop = True)
train_feat_df = pd.concat([train_feat_df, labels], axis = 1)
train_feat_df.to_csv("../data/foobar.csv", index = False)
print("Files saved successfully")

Location: 0
Files saved successfully


# Top 50 Extractor

In [10]:
# Top-50 extractor: reduce each test review to its extracted token ids, run
# those through BERT, and save the pooled features plus the label.
extractor = BertTopKExtractor(model_bert, tokenizer, 50)
n_reviews = 1000  # number of test reviews to process (matches the file name)
output = []
for i in range(n_reviews):
    if i % 50 == 0:
        print(f"Location: {i}")
    input_ids = extractor.extract(test.iloc[i].review)
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        output.append(model_bert(input_ids).pooler_output)

# Convert to a NumPy array explicitly before handing the data to pandas.
# Building a DataFrame straight from a torch.Tensor depends on the pandas
# version and can leave tensor objects in the cells instead of floats.
features = torch.cat(output, dim=0).cpu().numpy()
# One row per review, one column per pooled feature.
output_df = pd.DataFrame(features)
# Take exactly as many labels as rows were processed and re-base the index,
# so the column-wise concat cannot introduce NaN-padded rows.
labels = test['sentiment'].iloc[:n_reviews].reset_index(drop = True)
output_df = pd.concat([output_df, labels], axis = 1)
output_df.to_csv("../data/top050_test_1000.csv", index = False)
print("Files saved successfully")

Location: 0
Location: 50
Location: 100
Location: 150
Location: 200
Location: 250
Location: 300
Location: 350
Location: 400
Location: 450
Location: 500
Location: 550
Location: 600
Location: 650
Location: 700
Location: 750
Location: 800
Location: 850
Location: 900
Location: 950
Files saved successfully


## Extract Rationales (Top 50, first 60 test reviews)

In [11]:
# Text rationales picked by the top-k (k=50) extractor for the first sixty
# test reviews, saved alongside their sentiment labels.
extractor = BertTopKExtractor(model_bert, tokenizer, 50)
output = []
for i in range(60):
    # Progress marker every 50 reviews.
    if not i % 50:
        print(f"Location: {i}")
    rationale_ids = extractor.extract(test['review'].iloc[i])
    rationale_text = extractor.input_ids_to_text(rationale_ids)
    output.append(rationale_text)

# One row per review: the extracted rationale followed by the label column.
rationale_df = pd.concat(
    [pd.DataFrame(output), test['sentiment'].iloc[:60]],
    axis = 1,
)
rationale_df.to_csv("../data/text_rationales/top050_test_0060.csv", index = False)
print("Files saved successfully")

Location: 0
Location: 50
Files saved successfully


# Vanilla BERT Extractor

In [96]:
#The feature representations produced by BERT from the entire input
output = []
#Extract the 1000 test features
for i in range(0, 5):
    if i % 50 == 0:
        print(f"Location: {i}")
    query = test.iloc[i].review
    encoding = tokenizer(query, 
                            return_tensors = 'pt',
                            max_length = 512,
                            truncation = 'longest_first',
                            )
    with torch.no_grad():
        output.append(model_bert(encoding['input_ids']).pooler_output)
#DFs containing the 768 extracted features
train_feat_df = pd.DataFrame(torch.cat(output, dim=0))
#Add the sentiment
train_feat_df = pd.concat([train_feat_df, test['sentiment']],
                          axis = 1)
train_feat_df.to_csv("../data/foobar.csv", index = False)
print("Files saved successfully")

Location: 0
Files saved successfully
