In [26]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from data_preprocessing import *

In [27]:
data = pd.read_excel(open('Consolidated.xlsx', 'rb'), sheet_name='Consolidated')
data

Unnamed: 0,S.No,Filepath,Foldername,Filename,Hotel,Review,Target
0,1,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_1.txt,hilton,We stay at Hilton for 4 nights last march. It ...,True Positive
1,2,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_10.txt,hilton,This is a stunning hotel in an excellent locat...,True Positive
2,3,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_11.txt,hilton,Staying at this hotel was one of the high poin...,True Positive
3,4,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_12.txt,hilton,"went to chicago for a week in may, decided to ...",True Positive
4,5,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_13.txt,hilton,We stayed here from Nov. 30 to Dec 2 and had a...,True Positive
...,...,...,...,...,...,...,...
3195,3196,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_5.txt,palmer,"Overall, the hotel was okay. Though I did have...",False Negative
3196,3197,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_6.txt,palmer,I was not pleased with my recent stay at the P...,False Negative
3197,3198,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_7.txt,palmer,Our visit started off on the wrong foot when w...,False Negative
3198,3199,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_8.txt,palmer,"Though grand and having a brand, this hotel se...",False Negative


In [28]:
# Convert labels to numerical values
data['Target'] = data['Target'].astype('category')
data['label'] = pd.Categorical(data['Target'])
data['label'] = data['Target'].cat.codes
data

Unnamed: 0,S.No,Filepath,Foldername,Filename,Hotel,Review,Target,label
0,1,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_1.txt,hilton,We stay at Hilton for 4 nights last march. It ...,True Positive,3
1,2,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_10.txt,hilton,This is a stunning hotel in an excellent locat...,True Positive,3
2,3,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_11.txt,hilton,Staying at this hotel was one of the high poin...,True Positive,3
3,4,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_12.txt,hilton,"went to chicago for a week in may, decided to ...",True Positive,3
4,5,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_13.txt,hilton,We stayed here from Nov. 30 to Dec 2 and had a...,True Positive,3
...,...,...,...,...,...,...,...,...
3195,3196,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_5.txt,palmer,"Overall, the hotel was okay. Though I did have...",False Negative,0
3196,3197,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_6.txt,palmer,I was not pleased with my recent stay at the P...,False Negative,0
3197,3198,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_7.txt,palmer,Our visit started off on the wrong foot when w...,False Negative,0
3198,3199,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_8.txt,palmer,"Though grand and having a brand, this hotel se...",False Negative,0


In [29]:
le = joblib.load('saved_models/le.pkl')
df_final_test = data.query('Foldername == "fold1"')
df = data.query('Foldername != "fold1"')
df['Review'] = df['Review'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Review'] = df['Review'].apply(preprocess_text)


In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

def pre_process_data(X, y, max_seq, num_classes, token_model_save=True):
    tokenizer = Tokenizer()
    if token_model_save:
        tokenizer.fit_on_texts(X)
        with open('saved_models/tokenizer.pickle', 'wb') as f:
            pickle.dump(tokenizer, f) 
    else:
        with open('saved_models/tokenizer.pickle', 'rb') as f:
            tokenizer = pickle.load(f)
    X_train_sequences = tokenizer.texts_to_sequences(X)
    # max_seq_length = 250  # Choose the maximum sequence length
    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_seq, padding='post')
    target_labels = tf.keras.utils.to_categorical(le.transform(y), num_classes)
    return X_train_padded, target_labels

In [31]:
num_classes = 4

max_seq_length = 250
X, y = pre_process_data(df['Review'].apply(preprocess_text), df['Target'], max_seq_length, num_classes)

In [32]:
# Split data into train, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Review'], df['label'], test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)


In [33]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [34]:
# Tokenize text data
def tokenize_text(texts):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens = True,
                            max_length = 64,
                            pad_to_max_length = True,
                            return_attention_mask = True,
                            return_tensors = 'pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

train_input_ids, train_attention_masks = tokenize_text(train_texts)
val_input_ids, val_attention_masks = tokenize_text(val_texts)
test_input_ids, test_attention_masks = tokenize_text(test_texts)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [35]:
# Convert labels to tensors

train_labels = torch.tensor(train_labels.values)
val_labels = torch.tensor(val_labels.values)
test_labels = torch.tensor(test_labels.values)

In [36]:
# Create DataLoader for train, validation, and test sets
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

In [37]:
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)


In [38]:
# Define BERT model for binary classification
class BERTClassifier(nn.Module):
    def __init__(self):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, 1)  # BERT-base has 768 hidden units

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

In [39]:
# Initialize BERT model and optimizer
model = BERTClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)


In [40]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss_fn = nn.BCEWithLogitsLoss()
        loss = loss_fn(logits.squeeze(-1), labels.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)

In [None]:
# Validation loop
model.eval()
val_preds = []
val_targets = []
with torch.no_grad():
    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        logits = model(input_ids, attention_mask)
        preds = torch.sigmoid(logits.squeeze(-1))
        val_preds.extend(preds.cpu().numpy())
        val_targets.extend(labels.cpu().numpy())
val_preds = np.array(val_preds) >= 0.5
val_accuracy = accuracy_score(val_targets, val_preds)
print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss}, Val Accuracy: {val_accuracy}')

Epoch 3/3, Train Loss: -10.2992251932621, Val Accuracy: 0.340625


In [None]:
# Evaluation on test set
model.eval()
test_preds = []
test_targets = []
with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        logits = model(input_ids, attention_mask)
        preds = torch.sigmoid(logits.squeeze(-1))
        test_preds.extend(preds.cpu().numpy())
        test_targets.extend(labels.cpu().numpy())
test_preds = np.array(test_preds) >= 0.5
test_accuracy = accuracy_score(test_targets, test_preds)
print(f'Test Accuracy: {test_accuracy}')
print(f'Target: {test_targets[0]},  predicted: {test_preds[0]}')

Test Accuracy: 0.36875
Target: 1,  predicted: True
