In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
     ---------------------------------------- 84.0/84.0 kB 2.4 MB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [8]:
# Imports
%env WANDB_DISABLED=true

import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    logging
)


# If libraries aren't found, install them like cell above

In [20]:
# Read in 40,000 review balanced dataset for regions.
# The location can be overridden with the REVIEWS_CSV_PATH environment variable so the
# notebook runs on other machines; the default is the original local path.
REVIEWS_CSV_PATH = os.environ.get(
    "REVIEWS_CSV_PATH",
    "C:\\Users\\zackc\\Desktop\\NLP Project\\balanced_reviews40000cased.csv",
)
dfSmall = pd.read_csv(REVIEWS_CSV_PATH)

In [21]:
# Collapse every run of whitespace into a single space and strip leading/trailing whitespace
dfSmall["text"] = dfSmall["text"].str.replace(r'\s+', ' ', regex=True).str.strip()

# Mappings for labels to ids and id to labels
# Needed for BERT fine-tuning and evaluation, model expects numeric labels
label2id = {
    'Midwest': 0,
    'Northeast': 1,
    'South': 2,
    'West': 3
}

# Derived from label2id so the two mappings can never drift apart
id2label = {idx: name for name, idx in label2id.items()}

dfSmall["label"] = dfSmall["region"].map(label2id)

# Series.map leaves NaN for any region that is not a key of label2id.
# Fail here with a clear message instead of passing NaN labels on to training.
unmapped_regions = dfSmall.loc[dfSmall["label"].isna(), "region"].unique().tolist()
if unmapped_regions:
    raise ValueError(f"Regions missing from label2id: {unmapped_regions}")

# Sanity Check
print(dfSmall.head())

# Shuffle Data (fixed random_state keeps the row order reproducible)
dfSmall = dfSmall.sample(frac=1, random_state=42).reset_index(drop=True)

# Sanity Check
print(dfSmall.head())



                                                text   region merged_region  \
0  Ate here on 7/22/17 at 11:30 AM.Only one emplo...  Midwest          West   
1  I come here for the crab legs. Seriously! $7.9...  Midwest          West   
2  Love this place! You get what you pay for with...  Midwest          West   
3  So much fun! A fantastically unique experience...  Midwest          West   
4  Dr. Mike reached out to me, and was very kind ...  Midwest          West   

   label  
0      0  
1      0  
2      0  
3      0  
4      0  
                                                text     region merged_region  \
0  Dr. Bream is an amazing doctor! I highly recom...       West          West   
1  Fantastic place! Great wine and liquor selecti...  Northeast          East   
2  BLUF: I'd go back if I were in the area. Glad ...      South          East   
3  I can't really top what others have said. This...    Midwest          West   
4  I have to echo all the bad reviews that have a...  

In [17]:
%env WANDB_DISABLED=true
from transformers import logging


# Hand only the two columns the model consumes (raw text and numeric label)
# over to a Hugging Face Dataset.
model_columns_df = dfSmall[["text", "label"]]
dataset = Dataset.from_pandas(model_columns_df)

# Tokenizer matching the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize the "text" field of a batch with the module-level ``tokenizer``.

    Every example is truncated and padded to a fixed length of 256 tokens.

    Args:
        batch: mapping with a "text" entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=256,
        padding="max_length",
    )
    return encoded


# Tokenize and prepare dataset
dataset = dataset.map(tokenize, batched=True)
# Expose only the model inputs and the label, as torch tensors
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Shuffle and split into train/test (90% / 10%).
# train_test_split shuffles again by default, so it needs its own seed;
# without it the held-out set differs on every run.
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]


# Load the pretrained "bert-base-uncased" weights with a sequence-classification head
# sized to the number of entries in label2id; the label mappings are passed through
# to the model via id2label / label2id.
num_classes = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=num_classes,
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation result.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with keys "accuracy" and "f1" (F1 uses ``average="weighted"``).
    """
    scores, gold = eval_pred
    # Predicted class = index of the largest score along axis 1
    predicted = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=200,
    # load_best_model_at_end requires checkpoints to line up with evaluations:
    # the save strategy must match the eval strategy and save_steps must be a round
    # multiple of eval_steps. The default save_steps (500) is not a multiple of 200
    # and is rejected when the arguments are validated, so set it explicitly.
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",       # the "f1" key returned by compute_metrics
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),   # mixed precision only when a CUDA GPU is present
    report_to="none",                 # no external experiment trackers
)




# Wire the model, data, metrics and training configuration together.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in favor of
# `processing_class=` — confirm against the installed version before switching.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune
trainer.train()

# Final evaluation pass after training has finished
metrics = trainer.evaluate()
print(" Eval Output:", metrics)

# Persist the model and the tokenizer side by side in the same local directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./bert-yelp-model")


env: WANDB_DISABLED=true


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(3)


  trainer = Trainer(
  arr = np.array(obj)
  arr = np.array(obj)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [24]:
# Get predictions
predictions = trainer.predict(eval_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Pin the class order explicitly. Without `labels=`, the report infers the classes from
# the values actually present, and raises if that count differs from len(target_names);
# the confusion matrix would likewise silently drop a row/column for an absent class.
class_ids = sorted(id2label)
class_names = [id2label[i] for i in class_ids]

# Classification report
print("\n Classification Report:\n")
print(classification_report(labels, preds, labels=class_ids, target_names=class_names))

# Confusion matrix (rows = true class, columns = predicted class, both in class_ids order)
print("\n Confusion Matrix:\n")
print(confusion_matrix(labels, preds, labels=class_ids))

KeyboardInterrupt: 