In [1]:
! rm -rf fine_tuned_openthaigpt_yelp_lora openthaigpt_yelp_results wandb

In [2]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import random
from datasets import Dataset
import numpy as np

def load_and_preprocess_data(file_path, num_samples=100000):
    with open(file_path, 'r') as f:
        data = []
        for line in f:
            if random.random() < num_samples / 50000000:
                item = json.loads(line)
                item['sentiment'] = 'positive' if item['stars'] > 3 else 'negative' if item['stars'] < 3 else 'neutral'
                data.append(item)
            if len(data) >= num_samples:
                break
    return data

# Load and preprocess the data
data = load_and_preprocess_data('../yelp_academic_dataset_review.json', num_samples=100000)

# Split the data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize the text
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Create datasets
train_dataset = Dataset.from_dict({
    'text': [item['text'] for item in train_data],
    'labels': [{'positive': 0, 'neutral': 1, 'negative': 2}[item['sentiment']] for item in train_data]
})

test_dataset = Dataset.from_dict({
    'text': [item['text'] for item in test_data],
    'labels': [{'positive': 0, 'neutral': 1, 'negative': 2}[item['sentiment']] for item in test_data]
})

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of testing samples: {len(test_dataset)}")

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).astype(np.float32).mean().item()}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# ... (‡πÇ‡∏Ñ‡πâ‡∏î‡∏™‡πà‡∏ß‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡πÜ ‡∏¢‡∏±‡∏á‡∏Ñ‡∏á‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÄ‡∏î‡∏¥‡∏°)

# ‡∏´‡∏•‡∏±‡∏á‡∏à‡∏≤‡∏Å‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÄ‡∏™‡∏£‡πá‡∏à
# ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡∏∞ tokenizer
model_save_path = "./restaurant_review_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

# ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏•‡πÇ‡∏°‡πÄ‡∏î‡∏•
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# ‡πÇ‡∏´‡∏•‡∏î‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡∏∞ tokenizer ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô
from transformers import AutoModelForSequenceClassification, AutoTokenizer

loaded_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_save_path)

classifier = pipeline("text-classification", model=loaded_model, tokenizer=loaded_tokenizer)

# ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô analyze_restaurant ‡∏¢‡∏±‡∏á‡∏Ñ‡∏á‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÄ‡∏î‡∏¥‡∏°
def analyze_restaurant(review_text):
    result = classifier(review_text)[0]
    sentiment = result['label']
    confidence = result['score']
    
    analysis = f"Sentiment: {sentiment} (Confidence: {confidence:.2f})\n\n"
    
    if "location" in review_text.lower():
        analysis += "Location: The review mentions the restaurant's location.\n"
    
    if "parking" in review_text.lower():
        analysis += "Amenities: The restaurant has parking facilities.\n"
    
    if "wifi" in review_text.lower() or "wi-fi" in review_text.lower():
        analysis += "Amenities: The restaurant offers Wi-Fi.\n"
    
    if "reservation" in review_text.lower():
        analysis += "Services: The restaurant accepts reservations.\n"
    
    return analysis

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô
review = "The restaurant is located in downtown, with easy parking. The food was amazing and the service was excellent. They also have free Wi-Fi for customers."
print(analyze_restaurant(review))

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11079/11079 [00:02<00:00, 4433.64 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2770/2770 [00:00<00:00, 5267.39 examples/s]


Number of training samples: 11079
Number of testing samples: 2770


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maekanun2020[0m. Use [1m`wandb login --relogin`[0m to force relogin


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.313548,0.880505
2,0.484100,0.280484,0.888087
3,0.264000,0.318758,0.900361


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Evaluation results: {'eval_loss': 0.318758100271225, 'eval_accuracy': 0.9003610014915466, 'eval_runtime': 18.841, 'eval_samples_per_second': 147.02, 'eval_steps_per_second': 6.157, 'epoch': 3.0}
Sentiment: LABEL_0 (Confidence: 1.00)

Amenities: The restaurant has parking facilities.
Amenities: The restaurant offers Wi-Fi.



In [3]:
# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô
review = "‡∏£‡πâ‡∏≤‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏ô‡∏µ‡πâ‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏°‡∏°‡∏≤‡∏Å! ‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏™‡∏î‡πÉ‡∏´‡∏°‡πà ‡∏£‡∏™‡∏ä‡∏≤‡∏ï‡∏¥‡πÄ‡∏Ç‡πâ‡∏°‡∏Ç‡πâ‡∏ô ‡∏ö‡∏£‡∏¥‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡∏ó‡∏±‡∏ö‡πÉ‡∏à ‡∏ö‡∏£‡∏£‡∏¢‡∏≤‡∏Å‡∏≤‡∏®‡∏™‡∏ö‡∏≤‡∏¢‡πÜ ‡∏£‡∏≤‡∏Ñ‡∏≤‡∏™‡∏°‡πÄ‡∏´‡∏ï‡∏∏‡∏™‡∏°‡∏ú‡∏• ‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏•‡∏±‡∏ö‡∏°‡∏≤‡∏≠‡∏µ‡∏Å‡πÅ‡∏ô‡πà‡∏ô‡∏≠‡∏ô!"
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 0.58)




In [20]:
# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô
review = "‡πÅ‡∏¢‡πà‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î! ‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏£‡∏™‡∏ä‡∏≤‡∏ï‡∏¥‡πÅ‡∏¢‡πà ‡∏û‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡∏´‡∏¢‡∏≤‡∏ö‡∏Ñ‡∏≤‡∏¢ ‡∏£‡∏≤‡∏Ñ‡∏≤‡πÅ‡∏û‡∏á‡πÄ‡∏Å‡∏¥‡∏ô‡πÑ‡∏õ ‡∏™‡∏†‡∏≤‡∏û‡∏£‡πâ‡∏≤‡∏ô‡∏™‡∏Å‡∏õ‡∏£‡∏Å ‡πÑ‡∏°‡πà‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥‡πÄ‡∏•‡∏¢"
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 0.41)




In [21]:
review = "Worst ever! The food tastes terrible, the staff is rude, prices are way too high, and the restaurant is dirty. Absolutely not recommended!"
print(analyze_restaurant(review))

Sentiment: LABEL_2 (Confidence: 0.99)




In [22]:
review = '''Worst ever! The food tastes terrible, 
the staff is rude, prices are way too high, and the restaurant is dirty. 
Absolutely not recommended!'''
print(analyze_restaurant(review))

Sentiment: LABEL_2 (Confidence: 0.99)




In [23]:
review = "Absolutely amazing experience! The food was delicious, service impeccable, and the ambiance perfect for a romantic dinner. Every dish we tried was a masterpiece. Definitely coming back soon!"
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 1.00)




In [28]:
review = "Definitely coming back soon!"
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 1.00)




In [29]:
review = "If you're visiting, don't miss their signature dish - the truffle risotto. It's creamy, aromatic, and absolutely delicious. Make sure to book in advance as it gets pretty busy, especially on weekends."
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 1.00)




In [30]:
review = "The food here is excellent, especially the steaks. However, portion sizes are a bit small for the price. The atmosphere is great, but it can get noisy when busy. They could improve by adding more vegetarian options to the menu."
print(analyze_restaurant(review))

Sentiment: LABEL_1 (Confidence: 0.63)




In [31]:
review = "I've tried most Italian restaurants in town, and this one tops the list. The pasta is homemade and cooked to perfection - way better than the overcooked noodles at that place down the street. Prices are similar to other upscale Italian joints, but the quality here justifies every penny."
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 0.99)




In [32]:
review = "I visited this charming little bistro last night and was pleasantly surprised. The decor is rustic yet elegant, creating a cozy atmosphere. We started with the bruschetta, which was fresh and flavorful. For the main course, I had the grilled salmon - perfectly cooked and seasoned. My partner enjoyed the mushroom risotto, rich and creamy. The wine list is impressive, with a good selection of both local and imported wines. Service was attentive without being intrusive. Prices are on the higher side, but justified by the quality. One small issue - the wait time for dessert was a bit long. Overall, a great spot for a special dinner."
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 1.00)




In [33]:
review = "Great food, friendly staff, nice ambiance. A bit pricey but worth it."
print(analyze_restaurant(review))

Sentiment: LABEL_0 (Confidence: 0.99)




In [34]:
train_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 11079
})

In [38]:
train_data[347]

{'review_id': 'w1rA19kVKKsWz8gTZB1F8A',
 'user_id': 'WCHkk639H3ygjRSliIeWqQ',
 'business_id': 'PBtZNTQl5tRReeC8It1G7g',
 'stars': 5.0,
 'useful': 0,
 'funny': 0,
 'cool': 0,
 'text': "Love this place! My first visit was for a work conference more than 2 years ago, and now its my favorite vacation spot. I've stayed in every type of room and my favorite is the junior suite. It's large enough to bring your friends and share the room for a long weekend, and the balcony view of the bay plus the quiet and beautiful setting farther down from the pool make it worth the extra few dollars.\n\nEvery time we visit we note how pleasant and helpful the staff are to us. Violet in the women's lounge made our current visit absolutely amazing! Our room service staff have been top notch, providing extras of our favorites when asked. \n\nThe spa amenities are lovely and relaxing. Multiple pools are available inside and out, sauna and steam rooms, amazing shower products are complimentary in the showers. I