In [1]:
# Start from a clean slate: remove artifacts of earlier runs (the Trainer output
# directory results/, the wandb/ run folder and previously saved model folders).
# This deletion is recursive and irreversible.
! rm -rf results/ wandb/ fine_tuned_model_10k/ fine_tuned_model_improved/

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

# Restrict this process to GPU 0 only.
# The environment variable is assigned after `import torch` but before the first
# torch.cuda call in this notebook.
# NOTE(review): torch.cuda.set_device(0) presumably raises on a machine without a
# CUDA device — confirm if this notebook must also run on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.cuda.set_device(0)

# 1. Load and prepare the data
def load_data(business_file, review_file, sample_size=1000000):
    """Load Yelp businesses and reviews and build the text used for fine-tuning.

    Both files are read as JSON Lines (one JSON object per line).

    Args:
        business_file: Path to the business file. Every record needs a
            ``business_id`` and a ``city`` key; ``categories`` is optional.
        review_file: Path to the review file. Every record needs
            ``business_id``, ``stars`` and ``text``.
        sample_size: Maximum number of review *lines* read from the top of the
            file. Lines whose business is unknown still count toward the limit
            but are dropped. ``None`` reads the whole file.

    Returns:
        tuple: ``(df, businesses)``. ``df`` has one row per kept review plus
        the columns ``categories``, ``city`` and ``full_text``; it is empty
        (but still has those columns) when no review is kept. ``businesses``
        maps ``business_id`` to the raw business record.
    """
    # Read with an explicit encoding so the result does not depend on the
    # platform's default text encoding.
    businesses = {}
    with open(business_file, 'r', encoding='utf-8') as f:
        for line in f:
            business = json.loads(line)
            businesses[business['business_id']] = business

    # Keep only reviews that belong to a known business.
    reviews = []
    with open(review_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if sample_size is not None and i >= sample_size:
                break
            review = json.loads(line)
            if review['business_id'] in businesses:
                reviews.append(review)

    if reviews:
        df = pd.DataFrame(reviews)
    else:
        # pd.DataFrame([]) has no columns at all; create the ones used below so
        # an empty selection yields an empty frame instead of a KeyError.
        df = pd.DataFrame(columns=['business_id', 'stars', 'text'])

    # Attach business attributes to every review (the business name is left out).
    df['categories'] = df['business_id'].map(lambda x: businesses[x].get('categories', ''))
    df['city'] = df['business_id'].map(lambda x: businesses[x]['city'])

    # Build the model input. Categories are deliberately not part of the text,
    # to test what the model can do without them. Iterating the three columns
    # side by side avoids the per-row Series that df.apply(axis=1) constructs.
    df['full_text'] = [
        f"‡πÄ‡∏°‡∏∑‡∏≠‡∏á: {city}\n‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô: {stars}/5\n‡∏£‡∏µ‡∏ß‡∏¥‡∏ß: {text}\n\n"
        for city, stars, text in zip(df['city'], df['stars'], df['text'])
    ]

    return df, businesses

# Load the businesses and the first 1,000,000 review lines.
df, businesses = load_data(
    '../yelp_academic_dataset_business.json',
    '../yelp_academic_dataset_review.json',
    sample_size=1000000,
)
print(f"‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡∏ó‡∏µ‡πà‡πÇ‡∏´‡∏•‡∏î: {len(df)}")

# 2. Prepare the data for fine-tuning: every business_id is its own class.
df['label'] = df['business_id']
# Class ids follow the order in which each business first appears in df.
id_to_label = dict(enumerate(df['label'].unique()))
label_to_id = {label: idx for idx, label in id_to_label.items()}
df['label_id'] = df['label'].map(label_to_id)

# Hold out 20% of the reviews for validation, with a fixed seed.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# 3. Prepare the datasets from the two columns the model needs.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame[['full_text', 'label_id']])
    for frame in (train_df, val_df)
)

# 4. Prepare the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_prepare(examples):
    """Tokenize a batch of examples and attach their class ids as ``labels``.

    Args:
        examples: Batch with the keys ``full_text`` (texts to encode) and
            ``label_id`` (class ids).

    Returns:
        The output of the module-level ``tokenizer`` (called with truncation and
        ``max_length`` padding at 512) with an added ``labels`` entry.
    """
    encoded = tokenizer(
        examples['full_text'],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    encoded['labels'] = examples['label_id']
    return encoded

# Tokenize both splits; the original columns are dropped so only the tokenizer
# output and `labels` remain.
tokenized_train = train_dataset.map(tokenize_and_prepare, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(tokenize_and_prepare, batched=True, remove_columns=val_dataset.column_names)

# 5. Prepare the model and Trainer
# Store the class-index <-> business_id mapping in the model config, so a saved
# checkpoint can translate its predictions back to business ids without the
# in-memory dictionaries of this notebook session.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)
model.to('cuda:0')

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted id).

    Returns:
        dict: The keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) of the weighted result is not reported.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted_scores[:3]

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Training configuration. Evaluation and checkpointing both run once per epoch,
# and the checkpoint with the best "f1" (a key returned by compute_metrics) is
# the one requested to be loaded back when training ends.
training_args = TrainingArguments(
    output_dir='./results',  # checkpoints are written here
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,  # a log entry every 10 optimisation steps
    evaluation_strategy="epoch",
    save_strategy="epoch",  # same value as evaluation_strategy
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    remove_unused_columns=False,
    no_cuda=False,
    fp16=True,  # half precision; see the TrainingArguments documentation
)

# The Trainer gets the pre-tokenized splits; no tokenizer or data collator is
# passed explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# 6. Fine-tune the model
print("‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£ fine-tune ‡πÇ‡∏°‡πÄ‡∏î‡∏•...")
trainer.train()
print("Fine-tune ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô")

# 7. Save the model
trainer.save_model("./fine_tuned_model_improved")
# The Trainer was created without a tokenizer, so save_model() does not write
# one. Save it next to the model so the directory can be reloaded on its own.
tokenizer.save_pretrained("./fine_tuned_model_improved")
print("‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢")

  from .autonotebook import tqdm as notebook_tqdm


‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡∏ó‡∏µ‡πà‡πÇ‡∏´‡∏•‡∏î: 1000000


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800000/800000 [03:36<00:00, 3693.46 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200000/200000 [00:51<00:00, 3897.65 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£ fine-tune ‡πÇ‡∏°‡πÄ‡∏î‡∏•...


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maekanun2020[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,3.6092,3.684651,0.392415,0.31343,0.314384,0.392415


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
# 8. Function for matching a text to businesses
def match_business_to_text(user_text, top_n=5):
    """Return the businesses the fine-tuned classifier ranks highest for a text.

    Uses the module-level ``tokenizer``, ``model``, ``id_to_label`` and
    ``businesses``.

    Args:
        user_text: Free-text query to classify.
        top_n: Number of candidates to return; capped at the number of classes
            the model outputs.

    Returns:
        list[dict]: One dict per candidate in the order returned by
        ``torch.topk``, with the keys ``business_id``, ``name``,
        ``categories``, ``city`` and ``probability``.
    """
    # Inference must not run with dropout active; the model may still be in
    # training mode after Trainer.train().
    model.eval()

    # Send the inputs to wherever the model lives instead of assuming 'cuda:0'.
    inputs = tokenizer(user_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # torch.topk raises when k is larger than the number of classes.
    k = min(top_n, probabilities.shape[-1])
    top_n_probs, top_n_indices = torch.topk(probabilities, k)

    results = []
    # Index 0 selects the single query of the batch.
    for prob, idx in zip(top_n_probs[0], top_n_indices[0]):
        business_id = id_to_label[idx.item()]
        print ("MODEL's ANSWER ============> ", business_id)
        business_info = businesses[business_id]
        results.append({
            'business_id': business_id,
            'name': business_info['name'],
            'categories': business_info.get('categories', ''),
            'city': business_info['city'],
            'probability': prob.item()
        })

    return results

# 9. Function for displaying the results
def display_results(results):
    """Print a numbered report of the matches produced by match_business_to_text.

    For every match this prints its name, id, categories, city, probability and
    match level, followed by the first 200 characters of up to three reviews of
    that business taken from the module-level ``df``.

    Args:
        results: Iterable of dicts with the keys ``name``, ``business_id``,
            ``categories``, ``city`` and ``probability``.
    """
    separator = "\n" + "="*50 + "\n"

    for rank, match in enumerate(results, start=1):
        print(f"{rank}. {match['name']}")
        print(f"   Business ID: {match['business_id']}")
        print(f"   Categories: {match['categories']}")
        print(f"   City: {match['city']}")
        print(f"   Probability: {match['probability']:.4f}")
        print(f"   Match level: {get_match_level(match['probability'])}")

        # Show a sample of this business's reviews.
        print("\n   Reviews:")
        belongs_to_match = df['business_id'] == match['business_id']
        first_reviews = df.loc[belongs_to_match, 'text'].head(3)  # first 3 reviews
        for number, review_text in enumerate(first_reviews, start=1):
            # Only the first 200 characters of each review are shown.
            print(f"   Review {number}: {review_text[:200]}...")

        print(separator)

def get_match_level(probability):
    """Map a probability to a coarse, human-readable match label.

    Args:
        probability: Numeric score, compared against fixed thresholds.

    Returns:
        str: ``"Very High"`` above 0.8, ``"High"`` above 0.6, ``"Moderate"``
        above 0.4, ``"Low"`` above 0.2, otherwise ``"Very Low"``. All
        comparisons are strict, so a value equal to a threshold falls into the
        lower level.
    """
    # Ordered from strictest to loosest; the first threshold exceeded wins.
    thresholds = (
        (0.8, "Very High"),
        (0.6, "High"),
        (0.4, "Moderate"),
        (0.2, "Low"),
    )
    for cutoff, level in thresholds:
        if probability > cutoff:
            return level
    return "Very Low"

In [None]:
# 10. Usage example
# NOTE(review): this query is not English (and appears mis-encoded here), while
# the tokenizer and model above are 'bert-base-uncased' — presumably much of it
# is not meaningfully tokenized; confirm before trusting this ranking.
user_text = "‡∏â‡∏±‡∏ô‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤‡∏£‡πâ‡∏≤‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏≠‡∏¥‡∏ï‡∏≤‡πÄ‡∏•‡∏µ‡∏¢‡∏ô‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏û‡∏≤‡∏™‡∏ï‡πâ‡∏≤‡πÅ‡∏•‡∏∞‡∏û‡∏¥‡∏ã‡∏ã‡πà‡∏≤‡∏≠‡∏£‡πà‡∏≠‡∏¢‡πÜ"
# top_n is left at its default of 5, matching the heading printed below.
results = match_business_to_text(user_text)

print(f"User Text: {user_text}\n")
print("Top 5 Matching Businesses:")
display_results(results)

In [None]:
# 10. Usage example (English query)
user_text = "I'm looking for an Italian restaurant with delicious pasta and pizza."
# top_n is left at its default of 5, matching the heading printed below.
results = match_business_to_text(user_text)

print(f"User Text: {user_text}\n")
print("Top 5 Matching Businesses:")
display_results(results)