In [7]:
# pip install transformers pandas datasets
from datasets import load_dataset

# Load the Yelp Review Full dataset and convert its training split to a DataFrame
dataset = load_dataset('yelp_review_full')
df = dataset['train'].to_pandas()

# Randomly select 100 samples. The fixed random_state makes the draw
# reproducible, so re-running this cell always yields the same 100 reviews.
df = df.sample(n=100, random_state=42)
print(df)

        label                                               text
433435      4  I've been going here for about 4 yrs.Cleanest ...
402242      1  We've wanted to try Thai E-San since they open...
632177      0  The food is ok but the service is terrible!!!!...
595983      3  There's typically a 15-20 minute wait when we ...
551785      3  Five guys offers a basic selection of burgers,...
...       ...                                                ...
140694      3  Good service, good food, good beer, and textbo...
381014      1  The quality of the food here is pretty consist...
328741      1  We arrived at approximately 8:15pm on a Saturd...
573119      0  I was really disappointed about our experience...
476655      2  I had been to Chic Nails once before.  They ha...

[100 rows x 2 columns]


In [8]:
import pandas as pd
# Re-import the reviews from a CSV file. The file is expected to carry the
# hand-assigned 'Manual_Category' column that the accuracy cell compares against.
df = pd.read_csv("yelp_review_full.csv")

# Candidate business categories offered to every zero-shot classifier
classes = [
    "Restaurants",
    "Bars",
    "Coffee Shops",
    "Hotels",
    "Salons/Barbershops",
    "Auto Repair",
    "Home Services",
    "Medical Services",
    "Entertainment",
    "Pet Services",
    "Financial Services",
    "Travel & Tourism",
    "Education",
    "Real Estate",
    "Fitness",
    "Landscaping & Gardening Services",
    "Legal Services",
    "Photography Services",
    "Childcare Services",
    "Computer & Technology Services",
]

Manual Categorization

Model process

In [9]:
from transformers import pipeline

# Model 1: zero-shot classification pipeline backed by facebook/bart-large-mnli
classifier_1 = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)
def model1_review(text):
    """Return the top category that Model 1 assigns to a single review.

    Args:
        text: Review text forwarded to ``classifier_1``.

    Returns:
        The first entry of the ``labels`` list in the classifier output.
    """
    ranked_labels = classifier_1(
        text,
        candidate_labels=classes,
        multi_label=True,
    )['labels']
    # Index 0 is treated as the best match (the pipeline is expected to
    # return labels ordered best-first).
    return ranked_labels[0]

# Label every review with Model 1's top category and store it in 'model1'
df['model1'] = df['text'].map(model1_review)
# Preview the first rows to spot-check the new column
print(df.head())

   label                                               text  \
0      4  I've been going here for about 4 yrs.Cleanest ...   
1      1  We've wanted to try Thai E-San since they open...   
2      0  The food is ok but the service is terrible!!!!...   
3      3  There's typically a 15-20 minute wait when we ...   
4      3  Five guys offers a basic selection of burgers,...   

      Manual_Category         model1  
0  Salons/Barbershops  Entertainment  
1         Restaurants    Restaurants  
2         Restaurants    Restaurants  
3         Restaurants    Restaurants  
4         Restaurants    Restaurants  


In [10]:
# Model 2: knowledgator/comprehend_it-multilingual-t5-base
# pip install liqfit sentencepiece
from liqfit.pipeline import ZeroShotClassificationPipeline
from liqfit.models import T5ForZeroShotClassification
from transformers import T5Tokenizer

# Load the T5 zero-shot model and its tokenizer from the same checkpoint name
model = T5ForZeroShotClassification.from_pretrained(
    'knowledgator/comprehend_it-multilingual-t5-base'
)
tokenizer = T5Tokenizer.from_pretrained(
    'knowledgator/comprehend_it-multilingual-t5-base'
)
# Wrap both in liqfit's zero-shot pipeline. The '{}' template presumably means
# each candidate label is used verbatim as the hypothesis -- confirm in liqfit docs.
classifier_2 = ZeroShotClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    hypothesis_template='{}',
    encoder_decoder=True,
)
def model2_review(text):
    """Return the top category that Model 2 assigns to a single review.

    Args:
        text: Review text forwarded to ``classifier_2``.

    Returns:
        The first entry of the ``labels`` list in the classifier output.
    """
    ranked_labels = classifier_2(
        text,
        candidate_labels=classes,
        multi_label=True,
    )['labels']
    # Index 0 is treated as the best match (the pipeline is expected to
    # return labels ordered best-first).
    return ranked_labels[0]

# Label every review with Model 2's top category and store it in 'model2'
df['model2'] = df['text'].map(model2_review)
# Preview the first rows to spot-check the new column
print(df.head())

You are using a model of type T5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


   label                                               text  \
0      4  I've been going here for about 4 yrs.Cleanest ...   
1      1  We've wanted to try Thai E-San since they open...   
2      0  The food is ok but the service is terrible!!!!...   
3      3  There's typically a 15-20 minute wait when we ...   
4      3  Five guys offers a basic selection of burgers,...   

      Manual_Category         model1         model2  
0  Salons/Barbershops  Entertainment  Home Services  
1         Restaurants    Restaurants   Pet Services  
2         Restaurants    Restaurants    Restaurants  
3         Restaurants    Restaurants    Restaurants  
4         Restaurants    Restaurants    Restaurants  


In [11]:
# Model 3: MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli
# pip install transformers[sentencepiece]
from transformers import pipeline
# Zero-shot classification pipeline backed by the DeBERTa-v3 checkpoint named below
classifier_3 = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
)
def model3_review(text):
    """Return the top category that Model 3 assigns to a single review.

    Args:
        text: Review text forwarded to ``classifier_3``.

    Returns:
        The first entry of the ``labels`` list in the classifier output.
    """
    ranked_labels = classifier_3(
        text,
        candidate_labels=classes,
        multi_label=True,
    )['labels']
    # Index 0 is treated as the best match (the pipeline is expected to
    # return labels ordered best-first).
    return ranked_labels[0]

# Label every review with Model 3's top category and store it in 'model3'
df['model3'] = df['text'].map(model3_review)
# Preview the first rows to spot-check the new column
print(df.head())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


   label                                               text  \
0      4  I've been going here for about 4 yrs.Cleanest ...   
1      1  We've wanted to try Thai E-San since they open...   
2      0  The food is ok but the service is terrible!!!!...   
3      3  There's typically a 15-20 minute wait when we ...   
4      3  Five guys offers a basic selection of burgers,...   

      Manual_Category         model1         model2              model3  
0  Salons/Barbershops  Entertainment  Home Services  Salons/Barbershops  
1         Restaurants    Restaurants   Pet Services         Restaurants  
2         Restaurants    Restaurants    Restaurants         Restaurants  
3         Restaurants    Restaurants    Restaurants         Restaurants  
4         Restaurants    Restaurants    Restaurants         Restaurants  


Comparison and Accuracy Calculation:

In [12]:
# Accuracy = share of reviews whose predicted category equals the manual label
accuracy1 = (df['Manual_Category'] == df['model1']).mean()
accuracy2 = (df['Manual_Category'] == df['model2']).mean()
accuracy3 = (df['Manual_Category'] == df['model3']).mean()

# Report each model's accuracy as a fraction between 0 and 1
print("Accuracy of model 1: ", accuracy1)
print("Accuracy of model 2: ", accuracy2)
print("Accuracy of model 3: ", accuracy3)

Accuracy of model 1:  0.61
Accuracy of model 2:  0.59
Accuracy of model 3:  0.73


### Model 1: facebook/bart-large-mnli
Accuracy: 61%

This model (BART) combines a bidirectional encoder with an autoregressive decoder and is fine-tuned on the MNLI dataset. Its moderate accuracy suggests that, while effective at general language tasks, it may not fully capture the specific nuances of Yelp reviews.

### Model 2: knowledgator/comprehend_it-multilingual-t5-base
Accuracy: 59%

This model is based on the T5 architecture, which frames tasks as text generation and here supports multiple languages. Its slightly lower performance might stem from its generalist training, which is not specialized enough for the particular characteristics of Yelp reviews.

### Model 3: MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli
Accuracy: 73%

DeBERTa enhances BERT architectures with a novel attention mechanism and is further fine-tuned on datasets aimed at improving inference, resulting in the highest accuracy among the three models. This suggests superior handling of the complex semantics in review texts.


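In summary, the varying accuracies likely reflect each model’s architectural strengths and training focus, with DeBERTa standing out due to its advanced inference capabilities and specialized attention mechanism. Because the evaluation covers only 100 manually labelled reviews, small gaps, such as the two-point difference between Models 1 and 2, should be interpreted with caution.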