We import `tensorflow` to access the core TensorFlow functionality, `Tokenizer` to preprocess text data by tokenizing it into sequences, and `pad_sequences` to pad the sequences to a fixed length.

In [49]:
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import numpy as np 
import random 
import json 
  
import warnings 
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd 
from datasets import load_dataset

In [50]:
# Fetch the SetFit Amazon MASSIVE intent dataset (en-US) and print the
# (rows, columns) shape of every split it contains.
dataset = load_dataset("SetFit/amazon_massive_intent_en-US")
print(dataset.shape)

# Turn the train and test splits into pandas DataFrames
# (the validation split is not used here).
training_set, test_set = (
    dataset[split_name].to_pandas() for split_name in ('train', 'test')
)

# Preview the first rows of the training frame.
training_set.head()

{'train': (11514, 4), 'validation': (2033, 4), 'test': (2974, 4)}


Unnamed: 0,id,label,text,label_text
0,1,48,wake me up at nine am on friday,alarm_set
1,2,48,set an alarm for two hours from now,alarm_set
2,4,46,olly quiet,audio_volume_mute
3,5,46,stop,audio_volume_mute
4,6,46,olly pause for ten seconds,audio_volume_mute


In [76]:
import spacy

# Load spaCy's small English pipeline ("en_core_web_sm") once at module level;
# preprocess_func below calls this `nlp` object on every text it processes.
nlp = spacy.load("en_core_web_sm") 

# Words dropped by preprocess_func when the caller supplies no stop-word
# collection of its own. Entries are lowercase because tokens are lowercased
# before the membership check.
DEFAULT_STOP_WORDS = frozenset(
    {'i', 'the', 'but', 'no', 'or', 'a', 'an', 'to', 'my', 'me'}
)


def preprocess_func(text, stop_words=DEFAULT_STOP_WORDS, pipeline=None):
    """Remove stop words and punctuation from ``text`` and lemmatize the rest.

    Parameters
    ----------
    text : str
        Raw utterance to clean.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to ``DEFAULT_STOP_WORDS``; any
        container supporting ``in`` works (a set gives constant-time lookup).
    pipeline : callable, optional
        Callable that turns ``text`` into an iterable of tokens exposing
        ``lemma_`` and ``is_punct`` and whose ``str()`` is the token text.
        Defaults to the module-level spaCy ``nlp`` object.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces (``""`` when no
        token is kept).
    """
    if pipeline is None:
        # Resolved at call time so the function can be defined before, or
        # tested without, the spaCy model being loaded.
        pipeline = nlp

    doc = pipeline(text)
    return " ".join(
        token.lemma_
        for token in doc
        if not token.is_punct and str(token).lower() not in stop_words
    )

In [77]:
# Rank the intent labels by frequency over train + test together and keep the
# 15 most common ones; the rest of the analysis is limited to these labels.
# NOTE(review): the selection also looks at the test split, and the cell that
# follows reassigns training_set / test_set, so re-running this cell after it
# sees the already-filtered frames (the recorded shape (8435, 5) is consistent
# with that) -- confirm this is intended.
concat_data = pd.concat([training_set, test_set])
print("Combined DF shape:", concat_data.shape)
top_15_labels = concat_data["label_text"].value_counts().index[:15].tolist()

Combined DF shape: (8435, 5)


In [78]:
# Filtering dataset to only have top 15 labels.
# .copy() makes each filtered frame independent of the frame it was sliced
# from, so the column assignment below does not write into a slice (the
# pattern pandas reports as SettingWithCopyWarning).
training_set = training_set[training_set['label_text'].isin(top_15_labels)].copy()
test_set = test_set[test_set['label_text'].isin(top_15_labels)].copy()

# Applying Preprocessing: the cleaned text is stored in a new column so the
# raw 'text' column stays available for comparison.
training_set['preprocessed_txt'] = training_set['text'].apply(preprocess_func)
test_set['preprocessed_txt'] = test_set['text'].apply(preprocess_func)

# Separating X_train, X_test, y_train, and y_test
X_train = training_set['preprocessed_txt']
y_train = training_set['label']

X_test = test_set['preprocessed_txt']
y_test = test_set['label']


In [79]:
# Sanity check: features and labels must have the same length within each split.
print(X_train.shape, y_train.shape, sep="\n")
print("\n")
print(X_test.shape, y_test.shape, sep="\n")

(6664,)
(6664,)


(1771,)
(1771,)


In [80]:
# First ten raw utterances, for comparison with the preprocessed text.
print("Before preprocessing:")
training_set['text'].head(10)

Before preprocessing:


21                           check when the show starts
22        i want to listen arijit singh song once again
23                  i want to play that music one again
24                                check my car is ready
25                           check my laptop is working
26           is the brightness of my screen running low
27    i need to have location services on can you check
28                   check the status of my power usage
29                   i am not tired i am actually happy
30              olly i am not tired i am actually happy
Name: text, dtype: object

In [81]:
# The same first ten rows after stop-word removal and lemmatization.
print("After preprocessing:")
X_train.head(10)

After preprocessing:


21                          check when show start
22       want listen arijit singh song once again
23                 want play that music one again
24                             check car be ready
25                           check laptop be work
26                be brightness of screen run low
27    need have location service on can you check
28                    check status of power usage
29                 be not tired be actually happy
30            olly be not tired be actually happy
Name: preprocessed_txt, dtype: object

In [82]:
# Integer label ids for the first ten training rows.
y_train.head(10)

21    32
22    45
23    45
24    12
25    12
26    12
27    12
28    12
29    12
30    12
Name: label, dtype: int64

#### Approach #1: sklearn 

In [103]:
# Number of training documents that will be vectorized.
len(X_train)

6664

In [119]:
# Text features: TF-IDF weights over the preprocessed training text.
# (A plain bag-of-words CountVectorizer(ngram_range=(1,2)) was left commented
# out here as an alternative.)
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training text and vectorize it
# in a single step; the result is a sparse document-term matrix.
X_train_cv = v.fit_transform(X_train.to_numpy())
X_train_cv


<6664x3418 sparse matrix of type '<class 'numpy.float64'>'
	with 38599 stored elements in Compressed Sparse Row format>

In [120]:
# Show the dense TF-IDF vector of the first training document. The row is
# sliced while the matrix is still sparse, so only that one row is densified
# instead of the whole document-term matrix.
X_train_cv[:1].toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [121]:
# The feature matrix and the label vector must agree on the number of rows.
print(X_train_cv.shape, y_train.shape, sep="\n")

(6664, 3418)
(6664,)


In [122]:
# Spot-check the fitted vocabulary: look up the feature name stored at
# column index 1226 of the document-term matrix.
v.get_feature_names_out()[1226]

'global'

In [123]:
#v.vocabulary_

In [143]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X=X_train_cv, y=y_train)

In [136]:
# Vectorize the test text with the vectorizer already fitted on the training
# data (transform only, no refit), so both splits share one vocabulary.
X_test_cv = v.transform(X_test)

In [144]:
# Evaluate the performance: predict the test labels, then print per-class
# precision / recall / F1 (classes are shown by their integer label id).
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      0.75      0.85        88
           9       0.98      0.60      0.74        72
          12       0.60      0.47      0.53       169
          13       0.85      0.97      0.90       156
          22       0.86      0.86      0.86       124
          26       0.97      0.63      0.77        57
          30       0.96      0.72      0.82        67
          32       0.59      0.70      0.64       126
          33       0.96      0.81      0.88       114
          36       0.98      0.65      0.78        72
          44       0.89      0.93      0.91       119
          45       0.83      0.97      0.89       176
          47       1.00      0.81      0.90        81
          49       0.80      0.70      0.75       141
          50       0.62      0.95      0.75       209

    accuracy                           0.79      1771
   macro avg       0.86      0.77      0.80      1771
weighted avg       0.82   