In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
distil_bert = 'distilbert-base-uncased'

In [None]:
df_original = pd.read_excel("primary_intent_up_english_after_Datathon2.xlsx")

In [None]:
# Work on an explicit copy of the first 1000 rows. A bare slice is tracked by
# pandas as a possible view of df_original, so later in-place edits on `df`
# raise SettingWithCopyWarning.
df = df_original.iloc[:1000].copy()

In [None]:
df.head()

Unnamed: 0,Text,Intent
0,Sec 18,Store
1,Store,Store
2,Store Near Me,Store
3,Stores Near Me,Store
4,Nearest Store to me,Store


In [None]:
# Sort by text and keep the first occurrence of every distinct utterance.
# A new frame is built instead of calling sort_values(..., inplace=True):
# sorting in place on a frame derived from a slice of another frame raises
# SettingWithCopyWarning.
df = df.sort_values('Text', ascending=True)
duplicated_text_series = df.duplicated('Text', keep='first')
# .copy() makes the de-duplicated frame independent, so later column
# assignments on `df` are unambiguous.
df = df[~duplicated_text_series].copy()
print("Total number of text after removing duplicates:", df.shape[0])

Total number of text after removing duplicates: 841


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df.head(20)

Unnamed: 0,Text,Intent
442,Appointment,Store
29,Assisted shopping is there ?,Store
918,Band-collar shirt for women,Product
807,Band-collar tshirt for men,Product
134,Black,Product
132,Black Shoe,Product
193,Black loafers,Product
167,Black shoes,Product
219,Black slippers,Product
551,Blue color flat sandals,Product


In [None]:
import string

In [None]:
text = "stores near me?"

In [None]:
def punctuation_removal(text):
    """Return `text` with every character listed in string.punctuation removed.

    All remaining characters (letters, digits, whitespace, ...) are kept in
    their original order.
    """
    return "".join(ch for ch in text if ch not in string.punctuation)

In [None]:
df["Text"] = df["Text"].apply(punctuation_removal)

In [None]:
df["Intent"].value_counts()

Product    667
Store      106
Loyalty     68
Name: Intent, dtype: int64

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)
model = TFDistilBertModel.from_pretrained(distil_bert)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
tokenized = df["Text"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
max_len = 50
# Build a rectangular (n_texts, max_len) matrix of token ids.
# - Shorter lists are right-padded with 0.
# - Lists longer than max_len are cut to their first max_len - 1 ids plus their
#   final id (the closing special token appended by the tokenizer). Without the
#   cap, one long text would leave the rows with unequal lengths and np.array
#   could not form the matrix.
padded = np.array([
    ids[:max_len - 1] + ids[-1:] if len(ids) > max_len
    else ids + [0] * (max_len - len(ids))
    for ids in tokenized.values
])

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(841, 50)

In [None]:
input_ids =tf.convert_to_tensor(padded)
attention_mask = tf.convert_to_tensor(attention_mask)

In [None]:
last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = df["Intent"]

In [None]:
# Stratify on the labels: the intent classes are heavily imbalanced
# (see the value_counts above), and a stratified split keeps each class's
# share the same in the train and test partitions.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42, stratify=labels
)

In [None]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC

# Primary-intent classifier. Despite the `lr_` prefix, `lr_clf` is a linear SVM.
# Alternatives that are left disabled in this cell:
#   GaussianNB(), MultinomialNB(),
#   svm.SVC(kernel='poly', C=10, probability=True, class_weight='balanced')
lr_clf = LinearSVC(class_weight='balanced', random_state=42)
lr_clf.fit(train_features, train_labels)



LinearSVC(class_weight='balanced', random_state=42)

In [None]:
#lr_clf2=LinearSVC(class_weight='balanced')
#lr_clf2.fit(train_features, train_labels)

In [None]:
#lr_clf = LogisticRegression(multi_class = "multinomial", random_state=42)
#lr_clf.fit(train_features, train_labels)

In [None]:
lr_clf.score(test_features, test_labels)

1.0

In [None]:
y_pred=lr_clf.predict(test_features)

In [None]:
#y_pred=lr_clf2.predict(test_features)

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Keep the result under its own name. Assigning it back to `confusion_matrix`
# replaces the imported function with an array, so any later cell that calls
# confusion_matrix(...) fails with "'numpy.ndarray' object is not callable".
conf_mat = confusion_matrix(test_labels, y_pred)
print(conf_mat)
print(classification_report(test_labels, y_pred))

print("Accuracy:",metrics.accuracy_score(test_labels, y_pred))

[[ 14   0   0]
 [  0 174   0]
 [  0   0  23]]
              precision    recall  f1-score   support

     Loyalty       1.00      1.00      1.00        14
     Product       1.00      1.00      1.00       174
       Store       1.00      1.00      1.00        23

    accuracy                           1.00       211
   macro avg       1.00      1.00      1.00       211
weighted avg       1.00      1.00      1.00       211

Accuracy: 1.0


In [None]:
#confusion_matrix = confusion_matrix(test_labels, y_pred)
#print(confusion_matrix)
#print(classification_report(test_labels, y_pred))
#print("Accuracy:",metrics.accuracy_score(test_labels, y_pred))

In [None]:
def feature_gen(text):
    """Embed one piece of text as the DistilBERT feature row fed to the classifier.

    The text is cleaned with punctuation_removal, tokenized with special
    tokens, right-padded with 0 to `max_len`, and passed through `model`
    together with an attention mask that is 0 on the padding positions.

    Relies on the notebook globals `tokenizer`, `model` and `max_len`.

    Args:
        text: Raw text to embed.

    Returns:
        NumPy array with a single row: the hidden state at sequence position 0
        of the model's first output.
    """
    # Reuse the cleaner applied to the training column so the two cannot drift apart.
    cleaned = punctuation_removal(text)
    token_ids = tokenizer.encode(cleaned, add_special_tokens=True)
    if len(token_ids) > max_len:
        # Cap over-long inputs at max_len while keeping the final (closing
        # special) token, so a query is never longer than the padded width.
        token_ids = token_ids[:max_len - 1] + token_ids[-1:]
    padded_row = np.array([token_ids + [0] * (max_len - len(token_ids))])
    # Real tokens get mask 1, padding (id 0) gets mask 0.
    mask_row = np.where(padded_row != 0, 1, 0)
    outputs = model(
        tf.convert_to_tensor(padded_row),
        attention_mask=tf.convert_to_tensor(mask_row),
    )
    # outputs[0] is the per-token hidden-state tensor; keep position 0 only.
    return outputs[0][:, 0, :].numpy()

In [None]:
#from sklearn.linear_model import SGDClassifier
#from sklearn.calibration import CalibratedClassifierCV

In [None]:
#clf = SGDClassifier(class_weight='balanced', penalty='l2', loss='hinge', random_state=42)
#clf.fit(train_features, train_labels)
#calib_clf = CalibratedClassifierCV(clf, method="sigmoid")
#calib_clf.fit(train_features, train_labels)

In [None]:
#calib_clf.score(test_features, test_labels)

In [None]:
train_labels.value_counts()

Product    493
Store       83
Loyalty     54
Name: Intent, dtype: int64

In [None]:
#calib_clf.classes_

In [None]:
#calib_clf.predict_proba(feature_gen("bataclub")).argmax()

In [None]:
#calib_clf.predict(feature_gen("bataclub"))

In [None]:
# import pickle
# filename = 'finalized_model_english_after_Datathon.sav'
# lr_clf = pickle.load(open(filename, 'rb'))

In [None]:
lr_clf.classes_

array(['Loyalty', 'Product', 'Store'], dtype=object)

In [None]:
lr_clf.predict(feature_gen("bata shoe size"))

array(['Product'], dtype=object)

In [None]:
#lr_clf.predict_proba(feature_gen("namas")).max()

In [None]:
lr_clf.predict(feature_gen("nadia"))[0]

'Store'

In [None]:
lr_clf.predict(feature_gen("store near me"))[0]

'Store'

In [None]:
df[df['Intent']=='Store']

Unnamed: 0,Text,Intent
442,Appointment,Store
29,Assisted shopping is there,Store
34,Distance of Outlet,Store
408,Distance of Shop,Store
352,Distance of store,Store
...,...,...
422,what are the Shop timings,Store
20,what are the store timings,Store
49,when to visit Outlet,Store
76,when to visit Shop,Store


In [None]:
#lr_clf.predict(feature_gen("I want to Shopping"))[0]

In [None]:
lr_clf.predict(feature_gen("how to enroll for membership"))[0]

'Loyalty'

In [None]:
lr_clf.predict(feature_gen("navigate me to near by store"))[0]

'Store'

In [None]:
lr_clf.predict(feature_gen("peter england red t-shirt"))[0]

'Product'

In [None]:
lr_clf.predict(feature_gen("store with parking facility"))[0]

'Store'

### Persisting

In [None]:
# Reload the pretrained DistilBERT tokenizer and model, then write both to a
# local directory.
import pickle
#import numpy
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
distil_bert = 'distilbert-base-uncased'
# These assignments rebind the notebook-wide `tokenizer` and `model` that
# feature_gen reads.
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)
model = TFDistilBertModel.from_pretrained(distil_bert)
# NOTE(review): the target directory has the same name as the model id stored in
# `distil_bert`, so later from_pretrained(distil_bert) calls made from this
# working directory presumably load these local files rather than the hub
# checkpoint -- confirm this is intended.
model.save_pretrained('distilbert-base-uncased')
tokenizer.save_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


('distilbert-base-uncased/tokenizer_config.json',
 'distilbert-base-uncased/special_tokens_map.json',
 'distilbert-base-uncased/vocab.txt',
 'distilbert-base-uncased/added_tokens.json')

#### Extras

In [None]:
lr_clf.predict(feature_gen("need to check"))

array(['Store'], dtype=object)

In [None]:
lr_clf.predict(feature_gen("completely lost"))

array(['Store'], dtype=object)

In [None]:
lr_clf.predict(feature_gen("what's the need"))

array(['Store'], dtype=object)

In [None]:
lr_clf.predict(feature_gen("why should I"))

array(['Product'], dtype=object)

In [None]:
import string

In [None]:
lr_clf.predict(feature_gen("bill"))

array(['Loyalty'], dtype=object)

#### Persisting

In [None]:
import pickle

In [None]:
# filename = '/dbfs/FileStore/tables/intent_classification/finalized_model_english_after_Datathon.sav'
# pickle.dump(lr_clf, open(filename, 'wb'))

## Loyalty Sub-Intent

In [None]:
#df = pd.read_csv("/dbfs/FileStore/tables/intent_classification/finalized_questions1.csv")
df = pd.read_csv("finalized_questions1_after_datathon.csv")

In [None]:
#df.loc[len(df.index)] = ["My loyalty", "Others"]

In [None]:
df.shape

(2285, 2)

In [None]:
df.head()

Unnamed: 0,Text,Sub-intent
0,offer,Coupons
1,Enrol,Others
2,Do you have any offer which are running on pro...,Others
3,What kind of products are available under the ...,Others
4,How can I take this offer?,Others


In [None]:
# Order the rows by text, then drop every text that occurs more than once.
# NOTE(review): keep=False discards *all* copies of a repeated text, whereas the
# primary-intent section uses keep='first' -- confirm the difference is intended.
df = df.sort_values(by='Text', ascending=True)
duplicated_text_series = df.duplicated(subset='Text', keep=False)
df = df.loc[~duplicated_text_series]
print("Total number of text after removing duplicates:", df.shape[0])

Total number of text after removing duplicates: 2079


In [None]:
# Cast to str before cleaning: punctuation_removal iterates over the characters
# of its argument, so a non-string cell (for example NaN from an empty CSV
# field) would raise TypeError if the cast only happened afterwards.
df["Text"] = df["Text"].astype(str).apply(punctuation_removal)

In [None]:
df["Sub-intent"].value_counts()

Points     755
Coupons    734
Others     590
Name: Sub-intent, dtype: int64

In [None]:
df["Text"] = df["Text"].astype(str)
df["Sub-intent"] = df["Sub-intent"].astype(str)

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)
model = TFDistilBertModel.from_pretrained(distil_bert)

All model checkpoint layers were used when initializing TFDistilBertModel.

All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
tokenized = df["Text"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
max_len = 50
# Build a rectangular (n_texts, max_len) matrix of token ids.
# - Shorter lists are right-padded with 0.
# - Lists longer than max_len are cut to their first max_len - 1 ids plus their
#   final id (the closing special token appended by the tokenizer). Without the
#   cap, one long text would leave the rows with unequal lengths and np.array
#   could not form the matrix.
padded = np.array([
    ids[:max_len - 1] + ids[-1:] if len(ids) > max_len
    else ids + [0] * (max_len - len(ids))
    for ids in tokenized.values
])

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2079, 50)

In [None]:
input_ids =tf.convert_to_tensor(padded)
attention_mask = tf.convert_to_tensor(attention_mask)

In [None]:
last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = df["Sub-intent"]

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=42)

In [None]:
train_features.shape

(1559, 768)

In [None]:
train_labels.shape

(1559,)

In [None]:
#from sklearn.naive_bayes import GaussianNB
#lr_clf = GaussianNB()
#lr_clf.fit(train_features, train_labels)

In [None]:
lr_clf=LinearSVC(class_weight='balanced',random_state=44)
lr_clf.fit(train_features, train_labels)



LinearSVC(class_weight='balanced', random_state=44)

In [None]:
# lr_clf = LogisticRegression(multi_class = "multinomial",random_state=42)
# lr_clf.fit(train_features, train_labels)

In [None]:
import pickle
filename = '/dbfs/FileStore/tables/intent_classification/finalized_loyalty_subintent_after_datathon.sav'
# NOTE(review): this replaces the LinearSVC fitted in the previous cell with a
# previously saved model, so the score reported below is for the loaded model.
# pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which a bare open() call leaves open.
with open(filename, 'rb') as model_file:
    lr_clf = pickle.load(model_file)

In [None]:
lr_clf.score(test_features, test_labels)

0.9230769230769231

In [None]:
def feature_gen(text):
    """Embed one piece of text as the DistilBERT feature row fed to the classifier.

    The text is cleaned with punctuation_removal, tokenized with special
    tokens, right-padded with 0 to `max_len`, and passed through `model`
    together with an attention mask that is 0 on the padding positions.

    Relies on the notebook globals `tokenizer`, `model` and `max_len`.

    Args:
        text: Raw text to embed.

    Returns:
        NumPy array with a single row: the hidden state at sequence position 0
        of the model's first output.
    """
    # Reuse the cleaner applied to the training column so the two cannot drift apart.
    cleaned = punctuation_removal(text)
    token_ids = tokenizer.encode(cleaned, add_special_tokens=True)
    if len(token_ids) > max_len:
        # Cap over-long inputs at max_len while keeping the final (closing
        # special) token, so a query is never longer than the padded width.
        token_ids = token_ids[:max_len - 1] + token_ids[-1:]
    padded_row = np.array([token_ids + [0] * (max_len - len(token_ids))])
    # Real tokens get mask 1, padding (id 0) gets mask 0.
    mask_row = np.where(padded_row != 0, 1, 0)
    outputs = model(
        tf.convert_to_tensor(padded_row),
        attention_mask=tf.convert_to_tensor(mask_row),
    )
    # outputs[0] is the per-token hidden-state tensor; keep position 0 only.
    return outputs[0][:, 0, :].numpy()

In [None]:
lr_clf.predict(feature_gen("offers"))[0]

'Coupons'

In [None]:
lr_clf.predict(feature_gen("want to know about redemption of gift vouchers"))[0]

'Coupons'

In [None]:
lr_clf.predict(feature_gen("Can I generate gift voucher online?"))[0]

'Coupons'

In [None]:
lr_clf.predict(feature_gen("gift vouchers"))[0]

'Coupons'

In [None]:
lr_clf.predict(feature_gen("vouchers"))[0]

'Coupons'

In [None]:
lr_clf.predict(feature_gen("point"))[0]

'Points'

In [None]:
lr_clf.predict(feature_gen("what different loyalty programme you have"))[0]

'Others'

In [None]:
lr_clf.predict(feature_gen("on which date did I enroll"))[0]

'Others'

In [None]:
lr_clf.predict(feature_gen("can I redeem points on above 4000 transactions"))[0]

'Points'

In [None]:
lr_clf.predict(feature_gen("in which month did I enroll"))[0]

'Others'

In [None]:
lr_clf.predict(feature_gen("how many rewards will I get"))[0]

'Others'

In [None]:
lr_clf.predict(feature_gen("rewards"))[0]

'Coupons'

In [None]:
lr_clf.predict(feature_gen("reward"))[0]

'Coupons'

In [None]:
lr_clf.predict(feature_gen("super coins"))[0]

'Points'

In [None]:
lr_clf.predict(feature_gen("how to enroll for loyalty"))[0]

In [None]:
lr_clf.predict(feature_gen("voucher"))[0]

'Coupons'

### Fails

In [None]:
lr_clf.predict(feature_gen("any offers for me"))[0]

'Others'

In [None]:
lr_clf.predict(feature_gen("any discount on amount"))[0]

'Points'

In [None]:
lr_clf.predict(feature_gen("How can I apply offer"))[0]

'Others'

In [None]:
lr_clf.predict(feature_gen("any offer on amount"))[0]

'Others'