In [1]:
%matplotlib inline
import pandas
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from torch.utils.data import Dataset
from IPython.display import display
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [2]:
data_df = pd.read_csv('clean-phone-data-for-students.csv')

In [3]:
display(data_df.head())
data_df.describe()

Unnamed: 0,Sentence Utterance,Action,Object
0,<PHONE_NUMBER_REMOVED> ผมไปจ่ายเงินที่ Counte...,enquire,payment
1,internet ยังความเร็วอยุ่เท่าไหร ครับ,enquire,package
2,ตะกี้ไปชำระค่าบริการไปแล้ว แต่ยังใช้งานไม่ได้...,report,suspend
3,พี่ค่ะยังใช้ internet ไม่ได้เลยค่ะ เป็นเครื่อ...,enquire,internet
4,ฮาโหล คะ พอดีว่าเมื่อวานเปิดซิมทรูมูฟ แต่มันโ...,report,phone_issues


Unnamed: 0,Sentence Utterance,Action,Object
count,16175,16175,16175
unique,13389,10,33
top,บริการอื่นๆ,enquire,service
freq,97,10377,2525


In [4]:
data_df = data_df[["Sentence Utterance", "Object"]]
data_df.columns = ['input', 'raw_label']
display(data_df.describe())
display(data_df.raw_label.unique())

Unnamed: 0,input,raw_label
count,16175,16175
unique,13389,33
top,บริการอื่นๆ,service
freq,97,2525


array(['payment', 'package', 'suspend', 'internet', 'phone_issues',
       'service', 'nonTrueMove', 'balance', 'detail', 'bill', 'credit',
       'promotion', 'mobile_setting', 'iservice', 'roaming', 'truemoney',
       'information', 'lost_stolen', 'balance_minutes', 'idd',
       'TrueMoney', 'garbage', 'Payment', 'IDD', 'ringtone', 'Idd',
       'rate', 'loyalty_card', 'contact', 'officer', 'Balance', 'Service',
       'Loyalty_card'], dtype=object)

In [5]:
data_df['clean_label']=data_df['raw_label'].str.lower().copy()
data_df.drop('raw_label', axis=1, inplace=True)
display(data_df.describe())
display(data_df.clean_label.unique())

Unnamed: 0,input,clean_label
count,16175,16175
unique,13389,26
top,บริการอื่นๆ,service
freq,97,2528


array(['payment', 'package', 'suspend', 'internet', 'phone_issues',
       'service', 'nontruemove', 'balance', 'detail', 'bill', 'credit',
       'promotion', 'mobile_setting', 'iservice', 'roaming', 'truemoney',
       'information', 'lost_stolen', 'balance_minutes', 'idd', 'garbage',
       'ringtone', 'rate', 'loyalty_card', 'contact', 'officer'],
      dtype=object)

In [6]:
data_df = data_df.drop_duplicates("input", keep="first")
data_df = data_df.reset_index(drop=True)
display(data_df.describe())

Unnamed: 0,input,clean_label
count,13389,13389
unique,13389,26
top,<PHONE_NUMBER_REMOVED> ผมไปจ่ายเงินที่ Counte...,service
freq,1,2111


In [11]:
data = data_df.copy().to_numpy()

unique_label = data_df.clean_label.unique()

label_2_num_map = dict(zip(unique_label, range(len(unique_label))))
num_2_label_map = dict(zip(range(len(unique_label)), unique_label))

print("Create Mappings")
display(num_2_label_map)
display(label_2_num_map)

# print("Before Mappings")
# display(data[:, 1])
data[:,1] = np.vectorize(label_2_num_map.get)(data[:,1])
# print("After Mappings")
# display(data[:, 1])

Create Mappings


{0: 'payment',
 1: 'package',
 2: 'suspend',
 3: 'internet',
 4: 'phone_issues',
 5: 'service',
 6: 'nontruemove',
 7: 'balance',
 8: 'detail',
 9: 'bill',
 10: 'credit',
 11: 'promotion',
 12: 'mobile_setting',
 13: 'iservice',
 14: 'roaming',
 15: 'truemoney',
 16: 'information',
 17: 'lost_stolen',
 18: 'balance_minutes',
 19: 'idd',
 20: 'garbage',
 21: 'ringtone',
 22: 'rate',
 23: 'loyalty_card',
 24: 'contact',
 25: 'officer'}

{'payment': 0,
 'package': 1,
 'suspend': 2,
 'internet': 3,
 'phone_issues': 4,
 'service': 5,
 'nontruemove': 6,
 'balance': 7,
 'detail': 8,
 'bill': 9,
 'credit': 10,
 'promotion': 11,
 'mobile_setting': 12,
 'iservice': 13,
 'roaming': 14,
 'truemoney': 15,
 'information': 16,
 'lost_stolen': 17,
 'balance_minutes': 18,
 'idd': 19,
 'garbage': 20,
 'ringtone': 21,
 'rate': 22,
 'loyalty_card': 23,
 'contact': 24,
 'officer': 25}

In [12]:
def strip_str(string):
    return string.strip()
     
# print("Before")
# print(data)
data[:,0] = np.vectorize(strip_str)(data[:,0])
# print("After")
# print(data)

# Model 2 MUSE

Build a simple logistic regression model using features from the MUSE model.

Which MUSE model will you use? Why?

**Ans:**
`sentence-transformers/use-cmlm-multilingual` (the first link) simply because it is more popular and has more downloads. Thus, it is maintained more frequently and has more support.

MUSE is typically used with tensorflow. However, there are some pytorch conversions made by some people.

- https://huggingface.co/sentence-transformers/use-cmlm-multilingual
- https://huggingface.co/dayyass/universal-sentence-encoder-multilingual-large-3-pytorch

In [9]:
import time
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [13]:
data

array([['<PHONE_NUMBER_REMOVED> ผมไปจ่ายเงินที่ Counter Services เค้าเช็ต 3276.25 บาท เมื่อวานที่ผมเช็คที่ศูนย์บอกมียอด 3057.79 บาท',
        0],
       ['internet ยังความเร็วอยุ่เท่าไหร ครับ', 1],
       ['ตะกี้ไปชำระค่าบริการไปแล้ว แต่ยังใช้งานไม่ได้ ค่ะ', 2],
       ...,
       ['ยอดเงินเหลือเท่าไหร่ค่ะ', 7],
       ['ยอดเงินในระบบ', 7],
       ['สอบถามโปรโมชั่นปัจจุบันที่ใช้อยู่ค่ะ', 1]], dtype=object)

In [15]:
sentences = data[:,0]
len(sentences)

13389

In [16]:
sentences = data[:,0]
model = SentenceTransformer('sentence-transformers/use-cmlm-multilingual')
embeddings = model.encode(sentences, show_progress_bar=True)

Some weights of the model checkpoint at sentence-transformers/use-cmlm-multilingual were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/419 [00:00<?, ?it/s]

In [19]:
print(embeddings.shape)

(13389, 768)


In [21]:
X = embeddings.copy()
y = data[:, 1]

print(X.shape, y.shape)

(13389, 768) (13389,)


In [22]:
X_train: np.ndarray
X_test: np.ndarray
y_train: np.ndarray
y_test: np.ndarray

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2,random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(10711, 768) (2678, 768) (10711,) (2678,)


In [24]:
X_train: np.ndarray
X_test: np.ndarray
y_train: np.ndarray
y_test: np.ndarray

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2,random_state=42)
y_train = y_train.astype(int)
y_test = y_test.astype(int)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(10711, 768) (2678, 768) (10711,) (2678,)


In [25]:
# 100 max epochs took 36.6s
logmodel = LogisticRegression(class_weight='balanced', max_iter=100, random_state=2025)
logmodel.fit(X_train, y_train)

In [26]:
start_time = time.time()
predictions = logmodel.predict(X_test)
end_time = time.time()

print(f"Total time: {end_time - start_time}")

Total time: 0.16835498809814453


In [27]:
# 100 max epochs acc 64.152%
print("Model Acc. on test data %f%%"
       % ((y_test == predictions).sum() / y_test.shape[0] * 100))

Model Acc. on test data 64.152353%


In [244]:
report = classification_report(y_test, predictions, output_dict=True, digits=2)
report

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'0': {'precision': 0.572289156626506,
  'recall': 0.7421875,
  'f1-score': 0.6462585034013606,
  'support': 128.0},
 '1': {'precision': 0.7096774193548387,
  'recall': 0.55,
  'f1-score': 0.6197183098591549,
  'support': 360.0},
 '2': {'precision': 0.7295597484276729,
  'recall': 0.7945205479452054,
  'f1-score': 0.760655737704918,
  'support': 146.0},
 '3': {'precision': 0.6873065015479877,
  'recall': 0.6201117318435754,
  'f1-score': 0.6519823788546255,
  'support': 358.0},
 '4': {'precision': 0.5029239766081871,
  'recall': 0.7413793103448276,
  'f1-score': 0.5993031358885017,
  'support': 116.0},
 '5': {'precision': 0.8379310344827586,
  'recall': 0.5758293838862559,
  'f1-score': 0.6825842696629213,
  'support': 422.0},
 '6': {'precision': 0.359375,
  'recall': 0.46938775510204084,
  'f1-score': 0.40707964601769914,
  'support': 49.0},
 '7': {'precision': 0.8857142857142857,
  'recall': 0.7306397306397306,
  'f1-score': 0.8007380073800738,
  'support': 297.0},
 '8': {'precision'