# Natural Language Processing - Classification

## Libraries

In [1]:
#@title Installation
!pip3 install PySastrawi
!pip3 install transformers
!pip3 install stopwords

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[?25l[K     |█▋                              | 10 kB 15.5 MB/s eta 0:00:01[K     |███▏                            | 20 kB 21.7 MB/s eta 0:00:01[K     |████▊                           | 30 kB 12.2 MB/s eta 0:00:01[K     |██████▎                         | 40 kB 9.6 MB/s eta 0:00:01[K     |███████▉                        | 51 kB 5.2 MB/s eta 0:00:01[K     |█████████▍                      | 61 kB 5.7 MB/s eta 0:00:01[K     |███████████                     | 71 kB 6.1 MB/s eta 0:00:01[K     |████████████▌                   | 81 kB 6.8 MB/s eta 0:00:01[K     |██████████████                  | 92 kB 6.5 MB/s eta 0:00:01[K     |███████████████▋                | 102 kB 5.4 MB/s eta 0:00:01[K     |█████████████████▏              | 112 kB 5.4 MB/s eta 0:00:01[K     |██████████████████▊             | 122 kB 5.4 MB/s eta 0:00:01[K     |████████████████████▎           | 133 kB 5.4 MB/s eta 0

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds

from transformers import TFBertForSequenceClassification, BertTokenizer 

from sklearn.metrics import confusion_matrix, classification_report 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## Cleaning

In [3]:
train = pd.read_csv('sms_spam_train.csv')
test = pd.read_csv('sms_spam_test.csv')

In [4]:
#@title Regex
import string
from string import digits
import re

REGEX_URL = re.compile(
    r"(?:^|(?<![\w\/\.]))"
    # protocol identifier
    # r"(?:(?:https?|ftp)://)"
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?" r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))" r"|" r"(?:(localhost))" r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:\/[^\)\]\}\s]*)?",
    # r"(?:$|(?![\w?!+&\/\)]))",
    flags=re.UNICODE | re.IGNORECASE,
)

REGEX_NUMBER =re.compile(
    r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))"
)

def replace(text, regex_pattern, replacement):
  return regex_pattern.sub(replacement, text)

def replace_urls(text, symbol = "<URL>"):
  return replace(text, REGEX_URL, symbol)

def replace_numbers(text, symbol = "<NUM>") -> str:
  return replace(text, REGEX_NUMBER, symbol)

def clean_numbers(text):
  return text.translate({ord(k): None for k in digits})

def replace_punctuations(
        text, symbol = "<PUNCT>", exceptions= []
    ):
    all_punct = [punct for punct in string.punctuation]
    punct_to_remove = "".join(
        [punct for punct in all_punct if punct not in exceptions]
    )
    result = re.sub(r"[" + str(punct_to_remove) + "]+\ *", " " + symbol + " ", text)

    return result

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in STOPWORDS])

def replace_words_by_dictionary(text, dictionary):
    new_text = []
    for word in text.split():
        if word in dictionary:
            new_text.append(dictionary[word])
        else:
            new_text.append(word)

    return " ".join(new_text)

def stem(text):
  factory = StemmerFactory()
  stemmer = factory.create_stemmer()

  output   = stemmer.stem(text)

  return output

def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def preprocess(text):

  # \n
  text = re.sub('\n', ' ', text)

  # Username
  text = re.sub('@[^\s]+','', text)
  
  # Lowercase
  text = text.lower()

  # Replace URL with tag
  text = replace_urls(text)

  # Remove punctuation
  text = replace_punctuations(text,  symbol="", exceptions=["<", ">"])

  # Replace numbers with tag
  text = replace_numbers(text)

  text = clean_numbers(text)

  text = remove_emojis(text)

  return text

In [5]:
train['Teks'] = train['Teks'].apply(preprocess)
test['Teks'] = test['Teks'].apply(preprocess)

In [6]:
train.head()

Unnamed: 0,Teks,label
0,anda akan membeli paket gampang internetan rp ...,2
1,gaada buat yg bukan temen mah,0
2,nama itu msh pake db aku yg lama itu msh anoma...,0
3,urang mah kayanya dah jumatan tapi jam an ma...,0
4,eh gatau td bagus yang nanyain,0


In [7]:
test.head()

Unnamed: 0,Teks,label
0,pakai xl tdk perlu repot setting bisa langsung...,2
1,terimakasih paket <NUM> mb anda telah aktif ...,0
2,gratis internetan mb berlaku utk <NUM> hari ha...,1
3,togel sgp edisi senin <NUM> <NUM> <NUM> angk...,1
4,mega prima tama promo discon <NUM> <NUM> bla...,1


## Training

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, X_val = train_test_split(train['Teks'], train['label'], test_size=0.2, random_state=42)

In [9]:
X_test = test['Teks']
y_test = test['label']

### Machine Learning



In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#### Naive Bayes

In [11]:
text_naive = Pipeline([
                       ('vectorizer', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('multinomial', MultinomialNB()),
])

In [12]:
text_naive.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('multinomial',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [13]:
naive_pred = text_naive.predict(X_test)
np.mean(naive_pred == y_test)

0.9301310043668122

In [14]:
CLASSES=["Normal", "Fraud", "Promo"]
from sklearn import metrics
print(metrics.classification_report(y_test, naive_pred, target_names=CLASSES))

              precision    recall  f1-score   support

      Normal       0.96      0.93      0.94        99
       Fraud       0.97      0.95      0.96        82
       Promo       0.81      0.90      0.85        48

    accuracy                           0.93       229
   macro avg       0.91      0.93      0.92       229
weighted avg       0.93      0.93      0.93       229



#### Logistic Regression

In [15]:
text_log = Pipeline([
                     ('vectorizer', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('log', LogisticRegression()),
])

In [16]:
text_log.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...ne)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('log',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1

In [17]:
log_pred = text_log.predict(X_test)
np.mean(log_pred == y_test)

0.9082969432314411

In [18]:
CLASSES=["Normal", "Fraud", "Promo"]
from sklearn import metrics
print(metrics.classification_report(y_test, log_pred, target_names=CLASSES))

              precision    recall  f1-score   support

      Normal       0.93      0.94      0.93        99
       Fraud       0.96      0.90      0.93        82
       Promo       0.79      0.85      0.82        48

    accuracy                           0.91       229
   macro avg       0.89      0.90      0.90       229
weighted avg       0.91      0.91      0.91       229



#### SVM

In [19]:
text_clf = Pipeline([
                     ('vectorizer', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('scaler', StandardScaler(with_mean=False)),
                     ('svm_clf', SVC(kernel='rbf', gamma=5, C=0.001))
])

In [20]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('scaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('svm_clf',
                 SVC(C=0.001, break_ties=False, cache_size=200,
        

In [21]:
clf_pred = text_clf.predict(X_test)
np.mean(clf_pred == y_test)

0.43231441048034935

In [22]:
CLASSES=["Normal", "Fraud", "Promo"]
from sklearn import metrics
print(metrics.classification_report(y_test, clf_pred, target_names=CLASSES))

              precision    recall  f1-score   support

      Normal       0.43      1.00      0.60        99
       Fraud       0.00      0.00      0.00        82
       Promo       0.00      0.00      0.00        48

    accuracy                           0.43       229
   macro avg       0.14      0.33      0.20       229
weighted avg       0.19      0.43      0.26       229



  _warn_prf(average, modifier, msg_start, len(result))


#### SGD Classifier

In [23]:
text_sgd = Pipeline([
                     ('vectorizer', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier())
])

In [24]:
text_sgd.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

In [25]:
sgd_pred = text_sgd.predict(X_test)
np.mean(sgd_pred == y_test)

0.9213973799126638

In [26]:
CLASSES=["Normal", "Fraud", "Promo"]
from sklearn import metrics
print(metrics.classification_report(y_test, sgd_pred, target_names=CLASSES))

              precision    recall  f1-score   support

      Normal       0.96      0.92      0.94        99
       Fraud       0.96      0.91      0.94        82
       Promo       0.80      0.94      0.87        48

    accuracy                           0.92       229
   macro avg       0.91      0.92      0.91       229
weighted avg       0.93      0.92      0.92       229



### BERT

In [27]:
tf_train = tf.data.experimental.CsvDataset(['/content/sms_spam_train.csv'], 
                                            record_defaults = [tf.constant([""], dtype=tf.string), tf.constant([0], dtype=tf.int64)],
                                            header=True) 
tf_test = tf.data.experimental.CsvDataset(['/content/sms_spam_test.csv'], 
                                           record_defaults = [tf.constant([""], dtype=tf.string), tf.constant([0], dtype=tf.int64)],
                                           header=True) 

In [28]:
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-lite-base-p2', do_lower_case=True)
model = TFBertForSequenceClassification.from_pretrained('indobenchmark/indobert-lite-base-p2', num_labels=3)

Downloading:   0%|          | 0.00/220k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/60.1M [00:00<?, ?B/s]

Some layers from the model checkpoint at indobenchmark/indobert-lite-base-p2 were not used when initializing TFBertForSequenceClassification: ['predictions', 'sop_classifier', 'albert']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-lite-base-p2 and are newly initialized: ['bert', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
#@title Sentence to Features
MAX_LENGTH = 65
def convert_sentence_to_features(sentence):
  return tokenizer.encode_plus(
                        sentence,       
                        max_length = MAX_LENGTH, # max length of the text that can go to BERT [Step 2]            
                        add_special_tokens = True, # add special tokens  [Step 1]
                        pad_to_max_length = True, # add [PAD] tokens [Step 3]
                        return_attention_mask = True, # add attention mask to not focus on pad tokens, [Step 4]
                        truncation=True # Truncate to a maximum length specified with the argument max_length
              )
  
def map_features_to_dict(input_ids, attention_masks, token_type_ids, label):
  return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
  }, label

def encode_sentences(dataset):
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = []
    
  for message, label in tfds.as_numpy(dataset):
    bert_input = convert_sentence_to_features(message.decode())
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])

  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_features_to_dict)

In [30]:
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 10000

train_encoded = encode_sentences(tf_train).shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_encoded = encode_sentences(tf_test).batch(BATCH_SIZE)



In [31]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-6),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [32]:
history = model.fit(train_encoded,
                    epochs=10,
                    validation_data=test_encoded)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
y_pred = model.predict(test_encoded)[0].argmax(axis=-1)

In [35]:
print(classification_report(y_test, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

      Normal       0.95      0.94      0.94        99
       Fraud       0.90      0.93      0.92        82
       Promo       0.81      0.79      0.80        48

    accuracy                           0.90       229
   macro avg       0.89      0.89      0.89       229
weighted avg       0.90      0.90      0.90       229

