# TBA 3102 - Text Analytics
## Practical Lab 07 - Text Classification
### Question 3 - Intermediate Text Classification
Student: Nicky Ng <br>
GitHub User: [ahjimomo](https://github.com/ahjimomo) <br>
Student Number: A0194330L

In [1]:
## Libraries
# Data Wrangling
import numpy as np
import pandas as pd
from collections import Counter

# Data Preparation
from sklearn.model_selection import train_test_split
import nltk

# Feature Engineering
from sklearn.feature_extraction.text import CountVectorizer # Bow
from sklearn.feature_extraction.text import TfidfVectorizer # Tf-IDf
from gensim.models import word2vec                          # Embeddings

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# Cross-Validation & Model Evaluation
from sklearn.model_selection import cross_val_score
import model_evaluation_utils as meu
from sklearn import metrics

# Visualization
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Random_state
random_state = 42

In [2]:
# Import data
raw_df = pd.read_csv('./data/sms_cleaned.csv')
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5559 entries, 0 to 5558
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Label            5559 non-null   object
 1   SMSText          5559 non-null   object
 2   Cleaned_SMSText  5559 non-null   object
dtypes: object(3)
memory usage: 130.4+ KB


In [3]:
# Create label number
raw_df['Label_no'] = np.where(raw_df['Label'] == 'ham', 1, 0)
raw_df['Label_no'].unique()

array([1, 0])

In [4]:
# Split training & testing data
x_train_corpus, x_test_corpus, y_train_label_nums, y_test_label_nums, y_train_label_names, y_test_label_names = train_test_split(np.array(raw_df['Cleaned_SMSText']),
                                                                                                                        np.array(raw_df['Label_no']),
                                                                                                                        np.array(raw_df['Label']),
                                                                                                                        test_size = 0.20, 
                                                                                                                        stratify = raw_df['Label_no'],
                                                                                                                        shuffle = True,
                                                                                                                        random_state = random_state)

print(f"Size of training corpus: {len(x_train_corpus)}\nSize of testing corpus: {len(x_test_corpus)}\nTotal: {len(raw_df)}")

Size of training corpus: 4447
Size of testing corpus: 1112
Total: 5559


## Feature Engineering

In [5]:
# 1. Term Frequency Features
tf_cv = CountVectorizer(binary = False, min_df = 0.0, max_df = 1.0) 

# transform train & test data into features
tf_train_features = tf_cv.fit_transform(x_train_corpus)
tf_test_features = tf_cv.transform(x_test_corpus)

print('Term Frequency BOW model:> Train features shape:', tf_train_features.shape, ' Test features shape:', tf_test_features.shape)

Term Frequency BOW model:> Train features shape: (4447, 4516)  Test features shape: (1112, 4516)


In [6]:
# 2. Unigram & Bigram Features
ngram_cv = CountVectorizer(ngram_range=(2,2), min_df = 0.0, max_df = 1.0) 

# transform train & test data into features
ngram_train_features = ngram_cv.fit_transform(x_train_corpus)
ngram_test_features = ngram_cv.transform(x_test_corpus)

print('N-Gram BOW model:> Train features shape:', tf_train_features.shape, ' Test features shape:', tf_test_features.shape)

N-Gram BOW model:> Train features shape: (4447, 4516)  Test features shape: (1112, 4516)


In [7]:
# 3. Tf-IDf Features
tfidf_tv = TfidfVectorizer(use_idf = True, min_df = 0.0, max_df = 1.0)

# transform train & test data into features
tv_train_features = tfidf_tv.fit_transform(x_train_corpus)
tv_test_features = tfidf_tv.transform(x_test_corpus)

print('TF-IDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

TF-IDF model:> Train features shape: (4447, 4516)  Test features shape: (1112, 4516)


## Building models

In [8]:
# Final Comparison DataFrame containing all results
model_names = ['mnb_tf', 'mnb_ngram', 'mnb_tfidf', 'lr_tf', 'lr_ngram', 'lr_tfidf', 'svm_tf', 'svm_ngram', 'svm_tfidf']
final_df = pd.DataFrame(index = model_names)
final_df

# Preparation
vectors_lst = ['tf', 'ngram', 'tfidf']
train_features_lst = [tf_train_features, ngram_train_features, tv_train_features]
test_features_lst = [tf_test_features, ngram_test_features, tv_test_features]
unique_classes = list(set(y_test_label_names))

# List to store results
training_acc = []
testing_acc = []
precision = []
recall = []
f1 = []
spam_precision = []
spam_recall = []
spam_f1 = []

In [9]:
# Helper function to iterate through models to build models
def faster_models(models, algo, vector_type, train_features, test_features, train_labels = y_train_label_names, test_labels = y_test_label_names):
   
    # Define string param
    for idx in range(len(vector_type)):
        model = models[idx]
        training_features = train_features[idx]
        testing_features = test_features[idx]
        predict_name = f"{algo}_{vector_type[idx]}_predictions"
        print(predict_name)
        
        # Fit model & get mean training score
        model.fit(training_features, train_labels)
        training_acc.append(np.round(np.mean(cross_val_score(model, training_features, train_labels, cv = 5)), decimals = 2))
        
        # Use model to predict testing data
        predict_name = model.predict(testing_features)
        
        # Extract performances
        # Model
        testing_acc.append(np.round(metrics.accuracy_score(test_labels, predict_name), decimals = 2))
        precision.append(np.round(metrics.precision_score(test_labels, predict_name, average = 'weighted'), decimals = 2))
        recall.append(np.round(metrics.recall_score(test_labels, predict_name, average = 'weighted'), decimals = 2))
        f1.append(np.round(metrics.f1_score(test_labels, predict_name, average = 'weighted'), decimals = 2))
        
        # Spam
        report = metrics.classification_report(y_true = test_labels, y_pred = predict_name, labels = unique_classes, output_dict = True)
        spam_results = list(report['spam'].values())
        spam_precision.append(np.round(spam_results[0], 2))
        spam_recall.append(np.round(spam_results[1], 2))
        spam_f1.append(np.round(spam_results[2], 2))

In [10]:
# Naive Bayes Classifier
mnb_tf = MultinomialNB(alpha = 1)
mnb_ngram = MultinomialNB(alpha = 1)
mnb_tfidf = MultinomialNB(alpha = 1)

models = [mnb_tf, mnb_ngram, mnb_tfidf]

faster_models(models, 'mnb', vectors_lst, train_features_lst, test_features_lst)
print(training_acc)
print(testing_acc)
print(recall)
print(precision)
print(f1)

mnb_tf_predictions
mnb_ngram_predictions
mnb_tfidf_predictions
[0.97, 0.82, 0.95]
[0.98, 0.97, 0.96]
[0.98, 0.97, 0.96]
[0.98, 0.97, 0.96]
[0.98, 0.97, 0.96]


In [11]:
# Logistics Regression
lr_tf = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=random_state, solver='lbfgs')
lr_ngram = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=random_state, solver='lbfgs')
lr_tfidf = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=random_state, solver='lbfgs')

models = [lr_tf, lr_ngram, lr_tfidf]

faster_models(models, 'lr', vectors_lst, train_features_lst, test_features_lst)
print(training_acc)
print(testing_acc)
print(recall)
print(precision)
print(f1)

lr_tf_predictions
lr_ngram_predictions
lr_tfidf_predictions
[0.97, 0.82, 0.95, 0.98, 0.94, 0.95]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96]


In [12]:
# Support Vector Machine (SVM)
svm_tf = LinearSVC(penalty='l2', max_iter=10000, C=1, random_state=random_state)
svm_ngram = LinearSVC(penalty='l2', max_iter=10000, C=1, random_state=random_state)
svm_tfidf = LinearSVC(penalty='l2', max_iter=10000, C=1, random_state=random_state)

models = [svm_tf, svm_ngram, svm_tfidf]

faster_models(models, 'lr', vectors_lst, train_features_lst, test_features_lst)
print(training_acc)
print(testing_acc)
print(recall)
print(precision)
print(f1)

lr_tf_predictions
lr_ngram_predictions
lr_tfidf_predictions
[0.97, 0.82, 0.95, 0.98, 0.94, 0.95, 0.98, 0.95, 0.98]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96, 0.98, 0.96, 0.98]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96, 0.98, 0.96, 0.98]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96, 0.98, 0.96, 0.98]
[0.98, 0.97, 0.96, 0.97, 0.95, 0.96, 0.98, 0.96, 0.98]


In [13]:
# Compile result df
final_df['mean_training_acc'] = training_acc
final_df['testing_acc'] = testing_acc
final_df['recall'] = recall
final_df['precision'] = precision
final_df['f1'] = f1
final_df['spam_recall'] = spam_recall
final_df['spam_precision'] = spam_precision
final_df['spam_f1'] = spam_f1

final_df

Unnamed: 0,mean_training_acc,testing_acc,recall,precision,f1,spam_recall,spam_precision,spam_f1
mnb_tf,0.97,0.98,0.98,0.98,0.98,0.85,0.97,0.91
mnb_ngram,0.82,0.97,0.97,0.97,0.97,0.81,1.0,0.89
mnb_tfidf,0.95,0.96,0.96,0.96,0.96,0.69,1.0,0.82
lr_tf,0.98,0.97,0.97,0.97,0.97,0.83,0.98,0.89
lr_ngram,0.94,0.95,0.95,0.95,0.95,0.64,1.0,0.78
lr_tfidf,0.95,0.96,0.96,0.96,0.96,0.73,0.98,0.84
svm_tf,0.98,0.98,0.98,0.98,0.98,0.86,0.96,0.9
svm_ngram,0.95,0.96,0.96,0.96,0.96,0.7,1.0,0.82
svm_tfidf,0.98,0.98,0.98,0.98,0.98,0.85,0.99,0.92


In [15]:
# Export as csv
final_df.to_csv('./data/lab_7_models_comparison.csv')