### I have been facing challenges training CNNs and other deep-learning models. This notebook checks whether the choice of loss function is what makes binary classification so difficult.

## 1. Load libraries

In [1]:
#Import the necessary libraries
import pandas as pd
import numpy as np
import scipy as sp
import sys, nltk, re, bs4, sklearn, matplotlib
from bs4 import BeautifulSoup
import datasets, transformers 
from copy import deepcopy
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download("stopwords")
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#import gensim
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score,precision_score,recall_score,classification_report

## 2. Get data

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# Download the "YouTube Spam Collection" dataset (UCI repository id 380)
youtube_spam_collection = fetch_ucirepo(id=380)

# Features and targets as exposed by the fetched dataset object
X, y = youtube_spam_collection.data.features, youtube_spam_collection.data.targets

# Dataset-level metadata
print(youtube_spam_collection.metadata)

# Per-variable information
print(youtube_spam_collection.variables)


{'uci_id': 380, 'name': 'YouTube Spam Collection', 'repository_url': 'https://archive.ics.uci.edu/dataset/380/youtube+spam+collection', 'data_url': 'https://archive.ics.uci.edu/static/public/380/data.csv', 'abstract': 'It is a public set of comments collected for spam research. It has five datasets composed by 1,956 real messages extracted from five videos that were among the 10 most viewed on the collection period.', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Text'], 'num_instances': 1956, 'num_features': 3, 'feature_types': [], 'demographics': [], 'target_col': ['CLASS'], 'index_col': ['VIDEO', 'COMMENT_ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Wed Apr 03 2024', 'dataset_doi': '10.24432/C58885', 'creators': ['T.C. Alberto', 'J.V. Lochter'], 'intro_paper': None, 'additional_info': {'summary': 'The table below lists the datasets, the YouTube video ID, the amount of samples in ea

## 3. Data Cleaning

In [3]:
nltk.download('punkt_tab')

# Collapse every hyperlink into the placeholder token "url": the address itself would only
# confuse the model, but the presence of a link might be a strong indicator of spam.
X_all = [
    re.sub(r'http\S+|www\S+|https\S+', 'url', comment, flags=re.MULTILINE)
    for comment in X['CONTENT']
]

# Likewise shorten "watch?v=<suffix>" fragments to the bare word "watch".
X_all = [
    re.sub(r'watch\?v=\S+', 'watch', comment, flags=re.MULTILINE)
    for comment in X_all
]

# Strip any HTML markup, keeping only the text content.
X_all = [BeautifulSoup(comment, "html.parser").get_text() for comment in X_all]

# Replace two-character emoticons (":" or ";" followed by one of ")", "|", "(", "D", "P")
# with the word "emoji", since their presence can be a useful feature.
X_all = [re.sub("[:;][)|(DP]", "emoji", comment) for comment in X_all]

# any number does not look like year should be replaced with an identifier number

def not_number(string):
    """Return True when ``string`` is not a finite numeric token.

    Commas are removed first so thousands-separated values such as "1,234"
    are recognised as numbers.

    Parameters
    ----------
    string : str
        A single token.

    Returns
    -------
    bool
        False if the token parses as a finite float, True otherwise.
    """
    import math  # local import keeps this cell runnable without touching the import cell

    candidate = re.sub(",", "", string)
    try:
        value = float(candidate)
    except ValueError:
        return True
    # float() also accepts the words "nan", "inf" and "infinity" (any casing). Those are
    # ordinary words in a comment, not numbers, so they must not be rewritten downstream.
    return not math.isfinite(value)
    
def check_range(string_no):
    """Tell whether a numeric token falls in the year-like range 1800-2100 (inclusive).

    Commas are stripped before parsing. Raises ValueError if the remaining
    text cannot be parsed by ``float``.
    """
    value = float(re.sub(",", "", string_no))
    return 1800.0 <= value <= 2100.0

def remove_suprious_num(snt):
    """Replace numeric tokens in a sentence with placeholder words.

    The sentence is tokenised with ``word_tokenize``. Tokens that
    ``not_number`` rejects as numbers are kept unchanged; numeric tokens
    become "year" when ``check_range`` accepts them and "large number"
    otherwise. The tokens are re-joined with single spaces.
    """
    rewritten = []
    for token in word_tokenize(snt):
        if not_number(token):
            rewritten.append(token)
        elif check_range(token):
            rewritten.append("year")
        else:
            rewritten.append("large number")
    return " ".join(rewritten)

# Swap numeric tokens for the "year" / "large number" placeholders
X_all = list(map(remove_suprious_num, X_all))

# Case-fold everything so matching is case-insensitive
X_all = [text.casefold() for text in X_all]

# Drop every character that is neither a word character nor whitespace (i.e. punctuation),
# then collapse runs of whitespace into one space and trim both ends
X_all = [
    re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text)).strip()
    for text in X_all
]



[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ashutosh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  X_all = [BeautifulSoup(i, "html.parser").get_text() for i in X_all]


## 4. Split into test and train data

In [4]:
# Stratify on y so that the label distribution is similar in the training and test data
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
print(stopwords)
# English stop words held in a set for fast membership tests
stop_words = {*stopwords.words("english")}

<WordListCorpusReader in '.../corpora/stopwords' (not loaded yet)>


In [6]:
def remove_stop_words(sent):
    """Return ``sent`` with every token found in the module-level ``stop_words`` removed.

    The sentence is tokenised with ``word_tokenize`` and the surviving
    tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(sent))
    return " ".join(kept_tokens)

In [7]:
# Strip stop words from both splits, then preview the first ten training comments
X_train = list(map(remove_stop_words, X_train))
X_test = list(map(remove_stop_words, X_test))
X_train[:10]

['video large number views large number million people earth',
 'head like large number years ago time flies',
 'large number billions year',
 'wtf subscribe channel thanx emoji',
 'omg',
 'please become first subscriber thank',
 'everyone come check new gta large number gameplay right watch',
 'remember back popular everyone school shuffling crazy',
 'url please halp project',
 'haha funny see salt westerners top views youtube goes video dont even understand keep salt']

### 5b. Normalization - Lemmatization

In [8]:
def normalize_data(sent,lemmitizer):
    """Lemmatise every token of a sentence.

    Parameters
    ----------
    sent : str
        Sentence to normalise; it is tokenised with ``word_tokenize``.
    lemmitizer : object
        Anything exposing a ``lemmatize(token)`` method.

    Returns
    -------
    str
        The lemmatised tokens joined with single spaces.
    """
    return " ".join(map(lemmitizer.lemmatize, word_tokenize(sent)))

# One shared WordNet lemmatiser is reused for both splits
lemmitizer = WordNetLemmatizer()
X_train = [normalize_data(comment, lemmitizer) for comment in X_train]
X_test = [normalize_data(comment, lemmitizer) for comment in X_test]
X_train[:20]

['video large number view large number million people earth',
 'head like large number year ago time fly',
 'large number billion year',
 'wtf subscribe channel thanx emoji',
 'omg',
 'please become first subscriber thank',
 'everyone come check new gta large number gameplay right watch',
 'remember back popular everyone school shuffling crazy',
 'url please halp project',
 'haha funny see salt westerner top view youtube go video dont even understand keep salt',
 'believe jesus christ savior sin truly believe jesus christ savior sin go heaven believe jesus christ saved salvation gained god righteousness matter much sinned past present especially future believe jesus christ savior go heaven forever whole truth spread truth',
 'music hero',
 'check berzerk video channel emoji',
 'check app solve partydrunk problem href url',
 'love song make wan na dance',
 'love song',
 'check video youtube',
 'check video youtube href url eminem ft rihanna love way lie',
 'absolutely adore watching foo

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser with default settings
tfidf = TfidfVectorizer()

# Fit the vectoriser on the training comments only, then reuse that fitted
# vectoriser unchanged to transform the test comments.
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_test)


### Training dataset


In [10]:
# Densify the TF-IDF matrix and wrap it in a float32 tensor
X_tensor_train = torch.tensor(
    train_tfidf.toarray(),
    dtype=torch.float32,
)
print(X_tensor_train.shape)

# Labels as float32, the dtype used with the BCE-style losses below
y_tensor_train = torch.tensor(
    np.array(y_train),
    dtype=torch.float32,
)

dataset_train = TensorDataset(X_tensor_train, y_tensor_train)
# shuffle=False keeps batches in the same order as y_train; the metric cells further down
# compare y_train directly against predictions collected from this loader.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=10,
    shuffle=False,
    drop_last=False,
)

torch.Size([1564, 2957])


### Testing dataset

In [11]:
# Densify the test TF-IDF matrix and wrap it in a float32 tensor
X_tensor_test = torch.tensor(
    test_tfidf.toarray(),
    dtype=torch.float32,
)
print(X_tensor_test.shape)
y_tensor_test = torch.tensor(
    np.array(y_test),
    dtype=torch.float32,
)

dataset_test = TensorDataset(X_tensor_test, y_tensor_test)
# shuffle=False keeps batches in the same order as y_test, which the metric cells rely on.
dataloader_test = DataLoader(
    dataset_test,
    batch_size=10,
    shuffle=False,
    drop_last=False,
)

torch.Size([392, 2957])


# Simple ANN

# 1. Sigmoid and BCE Loss

In [12]:
from torch import nn

# Define model
class SpamFilter(nn.Module):
    """Fully connected network (input_size -> 128 -> 64 -> 1) ending in a sigmoid.

    The two hidden layers use ReLU; the single output is squashed by a sigmoid,
    so ``forward`` returns one value in (0, 1) per input row.
    """

    def __init__(self, input_size):
        """Build the three linear layers; ``input_size`` is the number of input features."""
        super().__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 1)

    def forward(self, x):
        """Run the network on ``x`` and return the sigmoid of the final layer."""
        hidden = self.layer1(x).relu()
        hidden = self.layer2(hidden).relu()
        return self.layer3(hidden).sigmoid()

In [13]:
# Seed PyTorch's RNG so the random weight initialisation, and hence the metrics reported
# below, are reproducible on Restart & Run All (42 mirrors the train/test split seed).
torch.manual_seed(42)

# Initialize the model with one input feature per TF-IDF column
input_size = train_tfidf.shape[1]
model0 = SpamFilter(input_size)

In [14]:
# Binary cross-entropy applied to the sigmoid outputs produced by the model
criterion = nn.BCELoss()

# Adam over every parameter of the model
optimizer = torch.optim.Adam(model0.parameters(), lr=0.001)

In [15]:
# Train for a fixed number of passes over the training loader
num_epochs = 10
for epoch in range(num_epochs):
    model0.train()  # make sure the model is in training mode
    running_loss = 0.0  # sum of the per-batch loss values for this epoch

    for batch_features, batch_targets in dataloader_train:
        optimizer.zero_grad()
        batch_loss = criterion(model0(batch_features), batch_targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}')

Epoch 1/10, Loss: 72.09758307412267
Epoch 2/10, Loss: 14.634043395984918
Epoch 3/10, Loss: 5.122163700521924
Epoch 4/10, Loss: 2.1478996443911456
Epoch 5/10, Loss: 1.0356194037303794
Epoch 6/10, Loss: 0.5053162984258961
Epoch 7/10, Loss: 0.2893129837138986
Epoch 8/10, Loss: 0.18719634010631125
Epoch 9/10, Loss: 0.1258652334827275
Epoch 10/10, Loss: 0.08948239286382886


In [16]:
# Evaluation on the TRAINING set
model0.eval()

correct = 0
total = 0
train_preds = []  # one 0.0/1.0 prediction per training sample, in loader order

with torch.inference_mode():
    for inputs, labels in dataloader_train:
        outputs = model0(inputs)
        # Labels and outputs carry a trailing dimension of size 1; drop it so both are flat.
        labels = labels.squeeze(dim=1).numpy()
        # Threshold the sigmoid outputs at 0.5 to obtain hard class predictions.
        predicted = (outputs > 0.5).float().squeeze(dim=1).numpy()
        total += len(labels)

        train_preds.extend(float(p) for p in predicted)
        correct += (predicted == labels).sum()

accuracy = correct / total
# This loop reads dataloader_train, so the figure is the training accuracy.
print('Train Accuracy: {:.2f}%'.format(100 * accuracy))

Test Accuarcy: 100.00%


In [17]:
# Evaluation on the held-out test set
model0.eval()

correct, total = 0, 0
test_preds = []  # one 0.0/1.0 prediction per test sample, in loader order

with torch.no_grad():
    for batch_inputs, batch_labels in dataloader_test:
        probabilities = model0(batch_inputs)
        # Drop the trailing size-1 dimension from labels and predictions alike.
        truth = batch_labels.squeeze(dim=1).numpy()
        predicted = (probabilities > 0.5).float().squeeze(dim=1).numpy()

        test_preds.extend(predicted.tolist())
        total += len(truth)
        correct += (predicted == truth).sum()

accuracy = correct / total
print('Test Accuarcy: {:.2f}%'.format(100 * accuracy))

Test Accuarcy: 92.60%


In [18]:
# Accuracy on both splits
ann_train_accuracy = accuracy_score(y_train, train_preds)
ann_test_accuracy = accuracy_score(y_test, test_preds)

# Precision on both splits
ann_train_precision = precision_score(y_train, train_preds)
ann_test_precision = precision_score(y_test, test_preds)

# Recall on both splits
ann_train_recall = recall_score(y_train, train_preds)
ann_test_recall = recall_score(y_test, test_preds)

print(f"training accuracy is {round(ann_train_accuracy, 3)}")
print(f"test accuracy is {round(ann_test_accuracy, 3)}")

print(f"training precision is {round(ann_train_precision, 3)}")
print(f"test precision is {round(ann_test_precision, 3)}")

print(f"training recall is {round(ann_train_recall, 3)}")
print(f"test recall is {round(ann_test_recall, 3)}")

# Per-class breakdown for the test split
print(classification_report(y_test, test_preds))

training accuracy is 1.0
test accuracy is 0.926
training precision is 1.0
test precision is 0.922
training recall is 1.0
test recall is 0.935
              precision    recall  f1-score   support

           0       0.93      0.92      0.92       191
           1       0.92      0.94      0.93       201

    accuracy                           0.93       392
   macro avg       0.93      0.93      0.93       392
weighted avg       0.93      0.93      0.93       392



# 2. BCEWithLogitsLoss

In [19]:
# Remove the previous experiment's model, optimiser and metric variables so the next
# experiment cannot pick up stale values by accident.
del (
    total,
    train_preds,
    accuracy,
    ann_train_accuracy,
    ann_test_accuracy,
    ann_train_precision,
    ann_test_precision,
    ann_train_recall,
    ann_test_recall,
    running_loss,
    optimizer,
    criterion,
    model0,
    correct,
)

In [20]:
from torch import nn

# Define model
class SpamFilter2(nn.Module):
    """Fully connected network (input_size -> 128 -> 64 -> 1) that returns raw logits.

    No activation is applied to the final layer; the output is meant to be
    paired with a loss that works on logits.
    """

    def __init__(self, input_size):
        """Build the three linear layers; ``input_size`` is the number of input features."""
        super().__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 1)

    def forward(self, x):
        """Return the un-activated output of the last layer for ``x``."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

In [21]:
# Seed PyTorch's RNG so the random weight initialisation, and hence the metrics reported
# below, are reproducible on Restart & Run All (42 mirrors the train/test split seed).
torch.manual_seed(42)

model0 = SpamFilter2(input_size)

# loss function: applies the sigmoid internally, so the model outputs raw logits
criterion = nn.BCEWithLogitsLoss()

# optimizer
optimizer = torch.optim.Adam(params=model0.parameters(),
                            lr=0.001)

In [22]:
# Train for a fixed number of passes over the training loader
num_epochs = 10
for epoch in range(num_epochs):
    model0.train()  # make sure the model is in training mode
    running_loss = 0.0  # sum of the per-batch loss values for this epoch

    for batch_features, batch_targets in dataloader_train:
        optimizer.zero_grad()
        batch_logits = model0(batch_features)
        batch_loss = criterion(batch_logits, batch_targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}')

Epoch 1/10, Loss: 71.21296524628997
Epoch 2/10, Loss: 14.409506441093981
Epoch 3/10, Loss: 5.220782635151409
Epoch 4/10, Loss: 2.0344197752710897
Epoch 5/10, Loss: 0.9388499096094165
Epoch 6/10, Loss: 0.4932650767368614
Epoch 7/10, Loss: 0.3007843071682146
Epoch 8/10, Loss: 0.20007469480697182
Epoch 9/10, Loss: 0.13719925464101834
Epoch 10/10, Loss: 0.09928184779982985


In [23]:
# Evaluation on the TRAINING set
model0.eval()

correct = 0
total = 0
train_preds = []  # one 0.0/1.0 prediction per training sample, in loader order

with torch.inference_mode():
    for inputs, labels in dataloader_train:
        logits = model0(inputs)
        # The model emits raw logits; squash them with a sigmoid before thresholding.
        outputs = torch.sigmoid(logits)
        # Labels and outputs carry a trailing dimension of size 1; drop it so both are flat.
        labels = labels.squeeze(dim=1).numpy()
        predicted = (outputs > 0.5).float().squeeze(dim=1).numpy()
        total += len(labels)

        train_preds.extend(float(p) for p in predicted)
        correct += (predicted == labels).sum()

accuracy = correct / total
# This loop reads dataloader_train, so the figure is the training accuracy.
print('Train Accuracy: {:.2f}%'.format(100 * accuracy))

Test Accuarcy: 100.00%


In [24]:
# Evaluation on the held-out test set
model0.eval()

correct, total = 0, 0
test_preds = []  # one 0.0/1.0 prediction per test sample, in loader order

with torch.no_grad():
    for batch_inputs, batch_labels in dataloader_test:
        # Raw logits -> sigmoid -> 0.5 threshold
        probabilities = torch.sigmoid(model0(batch_inputs))
        truth = batch_labels.squeeze(dim=1).numpy()
        predicted = (probabilities > 0.5).float().squeeze(dim=1).numpy()

        test_preds.extend(predicted.tolist())
        total += len(truth)
        correct += (predicted == truth).sum()

accuracy = correct / total
print('Test Accuarcy: {:.2f}%'.format(100 * accuracy))

Test Accuarcy: 92.60%


In [25]:
# Accuracy, precision and recall on the training and test splits
ann_train_accuracy = accuracy_score(y_train, train_preds)
ann_test_accuracy = accuracy_score(y_test, test_preds)

ann_train_precision = precision_score(y_train, train_preds)
ann_test_precision = precision_score(y_test, test_preds)

ann_train_recall = recall_score(y_train, train_preds)
ann_test_recall = recall_score(y_test, test_preds)

# Report every metric rounded to three decimals, training split first
for metric_name, metric_value in (
    ("training accuracy", ann_train_accuracy),
    ("test accuracy", ann_test_accuracy),
    ("training precision", ann_train_precision),
    ("test precision", ann_test_precision),
    ("training recall", ann_train_recall),
    ("test recall", ann_test_recall),
):
    print("{0} is {1}".format(metric_name, round(metric_value, 3)))

# Per-class breakdown for the test split
print(classification_report(y_test, test_preds))

training accuracy is 1.0
test accuracy is 0.926
training precision is 1.0
test precision is 0.917
training recall is 1.0
test recall is 0.94
              precision    recall  f1-score   support

           0       0.94      0.91      0.92       191
           1       0.92      0.94      0.93       201

    accuracy                           0.93       392
   macro avg       0.93      0.93      0.93       392
weighted avg       0.93      0.93      0.93       392



# 3. Cross Entropy Loss

In [26]:
# Remove the previous experiment's state before the cross-entropy run
del total, train_preds, accuracy
del ann_train_accuracy, ann_test_accuracy
del ann_train_precision, ann_test_precision
del ann_train_recall, ann_test_recall
del running_loss, optimizer, criterion
del model0, correct

## Labels for Cross Entropy should be long not float

In [27]:
# Rebuild both datasets with integer (long) labels; the features stay float32.
X_tensor_train = torch.tensor(
    train_tfidf.toarray(),
    dtype=torch.float32,
)
print(X_tensor_train.shape)

y_tensor_train = torch.tensor(
    np.array(y_train),
    dtype=torch.long,
)

dataset_train = TensorDataset(X_tensor_train, y_tensor_train)
# shuffle=False keeps batches aligned with y_train for the metric cells further down.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=10,
    shuffle=False,
    drop_last=False,
)

X_tensor_test = torch.tensor(
    test_tfidf.toarray(),
    dtype=torch.float32,
)
print(X_tensor_test.shape)
y_tensor_test = torch.tensor(
    np.array(y_test),
    dtype=torch.long,
)

dataset_test = TensorDataset(X_tensor_test, y_tensor_test)
# shuffle=False keeps batches aligned with y_test for the metric cells further down.
dataloader_test = DataLoader(
    dataset_test,
    batch_size=10,
    shuffle=False,
    drop_last=False,
)

torch.Size([1564, 2957])
torch.Size([392, 2957])


In [28]:
from torch import nn

# Define model
class SpamFilter3(nn.Module):
    """Fully connected network (input_size -> 128 -> 64 -> 2) that returns raw logits.

    The final layer has two outputs and no activation is applied to it.
    """

    def __init__(self, input_size):
        """Build the three linear layers; ``input_size`` is the number of input features."""
        super().__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 2)

    def forward(self, x):
        """Return the two un-activated outputs of the last layer for each row of ``x``."""
        out = x
        for hidden_layer in (self.layer1, self.layer2):
            out = torch.relu(hidden_layer(out))
        return self.layer3(out)

In [29]:
# Seed PyTorch's RNG so the random weight initialisation, and hence the metrics reported
# below, are reproducible on Restart & Run All (42 mirrors the train/test split seed).
torch.manual_seed(42)

model0 = SpamFilter3(input_size)

# loss function: takes the two raw logits per sample and integer class labels
criterion = nn.CrossEntropyLoss()

# optimizer
optimizer = torch.optim.Adam(params=model0.parameters(),
                            lr=0.001)

In [30]:
# Train for a fixed number of passes over the training loader
num_epochs = 10
for epoch in range(num_epochs):
    model0.train()  # make sure the model is in training mode
    running_loss = 0.0  # sum of the per-batch loss values for this epoch

    for batch_features, batch_targets in dataloader_train:
        optimizer.zero_grad()
        batch_logits = model0(batch_features)
        # Drop the trailing size-1 dimension of the labels before handing them to the loss.
        batch_loss = criterion(batch_logits, batch_targets.squeeze(1))
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}')

Epoch 1/10, Loss: 65.81644764356315
Epoch 2/10, Loss: 12.661946819978766
Epoch 3/10, Loss: 4.234536151168868
Epoch 4/10, Loss: 1.6042365015746327
Epoch 5/10, Loss: 0.7014152535411995
Epoch 6/10, Loss: 0.3257722453381575
Epoch 7/10, Loss: 0.18567115622317942
Epoch 8/10, Loss: 0.11333005322194367
Epoch 9/10, Loss: 0.07111655594144395
Epoch 10/10, Loss: 0.04413262372054305


In [31]:
# Evaluation on the TRAINING set
model0.eval()

correct = 0
total = 0
train_preds = []  # one 0.0/1.0 prediction per training sample, in loader order

with torch.inference_mode():
    for inputs, labels in dataloader_train:
        logits = model0(inputs)
        # The predicted class is the index of the larger of the two logits.
        predicted = torch.argmax(logits, dim=1).numpy()
        # Labels carry a trailing dimension of size 1; drop it to match the predictions.
        labels = labels.squeeze(dim=1).numpy()
        total += len(labels)

        train_preds.extend(float(p) for p in predicted)
        correct += (predicted == labels).sum()

accuracy = correct / total
# This loop reads dataloader_train, so the figure is the training accuracy.
print('Train Accuracy: {:.2f}%'.format(100 * accuracy))

Test Accuarcy: 100.00%


In [32]:
# Evaluation on the held-out test set
model0.eval()

correct, total = 0, 0
test_preds = []  # one 0.0/1.0 prediction per test sample, in loader order

with torch.no_grad():
    for batch_inputs, batch_labels in dataloader_test:
        # Predicted class = index of the larger logit
        predicted = torch.argmax(model0(batch_inputs), dim=1).numpy()
        truth = batch_labels.squeeze(dim=1).numpy()

        test_preds.extend(map(float, predicted))
        total += len(truth)
        correct += (predicted == truth).sum()

accuracy = correct / total
print('Test Accuarcy: {:.2f}%'.format(100 * accuracy))

Test Accuarcy: 93.11%


In [33]:
# Accuracy, precision and recall, each computed for the training split and then the test split
ann_train_accuracy, ann_test_accuracy = (
    accuracy_score(y_train, train_preds),
    accuracy_score(y_test, test_preds),
)
ann_train_precision, ann_test_precision = (
    precision_score(y_train, train_preds),
    precision_score(y_test, test_preds),
)
ann_train_recall, ann_test_recall = (
    recall_score(y_train, train_preds),
    recall_score(y_test, test_preds),
)

# One line per metric, rounded to three decimals
print(
    "training accuracy is {0}".format(round(ann_train_accuracy, 3)),
    "test accuracy is {0}".format(round(ann_test_accuracy, 3)),
    "training precision is {0}".format(round(ann_train_precision, 3)),
    "test precision is {0}".format(round(ann_test_precision, 3)),
    "training recall is {0}".format(round(ann_train_recall, 3)),
    "test recall is {0}".format(round(ann_test_recall, 3)),
    sep="\n",
)

# Per-class breakdown for the test split
print(classification_report(y_test, test_preds))

training accuracy is 1.0
test accuracy is 0.931
training precision is 1.0
test precision is 0.922
training recall is 1.0
test recall is 0.945
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       191
           1       0.92      0.95      0.93       201

    accuracy                           0.93       392
   macro avg       0.93      0.93      0.93       392
weighted avg       0.93      0.93      0.93       392

