# SPAM DETECTION OF YOU-TUBE COMMENTS USING MULTI LAYER PERCEPTRON
### The necessary imports to perform preprocessing and 

In [61]:
import pandas as pd
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

### Loading all Datasets

In [47]:
dfPsy = pd.read_csv(r"YouTube-Spam-Collection-v1/Youtube01-Psy.csv")
dfKatyPerry = pd.read_csv(r"YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv")
dfLMFAO = pd.read_csv(r"YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv")
dfEminem = pd.read_csv(r"YouTube-Spam-Collection-v1/Youtube04-Eminem.csv")
dfShakira = pd.read_csv(r"YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")

# Concatinating all the datasets to a single file.
df = pd.concat([dfPsy, dfKatyPerry, dfLMFAO, dfEminem, dfShakira])

### Initializing tokenizer and lemmatizer and writing a function for the preprocessing task

In [48]:
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def preprocess(col):
    '''
    Preprocessing includes: converting to lowercase, removing punctuation, tokenizing and lemmatizing
    input: text row of dataframe
    output: list with lemmatized words
    '''
    col = col.lower()
    col = [char for char in col if char not in punctuation]
    #rejoin the characters after removing punctuation
    col = ''.join(col)
    #tokenize and add pos tag so lemmatizer doesn't see all words as nouns
    col = nltk.pos_tag(w_tokenizer.tokenize(col))
    #lemmatize with WordNetLemmatizer
    return " ".join([lemmatizer.lemmatize(word, tag[0]) if tag[0] in ['a', 'r', 'n', 'v'] else word for word, tag in col])
#return [lemmatizer.lemmatize(word, tag[0].lower()) if tag[0].lower() in ['a', 'r', 'n', 'v'] else word for word, tag in col]

### Applying preprocess on the the Content Column

In [49]:
df["CONTENT"]=df["CONTENT"].apply(preprocess)

### Performing the train-test split on the dataset (70/30 split)

In [50]:
X = df["CONTENT"]
Y = df["CLASS"]
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = .3,random_state = 42)

### Performing the Vectorizer on the Content Column to convert the sparse matrix

In [97]:
tfidf = TfidfVectorizer(stop_words="english",max_features=1000)

xtr = tfidf.fit_transform(X_train)
xts = tfidf.transform(X_test)

### Initializing the Multi Layer Perceptron Characteristics Class

In [98]:
import torch 
class Perceptron(torch.nn.Module):
    def __init__(self):
        super(Perceptron, self).__init__()
        self.fc = torch.nn.Linear(1,1)
        self.relu = torch.nn.ReLU() # instead of Heaviside step fn
    def forward(self, x):
        output = self.fc(x)
        output = self.relu(x) # instead of Heaviside step fn
        return output

### Initializing the Feedforward functionality of the MLP

In [99]:
class Feedforward(torch.nn.Module):
        def __init__(self, input_size, hidden_size):
            super(Feedforward, self).__init__()
            self.input_size = input_size
            self.hidden_size  = hidden_size
            self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
            self.relu = torch.nn.ReLU()
            self.fc2 = torch.nn.Linear(self.hidden_size, 1)
            self.sigmoid = torch.nn.Sigmoid()
        def forward(self, x):
            hidden = self.fc1(x)
            relu = self.relu(hidden)
            output = self.fc2(relu)
            output = self.sigmoid(output)
            return output

### Train Test split and converting it to floattensor

In [133]:
Xtrain = torch.FloatTensor(xtr.todense()).float()
Ytrain = torch.FloatTensor(np.array(Y_train)).long()

In [134]:
Xtest = torch.FloatTensor(xts.todense()).float()
Ytest = torch.FloatTensor(np.array(Y_test)).long()

### Implementation of the Model

In [135]:
model = Feedforward(1000,10)
Ytrain = Ytrain.unsqueeze(1).float()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

### Training of the Model and Calculating the Loss

In [137]:
model.train()
epoch = 20
for epoch in range(epoch):
    optimizer.zero_grad()
    # y_pred = y_pred.squeeze(1)
    # Forward pass
    y_pred = model(Xtrain)
    # Compute Loss
    loss = criterion(y_pred, Ytrain)

   
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
    pred = torch.max(y_pred, 1)[1].eq(Ytrain).sum()
    # print ("Accuracy is: ", (pred/len(Xtrain)).item())
    # Backward pass
    loss.backward()
    optimizer.step()

Epoch 0: train loss: 0.42702144384384155
Epoch 1: train loss: 0.40937769412994385
Epoch 2: train loss: 0.3919745981693268
Epoch 3: train loss: 0.3749280869960785
Epoch 4: train loss: 0.35832270979881287
Epoch 5: train loss: 0.34220466017723083
Epoch 6: train loss: 0.3266586661338806
Epoch 7: train loss: 0.31173065304756165
Epoch 8: train loss: 0.29746970534324646
Epoch 9: train loss: 0.2838950753211975
Epoch 10: train loss: 0.2710270285606384
Epoch 11: train loss: 0.25887617468833923
Epoch 12: train loss: 0.24743963778018951
Epoch 13: train loss: 0.2367090880870819
Epoch 14: train loss: 0.22666020691394806
Epoch 15: train loss: 0.2172727882862091
Epoch 16: train loss: 0.20852504670619965
Epoch 17: train loss: 0.20038393139839172
Epoch 18: train loss: 0.19281437993049622
Epoch 19: train loss: 0.18578431010246277


## Reference

### https://medium.com/biaslyai/pytorch-introduction-to-neural-network-feedforward-neural-network-model-e7231cff47cb

### https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216/11

### https://stackoverflow.com/questions/67845882/indexerror-target-1-is-out-of-bounds

### https://pytorch.org/text/stable/data_metrics.html