In [419]:
import os
import re
import shutil
import string
from collections import Counter
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

In [420]:
def get_category(category):
    if category == 'Adult':
        return 15
    elif category == 'Business/Corporate':
        return 14
    elif category == 'E-Commerce':
        return 13
    elif category == 'Computers and Technology':
        return 12
    elif category == 'Travel':
        return 11
    elif category == 'Sports':
        return 10
    elif category == 'Food':
        return 9
    elif category == 'Education':
        return 8
    elif category == 'Law and Government':
        return 7
    elif category == 'Games':
        return 6
    elif category == 'Streaming Services':
        return 5
    elif category == 'Health and Fitness':
        return 4
    elif category == 'News':
        return 3
    elif category == 'Social Networking and Messaging':
        return 2
    elif category == 'Forums':
        return 1
    elif category == 'Photography':
        return 0

In [421]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the XLSX file
xlsx_file_path = 'website_classification.xlsx'
data = pd.read_excel(xlsx_file_path)

# Shuffle the rows randomly
data = data.sample(frac=1, random_state=42)

# Split the data into training (70%), validation (10%), and testing (20%)
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
valid_data, test_data = train_test_split(temp_data, test_size=2/3, random_state=42)

# Save the training, validation, and testing data to separate XLSX files
train_data.to_excel('training.xlsx', index=False)
valid_data.to_excel('validation.xlsx', index=False)
test_data.to_excel('testing.xlsx', index=False)



In [422]:
#X_train ,Y_train,X_test,Y_test
testing_file_path = 'testing.xlsx'
trainingdata = pd.read_excel(testing_file_path)
training_file_path = 'training.xlsx'
testingdata = pd.read_excel(testing_file_path)
validation_file_path = 'validation.xlsx'
validationdata = pd.read_excel(validation_file_path)
X_train=trainingdata['cleaned_website_text']
Y_train=trainingdata['Category'].apply(get_category)
X_validate=validationdata['cleaned_website_text']
Y_validate=validationdata['Category'].apply(get_category)
X_test=testingdata['cleaned_website_text']
Y_test=testingdata['Category'].apply(get_category)

train_dat =list(zip(Y_train,X_train))
test_dat =list(zip(Y_test,X_test))
valid_dat=list(zip(Y_validate,X_validate))

In [423]:
import torch
from torch.utils.data import DataLoader
device = torch.device("cpu")

In [424]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
train_iter = train_dat
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [425]:
text_pipeline = lambda x: vocab(tokenizer(x))
category_pipeline = lambda x: int(x)

In [426]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(category_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64).to_dense()
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64).to_dense()
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0).to_dense()
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [427]:
from torch import nn
import torch.nn.functional as F


class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc1 = nn.Linear(embed_dim,512)
        self.fc2 = nn.Linear(512,512)
        self.fc3 = nn.Linear(512,512)
        self.fc4 = nn.Linear(512,512)
        self.fc5 = nn.Linear(512,512)
        self.fc6 = nn.Linear(512,512)
        self.fc7 = nn.Linear(512,512)
        self.fc8 = nn.Linear(512, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()
        self.fc3.weight.data.uniform_(-initrange, initrange)
        self.fc3.bias.data.zero_()
        self.fc4.weight.data.uniform_(-initrange, initrange)
        self.fc4.bias.data.zero_()
        self.fc5.weight.data.uniform_(-initrange, initrange)
        self.fc5.bias.data.zero_()
        self.fc6.weight.data.uniform_(-initrange, initrange)
        self.fc6.bias.data.zero_()
        self.fc7.weight.data.uniform_(-initrange, initrange)
        self.fc7.bias.data.zero_()
        self.fc8.weight.data.uniform_(-initrange, initrange)
        self.fc8.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        x = F.relu(self.fc1(embedded))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = self.fc8(x)
        return x

In [428]:
train_iter1 = train_dat
num_class = len(set([Category for (Category,cleaned_website_text ) in train_iter1]))
vocab_size = len(vocab)
emsize = 512
model = TextClassificationModel(vocab_size, emsize, num_class)
model=model.to(device)

In [429]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (Category, cleaned_website_textext, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(cleaned_website_textext, offsets)
        loss = criterion(predited_label, Category)
        loss.backward()
        # Manually clip gradients
        max_norm = 0.1  # Your desired maximum gradient norm
        total_norm = 0.0

        for param in model.parameters():
            if param.grad is not None:
                param_norm = param.grad.data.norm(2)
                total_norm += param_norm.item() ** 2

        total_norm = total_norm ** 0.5
        clip_coef = max_norm / (total_norm + 1e-6)

        if clip_coef < 1:
            for param in model.parameters():
                if param.grad is not None:
                    param.grad.data.mul_(clip_coef)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == Category).sum().item()
        total_count += Category.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [430]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 8 # epoch
LR = 5  # learning rate
BATCH_SIZE = 16 # batch size for training
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_iter2 = train_dat
test_iter2 =test_dat
valid_iter2=valid_dat
train_dataloader = DataLoader(train_iter2, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_iter2, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)
valid_dataloader=DataLoader(valid_iter2, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(test_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time:  1.23s | valid accuracy    0.504 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  1.24s | valid accuracy    0.770 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  1.59s | valid accuracy    0.794 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  1.38s | valid accuracy    0.872 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  1.31s | valid accuracy    0.940 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  1.37s |

In [431]:
print('Checking the results of test dataset.')
accu_test = evaluate(valid_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.752


In [432]:
category_label = {
                   15:"Adult",
                   14: "Business/Corporate",
                   13: "E-Commerce",
                   12: "Computers and Technology",
                   11: "Travel",
                   10: "Sports",
                   9: "Food",
                   8:"Education",
                   7:"Law and Government",
                   6: "Games",
                   5:"Streaming Services",
                   4: "Health and Fitness" ,
                   3: "News",
                   2:"Social Networking and Messaging",
                   1: "Forums",
                   0: "Photography"
}


def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text)).to_dense()
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item()
ex_text_str = "soooooo wish i could, but im in school and myspace is completely blocked"
model = model.to("cpu")

print("This is a %s tweet" %category_label[predict(ex_text_str, text_pipeline)])

This is a Education tweet
