Imports

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import data_loading_code as pre
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from sys import getsizeof
from torch.utils.tensorboard import SummaryWriter


In [12]:
torch.cuda.is_available()

True

Data loader:

In [5]:
 # get data, pre-process and split
data = pre.pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None) #read the file and preprocess it, for dataframe. new datastructure
df = pd.DataFrame(data)


# Split the DataFrame into 80% train and 20% test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

#training and validation
train_df.columns = ['Sentence', 'Class']
train_df['index'] = train_df.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
train_df = pre.preprocess_pandas(train_df, columns)                            # pre-process

training_data, validation_data, training_labels, validation_labels = pre.train_test_split( # split the data into training, validation, and test splits
    train_df['Sentence'].values.astype('U'),
    train_df['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

test_df.columns = ['Sentence', 'Class']
test_df['index'] = test_df.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
test_df = pre.preprocess_pandas(test_df, columns) 

test_data = test_df['Sentence'].values.astype('U')
test_labels = test_df['Class'].values.astype('int32')


# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = pre.TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)

validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()

test_data = word_vectorizer.transform(test_data)        # transform texts to sparse matrix
test_data = test_data.todense()                             # convert to dense matrix for Pytorch

assert training_data.shape[1] == validation_data.shape[1] == test_data.shape[1], "Feature mismatch in TF-IDF!"

train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()

validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

test_x_tensor = torch.from_numpy(np.array(test_data)).type(torch.FloatTensor)
test_y_tensor = torch.from_numpy(np.array(test_labels)).long()


In [6]:
print(test_x_tensor.shape)
print(train_x_tensor.shape)
print(validation_x_tensor.shape)


torch.Size([200, 5997])
torch.Size([720, 5997])
torch.Size([80, 5997])


Network

In [7]:

class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.network = nn.Sequential(
        nn.Linear(vocab_size,1000),
        nn.ReLU(),
        nn.Linear(1000,100),
        nn.ReLU(),
        nn.Linear(100,25),
        nn.ReLU(),
        nn.Linear(25,2)
        )
    
    def feedforward(self,input):
        return self.network(input)
    
    
def backward(model, epochs, optimizer, loss_function, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor, device='cuda'):
    writer = SummaryWriter(log_dir="/tf/logs")
    model.to(device)
    train_Acc = 0
    train_Acc = 0
    validation_AccV = 0
    best_loss = 100
    
    for epoch in range(epochs):
        trainCorr = 0
        valCorr = 0
        
        for (sentences, labels) in zip(train_x_tensor,train_y_tensor):
            sentences, labels = sentences.to(device), labels.to(device)
            pred = model.feedforward(sentences)
            
            loss = loss_function(pred, labels) 
            writer.add_scalar("Loss/train", loss.item(), epoch)
            loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()
                
            pred = torch.argmax(F.softmax(pred), dim = 0)
            trainCorr += torch.sum(pred==labels).item()
        train_Acc = trainCorr/len(training_data)
            
        model.eval()
        with torch.no_grad():   
            for (sentencesv, labelsv) in zip(validation_x_tensor,validation_y_tensor):
                sentencesv, labelsv = sentencesv.to(device), labelsv.to(device)
                predv = model.feedforward(sentencesv)
                lossv = loss_function(predv,labelsv)
                writer.add_scalar("Loss/validation", loss.item(), epoch)

                if best_loss > lossv.item():
                    best_loss = lossv.item()
                    torch.save(model, "nlpbest")
                
                predv = torch.argmax(F.softmax(predv), dim = 0)
                valCorr += torch.sum(predv==labelsv).item()
            validation_AccV = valCorr/len(validation_data)
        print("epoch:", epoch, "Loss Training:", loss.item(), "Training acc:", train_Acc)
        print("epoch:", epoch, "Loss Validation:", lossv.item(),"Validation acc:", validation_AccV)
        writer.flush()
    writer.close()
    return model
        
def test_loop(model, loss_function, test_x_tensor, test_y_tensor, device='cuda'):
    model.to(device)
    testCorr = 0
    model.eval()
    best_loss = 100
    with torch.no_grad():   
        for (sentencesTest, labelsTest) in zip(test_x_tensor,test_y_tensor):
            sentencesTest, labelsTest = sentencesTest.to(device), labelsTest.to(device)
            predTest = model.feedforward(sentencesTest)
            lossTest = loss_function(predTest,labelsTest)

            if best_loss > lossTest.item():
                best_loss = lossTest.item()
            
            
            predTest = torch.argmax(F.softmax(predTest), dim = 0)
            testCorr += torch.sum(predTest==labelsTest).item()
        test_Acc = testCorr/len(test_data)
    print( "Loss Test:", lossTest.item(), "Test acc:", test_Acc)

    
                
            

Implemntation:

Small data set Amaxon cell labelled

In [8]:
epochs = 3
model = Net()


optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001, weight_decay=0.2)
criterion = nn.CrossEntropyLoss()

trained_model = backward(model, epochs, optimizer, criterion, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor)

test_loop(trained_model, criterion,test_x_tensor, test_y_tensor)

  pred = torch.argmax(F.softmax(pred))
  predv = torch.argmax(F.softmax(predv))


epoch: 0 Loss Training: 0.16985417902469635 Training acc: 0.6805555555555556
epoch: 0 Loss Validation: 0.45168471336364746 Validation acc: 0.8
epoch: 1 Loss Training: 5.960462772236497e-07 Training acc: 0.9708333333333333
epoch: 1 Loss Validation: 0.005050757434219122 Validation acc: 0.7875
epoch: 2 Loss Training: -0.0 Training acc: 0.9986111111111111
epoch: 2 Loss Validation: 0.014861115254461765 Validation acc: 0.8
Loss Test: 3.576272320060525e-06 Test acc: 0.86


  predTest = torch.argmax(F.softmax(predTest))


Train on large data set.

In [9]:
 # get data, pre-process and split
data = pre.pd.read_csv("amazon_cells_labelled_LARGE_25K.txt", delimiter='\t', header=None)
df = pd.DataFrame(data)


# Split the DataFrame into 80% train and 20% test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

#training and validation
train_df.columns = ['Sentence', 'Class']
train_df['index'] = train_df.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
train_df = pre.preprocess_pandas(train_df, columns)                            # pre-process

training_data, validation_data, training_labels, validation_labels = pre.train_test_split( # split the data into training, validation, and test splits
    train_df['Sentence'].values.astype('U'),
    train_df['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

test_df.columns = ['Sentence', 'Class']
test_df['index'] = test_df.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
test_df = pre.preprocess_pandas(test_df, columns) 

test_data = test_df['Sentence'].values.astype('U')
test_labels = test_df['Class'].values.astype('int32')


# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = pre.TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)

validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()

test_data = word_vectorizer.transform(test_data)        # transform texts to sparse matrix
test_data = test_data.todense()                             # convert to dense matrix for Pytorch

assert training_data.shape[1] == validation_data.shape[1] == test_data.shape[1], "Feature mismatch in TF-IDF!"

train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()

validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

test_x_tensor = torch.from_numpy(np.array(test_data)).type(torch.FloatTensor)
test_y_tensor = torch.from_numpy(np.array(test_labels)).long()


In [29]:
print(train_y_tensor.shape)

torch.Size([18000])


In [10]:
epochs = 3
model = Net()


optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()

trained_model = backward(model, epochs, optimizer, criterion, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor)
test_loop(trained_model, criterion,test_x_tensor, test_y_tensor)

  pred = torch.argmax(F.softmax(pred))
  predv = torch.argmax(F.softmax(predv))


epoch: 0 Loss Training: 1.1358058452606201 Training acc: 0.8613888888888889
epoch: 0 Loss Validation: 1.326241374015808 Validation acc: 0.8925
epoch: 1 Loss Training: 0.41631880402565 Training acc: 0.9622777777777778
epoch: 1 Loss Validation: 0.033916376531124115 Validation acc: 0.8895
epoch: 2 Loss Training: 0.024427037686109543 Training acc: 0.9917777777777778
epoch: 2 Loss Validation: 3.765075206756592 Validation acc: 0.87


  predTest = torch.argmax(F.softmax(predTest))


Loss Test: 0.00029797881143167615 Test acc: 0.8796


In [13]:
def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(str(row['Sentence']))
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_.loc[len(df_)] = {
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }
  
    return df_

# get data, pre-process and split
train_df = pre.pd.read_csv("train_df.csv", delimiter=',')
val_df = pre.pd.read_csv("val_df.csv", delimiter=',')
test_df = pre.pd.read_csv("test_df.csv", delimiter=',')

#training and validation
train_df.columns = ['index', 'Sentence', 'Class', 'Label']
train_df['index'] = train_df.index    # add new column index
columns = ['index', 'Sentence','Class', 'Label']
train_df = preprocess_pandas(train_df, columns)                            # pre-process
print(train_df['Class'])

val_df.columns = ['index','Sentence','Class','Label']
val_df['index'] = val_df.index    # add new column index
columns = ['index', 'Sentence','Class', 'Label']
val_df = preprocess_pandas(val_df, columns)                                 # pre-process
print(val_df['Class'])

test_df.columns = ['index','Sentence','Class', 'Label']
test_df['index'] = test_df.index    # add new column index
columns = ['index', 'Sentence','Class', 'Label']                            # pre-process
test_df = preprocess_pandas(test_df, columns) 
print(test_df['Class'])  

0        2
1        1
2        2
3        1
4        0
        ..
31227    0
31228    1
31229    2
31230    1
31231    0
Name: Class, Length: 31232, dtype: int64
0       0
1       2
2       2
3       2
4       2
       ..
5200    1
5201    2
5202    0
5203    2
5204    2
Name: Class, Length: 5205, dtype: int64
0       1
1       2
2       0
3       1
4       2
       ..
5201    0
5202    1
5203    0
5204    2
5205    2
Name: Class, Length: 5206, dtype: int64


In [20]:
training_data = train_df['Sentence'].values.astype('U')
training_labels = train_df['Class'].values.astype('int32')

validation_data = val_df['Sentence'].values.astype('U')
validation_labels = val_df['Class'].values.astype('int32')

test_data = test_df['Sentence'].values.astype('U')
test_labels = test_df['Class'].values.astype('int32')

# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                       # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)

validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()   
 
test_data = word_vectorizer.transform(test_data)        # transform texts to sparse matrix
test_data = test_data.todense()                              # convert to dense matrix for Pytorch

assert training_data.shape[1] == validation_data.shape[1] == test_data.shape[1], "Feature mismatch in TF-IDF!"
print(training_data)

train_x_tensor = torch.from_numpy(training_data).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()

validation_x_tensor = torch.from_numpy(validation_data).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

test_x_tensor = torch.from_numpy(test_data).type(torch.FloatTensor)
test_y_tensor = torch.from_numpy(np.array(test_labels)).long()

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [18]:

class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.network = nn.Sequential(
        nn.Linear(vocab_size,1000),
        nn.ReLU(),
        nn.Linear(1000,100),
        nn.ReLU(),
        nn.Linear(100,25),
        nn.ReLU(),
        nn.Linear(25,3)
        )
    
    def feedforward(self,input):
        return self.network(input)
    
    
def backward(model, epochs, optimizer, loss_function, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor, device='cuda'):
    writer = SummaryWriter(log_dir="/tf/logs")
    model.to(device)
    train_Acc = 0
    train_Acc = 0
    validation_AccV = 0
    best_loss = 100
    
    for epoch in range(epochs):
        trainCorr = 0
        valCorr = 0
        
        for (sentences, labels) in zip(train_x_tensor,train_y_tensor):
            sentences, labels = sentences.to(device), labels.to(device)
            pred = model.feedforward(sentences)
            
            loss = loss_function(pred, labels)
            writer.add_scalar("Loss/train", loss.item(), epoch)
            loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()
                
            pred = torch.argmax(F.softmax(pred), dim = 0)
            trainCorr += torch.sum(pred==labels).item()
        train_Acc = trainCorr/len(training_data)
            
        model.eval()
        with torch.no_grad():   
            for (sentencesv, labelsv) in zip(validation_x_tensor,validation_y_tensor):
                sentencesv, labelsv = sentencesv.to(device), labelsv.to(device)
                predv = model.feedforward(sentencesv)
                lossv = loss_function(predv,labelsv)
                writer.add_scalar("Loss/validation", loss.item(), epoch)

                if best_loss > lossv.item():
                    best_loss = lossv.item()
                    torch.save(model, "nlpbest")
                
                predv = torch.argmax(F.softmax(predv), dim = 0)
                valCorr += torch.sum(predv==labelsv).item()
            validation_AccV = valCorr/len(validation_data)
        print("epoch:", epoch, "Loss Training:", loss.item(), "Training acc:", train_Acc)
        print("epoch:", epoch, "Loss Validation:", lossv.item(),"Validation acc:", validation_AccV)
        writer.flush()
    writer.close()
    return model
        
def test_loop(model, loss_function, test_x_tensor, test_y_tensor, device='cuda'):
    model.to(device)
    testCorr = 0
    model.eval()
    best_loss = 100
    with torch.no_grad():   
        for (sentencesTest, labelsTest) in zip(test_x_tensor,test_y_tensor):
            sentencesTest, labelsTest = sentencesTest.to(device), labelsTest.to(device)
            predTest = model.feedforward(sentencesTest)
            lossTest = loss_function(predTest,labelsTest)

            if best_loss > lossTest.item():
                best_loss = lossTest.item()
            
            
            predTest = torch.argmax(F.softmax(predTest), dim = 0)
            testCorr += torch.sum(predTest==labelsTest).item()
        test_Acc = testCorr/len(test_data)
    print( "Loss Test:", lossTest.item(), "Test acc:", test_Acc)

    
                
            

In [23]:
epochs = 10
model = Net()


optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()

#trained_model = backward(model, epochs, optimizer, criterion, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor)
trained_model = torch.load("nlpbest", weights_only = False)
test_loop(trained_model, criterion,test_x_tensor, test_y_tensor)

  predTest = torch.argmax(F.softmax(predTest), dim = 0)


Loss Test: 2.95444655418396 Test acc: 0.629850172877449
