Imports

In [67]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import data_loading_code as pre
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

Data loader:

In [77]:
 # get data, pre-process and split
data = pre.pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
df = pd.DataFrame(data)


# Split the DataFrame into 80% train and 20% test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

#training and validation
train_df.columns = ['Sentence', 'Class']
train_df['index'] = train_df.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
train_df = pre.preprocess_pandas(train_df, columns)                            # pre-process

training_data, validation_data, training_labels, validation_labels = pre.train_test_split( # split the data into training, validation, and test splits
    train_df['Sentence'].values.astype('U'),
    train_df['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

test_df.columns = ['Sentence', 'Class']
test_df['index'] = test_df.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
test_df = pre.preprocess_pandas(test_df, columns) 

test_data = test_df['Sentence'].values.astype('U')
test_labels = test_df['Class'].values.astype('int32')


# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = pre.TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)

validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()

test_data = word_vectorizer.transform(test_data)        # transform texts to sparse matrix
test_data = test_data.todense()                             # convert to dense matrix for Pytorch

assert training_data.shape[1] == validation_data.shape[1] == test_data.shape[1], "Feature mismatch in TF-IDF!"

train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()

validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

test_x_tensor = torch.from_numpy(np.array(test_data)).type(torch.FloatTensor)
test_y_tensor = torch.from_numpy(np.array(test_labels)).long()


In [78]:
print(test_x_tensor.shape)
print(train_x_tensor.shape)
print(validation_x_tensor.shape)


torch.Size([200, 5997])
torch.Size([720, 5997])
torch.Size([80, 5997])


Network

In [49]:

class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.network = nn.Sequential(
        nn.Linear(vocab_size,1000),
        nn.ReLU(),
        nn.Linear(1000,100),
        nn.ReLU(),
        nn.Linear(100,25),
        nn.ReLU(),
        nn.Linear(25,2)
        )
    
    def feedforward(self,input):
        return self.network(input)
    
    
def backward(model, epochs, optimizer, loss_function, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor):
    train_Acc = 0
    train_Acc = 0
    validation_AccV = 0
    best_loss = 100
    
    for epoch in range(epochs):
        trainCorr = 0
        valCorr = 0
        
        for (sentences, labels) in zip(train_x_tensor,train_y_tensor):
            pred = model.feedforward(sentences)
            
            loss = loss_function(pred, labels) 
            loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()
                
            pred = torch.argmax(F.softmax(pred))
            trainCorr += torch.sum(pred==labels).item()
        train_Acc = trainCorr/len(training_data)
            
        model.eval()
        with torch.no_grad():   
            for (sentencesv, labelsv) in zip(validation_x_tensor,validation_y_tensor):
                
                predv = model.feedforward(sentencesv)
                lossv = loss_function(predv,labelsv)

                if best_loss > lossv.item():
                    best_loss = lossv.item()
                    torch.save(model, "nlpbest")
                
                predv = torch.argmax(F.softmax(predv))
                valCorr += torch.sum(predv==labelsv).item()
            validation_AccV = valCorr/len(validation_data)
        print("epoch:", epoch, "Loss Training:", loss.item(), "Training acc:", train_Acc)
        print("epoch:", epoch, "Loss Validation:", lossv.item(),"Validation acc:", validation_AccV)
    return model
        
def test_loop(model, loss_function, test_x_tensor, test_y_tensor):
    testCorr = 0
    model.eval()
    with torch.no_grad():   
        for (sentencesTest, labelsTest) in zip(test_x_tensor,test_y_tensor):
            
            predTest = model.feedforward(sentencesTest)
            lossTest = loss_function(predTest,labelsTest)

            if best_loss > lossTest.item():
                best_loss = lossTest.item()
            
            
            predTest = torch.argmax(F.softmax(predTest))
            testCorr += torch.sum(predTest==labelsTest).item()
        test_Acc = testCorr/len(test_data)
    print( "Loss Test:", lossTest.item(), "Test acc:", test_Acc)

    
                
            

Implemntation:

Small data set Amaxon cell labelled

In [51]:
epochs = 3
model = Net()


optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001, weight_decay=0.2)
criterion = nn.CrossEntropyLoss()

trained_model = backward(model, epochs, optimizer, criterion, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor)

test_loop(trained_model, criterion,test_x_tensor, test_y_tensor)

  pred = torch.argmax(F.softmax(pred))
  predv = torch.argmax(F.softmax(predv))


epoch: 0 Loss Training: 0.09940920025110245 Training acc: 0.6736111111111112
epoch: 0 Loss Validation: 0.2976081669330597 Validation acc: 0.8
epoch: 1 Loss Training: 1.1920928244535389e-07 Training acc: 0.975
epoch: 1 Loss Validation: 0.0005050813779234886 Validation acc: 0.8125
epoch: 2 Loss Training: 6.437280717364047e-06 Training acc: 1.0
epoch: 2 Loss Validation: 0.008211423642933369 Validation acc: 0.8


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2164 and 5997x1000)

Train on large data set.

In [4]:
# get data, pre-process and split
data = pre.pd.read_csv("amazon_cells_labelled_LARGE_25K.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = pre.preprocess_pandas(data, columns)                             # pre-process
training_data, validation_data, training_labels, validation_labels = pre.train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = pre.TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)
validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()
train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

In [9]:
print(train_y_tensor.shape)

torch.Size([22500])


In [7]:
epochs = 3
model = Net()


optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()

trained_model = backward(model, epochs, optimizer, criterion, train_x_tensor, train_y_tensor, validation_x_tensor, validation_y_tensor)

  pred = torch.argmax(F.softmax(pred))


KeyboardInterrupt: 