In [1]:
# importing libraries
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [2]:
# loading the csv and then removing rows with null balues
train_df = pd.read_csv('../input/midas-task/reddit_data.csv')
train_df.dropna(inplace=True)

# encoding text label to no. label
le = preprocessing.LabelEncoder()
le.fit(train_df["flair"])
train_df["label"] = le.transform(train_df["flair"])
print(le.classes_)
train_df.head()


['AskIndia' 'Business/Finance' 'Coronavirus' 'Non-Political'
 'Policy/Economy' 'Politics' 'Science/Technology']


Unnamed: 0,text,flair,dirty_text,label
0,top comments toi article drop us oil prices,Non-Political,Top comments on a TOI article about the drop i...,3
1,disappointed,Politics,Disappointed,5
2,hacking networking security 2 books 1 hacking ...,Non-Political,Hacking: Networking and Security (2 Books in 1...,3
3,zakir khan irfan junejo live instagram session...,Non-Political,Zakir Khan and Irfan Junejo live Instagram Ses...,3
4,cursing quentin tarantino movie,Non-Political,Cursing In A Quentin Tarantino Movie,3


In [3]:
# adding a row for the no. of words in a text 
train_df['text_length'] = train_df['dirty_text'].apply(lambda x: len(x.split()))

In [4]:
# initialising function for tokenization using spacy and preprocessing
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [5]:
# tokenization
counts = Counter()
for index, row in train_df.iterrows():
    counts.update(tokenize(row['dirty_text']))

In [6]:
# creating a vocabulary of words and creating a word to indez mapping
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [7]:
# function to encode text using tokenization function created above and vocabulary
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [8]:
# creating a column for the encoded text and encoding every row
train_df['encoded'] = train_df['dirty_text'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
train_df.head()

Unnamed: 0,text,flair,dirty_text,label,text_length,encoded
0,top comments toi article drop us oil prices,Non-Political,Top comments on a TOI article about the drop i...,3,13,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,disappointed,Politics,Disappointed,5,1,"[[15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,hacking networking security 2 books 1 hacking ...,Non-Political,Hacking: Networking and Security (2 Books in 1...,3,16,"[[16, 17, 18, 19, 20, 21, 22, 11, 21, 16, 23, ..."
3,zakir khan irfan junejo live instagram session...,Non-Political,Zakir Khan and Irfan Junejo live Instagram Ses...,3,11,"[[29, 30, 19, 31, 32, 33, 34, 35, 36, 37, 38, ..."
4,cursing quentin tarantino movie,Non-Political,Cursing In A Quentin Tarantino Movie,3,6,"[[39, 11, 5, 40, 41, 42, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# taking the encoded text and label column and then splitting in trained and validation
X = list(train_df['encoded'])
y = list(train_df['label'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [10]:
# Initialising a class for PyTorch style dataset 
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [11]:
# creating training and validation dataset
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

In [12]:
# creating function for training
def train_model(model, epochs=10, lr=0.001):
    # extracting parameters for optimizer to update during training
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # initialising optimizer
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        #shifting model to training mode
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            # passing values to model to get output
            y_pred = model(x, l)
            # initialising gradients data to zero
            optimizer.zero_grad()
            # shifting label to cuda beacuse predctions are also on cuda
            y = y.cuda()
            # calculating loss
            loss = F.cross_entropy(y_pred, y)
            # computing gradients
            loss.backward()
            # applying gradients
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            # calculation metrics data like accuracies and loss
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        # printing after every 5 epochs
        if i % 5 == 0:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

# creating function for validation          
def validation_metrics (model, valid_dl):
    #shiftinh model to evaluation mode, this will disable dropout like layers
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        y = y.cuda()
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred.cpu(), y.cpu().unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total


In [13]:
batch_size = 5000
vocab_size = len(words)
# initialing dataloaders
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [14]:
class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        text = text.cuda()
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = pack_padded_sequence(embedded, text_lengths,batch_first=True, enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [15]:
# creating model with the given values
model = classifier(vocab_size, 100, 32, 7, 3, bidirectional = True, dropout = 0.1)
# shifting model to cuda or GPU
model = model.to("cuda")

In [16]:
# Starting training
train_model(model, epochs=40, lr=0.1)


train loss 1.776, val loss 1.759, val accuracy 0.295, and val rmse 2.701
train loss 1.722, val loss 1.722, val accuracy 0.295, and val rmse 2.701
train loss 1.696, val loss 1.711, val accuracy 0.295, and val rmse 2.701
train loss 1.696, val loss 1.711, val accuracy 0.294, and val rmse 2.701
train loss 1.692, val loss 1.708, val accuracy 0.295, and val rmse 2.700
train loss 1.694, val loss 1.710, val accuracy 0.295, and val rmse 2.698
train loss 1.693, val loss 1.707, val accuracy 0.296, and val rmse 2.696
train loss 1.696, val loss 1.708, val accuracy 0.267, and val rmse 2.549


In [17]:
# saving the trained lstm model  
torch.save(model, "reddit_lstm.pt")

  "type " + obj.__name__ + ". It won't be checked "
