In [1]:
import numpy as np
import pandas as pd
import torch
import torchtext
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Read the three dataset splits from CSV, keeping only the article text and
# its category label. All splits share the same parsing options.
fields = ['news_article', 'news_category']
csv_options = dict(header=0, encoding='ISO-8859-1', usecols=fields, skip_blank_lines=True)

train_data = pd.read_csv('/content/inshort_news_data-train.csv', **csv_options)
val_data = pd.read_csv('/content/inshort_news_data-val.csv', **csv_options)
test_data = pd.read_csv('/content/inshort_news_data-test.csv', **csv_options)


In [3]:
# Report how many rows each split contains.
for split_name, split_frame in (('training', train_data),
                                ('validation', val_data),
                                ('testing', test_data)):
  print('Num ' + split_name + ' articles: ', len(split_frame))

Num training articles:  6380
Num validation articles:  1560
Num testing articles:  1742


In [4]:
# Separate the article text (model inputs) from the category labels (targets)
# for each split. The labels are one-hot encoded in a later cell.
X_train = train_data['news_article']
Y_train = train_data['news_category']

X_val = val_data['news_article']
Y_val = val_data['news_category']

X_test = test_data['news_article']
Y_test = test_data['news_category']

# Sanity check: every X/Y pair should have the same number of rows.
print (X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape, Y_test.shape)

(6380,) (6380,) (1560,) (1560,) (1742,) (1742,)


In [5]:
# One-hot encode the labels against a single category list shared by all three
# splits, so every split gets the same columns in the same order even when a
# category is absent from one of them. The list is the sorted set of labels
# seen across the splits, which is the column order get_dummies produces on a
# split that contains every category.
# This runs before tokenisation so that an accidental re-run of the cell fails
# here (the Y_* names are arrays by then) before the X_* names are touched.
label_categories = sorted(pd.concat([Y_train, Y_val, Y_test]).dropna().unique())
label_dtype = pd.CategoricalDtype(categories=label_categories)

Y_train = pd.get_dummies(Y_train.astype(label_dtype)).to_numpy()
Y_val = pd.get_dummies(Y_val.astype(label_dtype)).to_numpy()
Y_test = pd.get_dummies(Y_test.astype(label_dtype)).to_numpy()

# Tokenise each article on whitespace. Mapping str.split over the Series builds
# a new Series of token lists instead of writing element by element into the
# Series taken from the source DataFrames, and it does not depend on the index
# being 0..n-1. A non-string entry still raises, as the per-element loop did.
X_train = X_train.map(str.split)
X_val = X_val.map(str.split)
X_test = X_test.map(str.split)

In [6]:
# Inspect how long the articles are: returns the distinct token counts in the
# validation split together with how many articles have each count.
# Swap X_val for X_train or X_test to inspect the other splits.
val_lengths = [len(tokens) for tokens in X_val]
np.unique(val_lengths, return_counts=True)

(array([43, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]),
 array([  1,   2,   2,   6,  11,  15,  22,  18,  39,  72,  90, 107, 159,
        312, 704]))

In [7]:
# Load the stopwords (one per line, surrounding whitespace stripped) that are
# skipped when building the embeddings. The context manager closes the file
# even if reading raises part-way through.
with open('/content/stopwords.txt', encoding="utf-8") as stop:
  stopwords = [line.strip() for line in stop]

In [8]:
# Load the pretrained GloVe "6B" word vectors with 50 dimensions per word.
glove_options = {'name': '6B', 'dim': 50}
glove = torchtext.vocab.GloVe(**glove_options)

In [10]:
def _embed_articles(articles, max_len=61, dim=50):
  """Turn tokenised articles into a zero-padded array of GloVe vectors.

  Args:
    articles: sized iterable of token lists, one list per article.
    max_len: number of token slots per article. Tokens beyond this are
      dropped rather than raising an IndexError.
    dim: length of each word vector; must match the loaded GloVe vectors
      (loaded with dim=50 in this notebook).

  Returns:
    A float64 NumPy array of shape (len(articles), max_len, dim). Slot j of
    article i holds the vector for its j-th token, looked up in lower case.
    Slots for stopwords and slots past the end of the article stay zero.
  """
  # Set membership is O(1) per token; the stopword list would be scanned linearly.
  stop_set = set(stopwords)
  matrix = np.zeros((len(articles), max_len, dim))
  # Iterate by position so the result does not depend on the Series index labels.
  for row, tokens in enumerate(articles):
    for col, token in enumerate(tokens[:max_len]):
      word = token.lower()
      if word not in stop_set:
        matrix[row, col] = glove[word]
  return matrix


# Filling the embedding matrix for each split with the same routine.
embedding_matrix_train = _embed_articles(X_train)
embedding_matrix_val = _embed_articles(X_val)
embedding_matrix_test = _embed_articles(X_test)

In [11]:
# Wrap the NumPy feature and label arrays as float32 torch tensors.
X_train_t, X_val_t, X_test_t = (
  torch.from_numpy(features).float()
  for features in (embedding_matrix_train, embedding_matrix_val, embedding_matrix_test)
)
Y_train_t, Y_val_t, Y_test_t = (
  torch.from_numpy(labels).float()
  for labels in (Y_train, Y_val, Y_test)
)

In [13]:
# Pair each split's features with its labels, then serve every split in
# batches of 128. No shuffle argument is passed, so the DataLoader default applies.
train_dataset, val_dataset, test_dataset = (
  TensorDataset(features, labels)
  for features, labels in ((X_train_t, Y_train_t),
                           (X_val_t, Y_val_t),
                           (X_test_t, Y_test_t))
)

train_dataloader, val_dataloader, test_dataloader = (
  DataLoader(dataset, batch_size=128)
  for dataset in (train_dataset, val_dataset, test_dataset)
)