# Preprocessing

In [2]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

class Preprocessing:
	
	def __init__(self, num_words, seq_len):
		self.data = 'tweets.csv'
		self.num_words = num_words
		self.seq_len = seq_len
		self.vocabulary = None
		self.x_tokenized = None
		self.x_padded = None
		self.x_raw = None
		self.y = None
		
		self.x_train = None
		self.x_test = None
		self.y_train = None
		self.y_test = None
		
	def load_data(self):
		# Reads the raw csv file and split into
		# sentences (x) and target (y)
		
		df = pd.read_csv(self.data)
		df.drop(['id','keyword','location'], axis=1, inplace=True)
		
		self.x_raw = df['text'].values
		self.y = df['target'].values
		
	def clean_text(self):
		# Removes special symbols and just keep
		# words in lower or upper form
		
		self.x_raw = [x.lower() for x in self.x_raw]
		self.x_raw = [re.sub(r'[^A-Za-z]+', ' ', x) for x in self.x_raw]
		
	def text_tokenization(self):
		# Tokenizes each sentence by implementing the nltk tool
	   self.x_raw = [word_tokenize(x) for x in self.x_raw]
	   
	def build_vocabulary(self):
		# Builds the vocabulary and keeps the "x" most frequent words
	   self.vocabulary = dict()
	   fdist = nltk.FreqDist()
	   
	   for sentence in self.x_raw:
	      for word in sentence:
	         fdist[word] += 1
	         
	   common_words = fdist.most_common(self.num_words)
	   
	   for idx, word in enumerate(common_words):
	      self.vocabulary[word[0]] = (idx+1)
	      
	def word_to_idx(self):
		# By using the dictionary (vocabulary), it is transformed
		# each token into its index based representation
		
	   self.x_tokenized = list()
	   
	   for sentence in self.x_raw:
	      temp_sentence = list()
	      for word in sentence:
	         if word in self.vocabulary.keys():
	            temp_sentence.append(self.vocabulary[word])
	      self.x_tokenized.append(temp_sentence)
	      
	def padding_sentences(self):
		# Each sentence which does not fulfill the required len
		# it's padded with the index 0
		
	   pad_idx = 0
	   self.x_padded = list()
	   
	   for sentence in self.x_tokenized:
	      while len(sentence) < self.seq_len:
	         sentence.insert(len(sentence), pad_idx)
	      self.x_padded.append(sentence)
	   
	   self.x_padded = np.array(self.x_padded)
	   
	def split_data(self):
		self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_padded, self.y, test_size=0.25, random_state=42)

# Model Definition

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextClassifier(nn.ModuleList):

	def __init__(self, params):
		super(TextClassifier, self).__init__()

		# Parameters regarding text preprocessing
		self.seq_len = params.seq_len
		self.num_words = params.num_words
		self.embedding_size = params.embedding_size
		
		# Dropout definition
		self.dropout = nn.Dropout(0.25)
		
		# CNN parameters definition
		# Kernel sizes
		self.kernel_1 = 2
		self.kernel_2 = 3
		self.kernel_3 = 4
		self.kernel_4 = 5
		
		# Output size for each convolution
		self.out_size = params.out_size
		# Number of strides for each convolution
		self.stride = params.stride
		
		# Embedding layer definition
		self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)
		
		# Convolution layers definition
		self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
		self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
		self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
		self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)
		
		# Max pooling layers definition
		self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
		self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
		self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
		self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)
		
		# Fully connected layer definition
		self.fc = nn.Linear(self.in_features_fc(), 1)

		
	def in_features_fc(self):
		'''Calculates the number of output features after Convolution + Max pooling
			
		Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
		Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
		
		source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
		'''
		# Calcualte size of convolved/pooled features for convolution_1/max_pooling_1 features
		out_conv_1 = ((self.embedding_size - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
		out_conv_1 = math.floor(out_conv_1)
		out_pool_1 = ((out_conv_1 - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
		out_pool_1 = math.floor(out_pool_1)
		
		# Calcualte size of convolved/pooled features for convolution_2/max_pooling_2 features
		out_conv_2 = ((self.embedding_size - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
		out_conv_2 = math.floor(out_conv_2)
		out_pool_2 = ((out_conv_2 - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
		out_pool_2 = math.floor(out_pool_2)
		
		# Calcualte size of convolved/pooled features for convolution_3/max_pooling_3 features
		out_conv_3 = ((self.embedding_size - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
		out_conv_3 = math.floor(out_conv_3)
		out_pool_3 = ((out_conv_3 - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
		out_pool_3 = math.floor(out_pool_3)
		
		# Calcualte size of convolved/pooled features for convolution_4/max_pooling_4 features
		out_conv_4 = ((self.embedding_size - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
		out_conv_4 = math.floor(out_conv_4)
		out_pool_4 = ((out_conv_4 - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
		out_pool_4 = math.floor(out_pool_4)
		
		# Returns "flattened" vector (input for fully connected layer)
		return (out_pool_1 + out_pool_2 + out_pool_3 + out_pool_4) * self.out_size
		
		
		
	def forward(self, x):

		# Sequence of tokes is filterd through an embedding layer
		x = self.embedding(x)
		
		# Convolution layer 1 is applied
		x1 = self.conv_1(x)
		x1 = torch.relu(x1)
		x1 = self.pool_1(x1)
		
		# Convolution layer 2 is applied
		x2 = self.conv_2(x)
		x2 = torch.relu((x2))
		x2 = self.pool_2(x2)
	
		# Convolution layer 3 is applied
		x3 = self.conv_3(x)
		x3 = torch.relu(x3)
		x3 = self.pool_3(x3)
		
		# Convolution layer 4 is applied
		x4 = self.conv_4(x)
		x4 = torch.relu(x4)
		x4 = self.pool_4(x4)
		
		# The output of each convolutional layer is concatenated into a unique vector
		union = torch.cat((x1, x2, x3, x4), 2)
		union = union.reshape(union.size(0), -1)

		# The "flattened" vector is passed through a fully connected layer
		out = self.fc(union)
		# Dropout is applied		
		out = self.dropout(out)
		# Activation function is applied
		out = torch.sigmoid(out)
		
		return out.squeeze()

# Parameters

In [4]:
from dataclasses import dataclass

@dataclass
class Parameters:
   # Preprocessing parameeters
   seq_len: int = 35
   num_words: int = 2000
   
   # Model parameters
   embedding_size: int = 64
   out_size: int = 32
   stride: int = 2
   
   # Training parameters
   epochs: int = 10
   batch_size: int = 12
   learning_rate: float = 0.001

In [5]:
import torch
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

class DatasetMaper(Dataset):

	def __init__(self, x, y):
		self.x = x
		self.y = y
		
	def __len__(self):
		return len(self.x)
		
	def __getitem__(self, idx):
		return self.x[idx], self.y[idx]

class Run:
	'''Training, evaluation and metrics calculation'''

	@staticmethod
	def train(model, data, params):
		
		# Initialize dataset maper
		train = DatasetMaper(data['x_train'], data['y_train'])
		test = DatasetMaper(data['x_test'], data['y_test'])
		
		# Initialize loaders
		loader_train = DataLoader(train, batch_size=params.batch_size)
		loader_test = DataLoader(test, batch_size=params.batch_size)
		
		# Define optimizer
		optimizer = optim.RMSprop(model.parameters(), lr=params.learning_rate)
		
		# Starts training phase
		for epoch in range(params.epochs):
			# Set model in training model
			model.train()
			predictions = []
			# Starts batch training
			for x_batch, y_batch in loader_train:
			
				y_batch = y_batch.type(torch.FloatTensor)
				
				# Feed the model
				y_pred = model(x_batch)
				
				# Loss calculation
				loss = F.binary_cross_entropy(y_pred, y_batch)
				
				# Clean gradientes
				optimizer.zero_grad()
				
				# Gradients calculation
				loss.backward()
				
				# Gradients update
				optimizer.step()
				
				# Save predictions
				predictions += list(y_pred.detach().numpy())
			
			# Evaluation phase
			test_predictions = Run.evaluation(model, loader_test)
			
			# Metrics calculation
			train_accuary = Run.calculate_accuray(data['y_train'], predictions)
			test_accuracy = Run.calculate_accuray(data['y_test'], test_predictions)
			print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuary, test_accuracy))
			
	@staticmethod
	def evaluation(model, loader_test):
		
		# Set the model in evaluation mode
		model.eval()
		predictions = []
		
		# Starst evaluation phase
		with torch.no_grad():
			for x_batch, y_batch in loader_test:
				y_pred = model(x_batch)
				predictions += list(y_pred.detach().numpy())
		return predictions
		
	@staticmethod
	def calculate_accuray(grand_truth, predictions):
		# Metrics calculation
		true_positives = 0
		true_negatives = 0
		for true, pred in zip(grand_truth, predictions):
			if (pred >= 0.5) and (true == 1):
				true_positives += 1
			elif (pred < 0.5) and (true == 0):
				true_negatives += 1
			else:
				pass
		# Return accuracy
		return (true_positives+true_negatives) / len(grand_truth)
		

In [6]:
class Controller(Parameters):
	
	def __init__(self):
		# Preprocessing pipeline
		self.data = self.prepare_data(Parameters.num_words, Parameters.seq_len)
		
		# Initialize the model
		self.model = TextClassifier(Parameters)
		
		# Training - Evaluation pipeline
		Run().train(self.model, self.data, Parameters)
		
		
	@staticmethod
	def prepare_data(num_words, seq_len):
		# Preprocessing pipeline
		pr = Preprocessing(num_words, seq_len)
		pr.load_data()
		pr.clean_text()
		pr.text_tokenization()
		pr.build_vocabulary()
		pr.word_to_idx()
		pr.padding_sentences()
		pr.split_data()

		return {'x_train': pr.x_train, 'y_train': pr.y_train, 'x_test': pr.x_test, 'y_test': pr.y_test}
		
# if __name__ == '__main__':
# 	controller = Controller()

In [7]:
controller = Controller()

Epoch: 1, loss: 0.47676, Train accuracy: 0.57278, Test accuracy: 0.65231
Epoch: 2, loss: 0.31520, Train accuracy: 0.66124, Test accuracy: 0.70378


KeyboardInterrupt: 