# Preprocessing

In [10]:
import numpy as np
import pandas as pd

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split


class Preprocessing:
    '''
    Loads the tweet CSV, splits it into train/test partitions and turns raw
    text into fixed-length sequences of token indices.

    Typical call order: ``load_data()`` -> ``prepare_tokens()`` ->
    ``sequence_to_token(texts)``.
    '''

    def __init__(self, args):
        '''
        Read the preprocessing settings from ``args``.

        Required attributes on ``args``:
            max_len:   length every token sequence is padded/truncated to.
            max_words: ``num_words`` passed to the Keras ``Tokenizer``.
            test_size: forwarded to ``train_test_split``.

        Optional attributes on ``args``:
            data_path:    CSV file to read (default ``'tweets.csv'``).
            random_state: seed for the train/test split (default ``None``,
                          i.e. a different split on every run).
        '''
        # Optional settings fall back to the previous hard-coded behaviour.
        self.data = getattr(args, 'data_path', 'tweets.csv')
        self.random_state = getattr(args, 'random_state', None)

        self.max_len = args.max_len
        self.max_words = args.max_words
        self.test_size = args.test_size

    def load_data(self):
        '''
        Read the CSV and populate ``x_train``, ``x_test``, ``y_train`` and
        ``y_test``.

        Only the ``text`` and ``target`` columns are used; any other column
        in the file is ignored (a missing ``text``/``target`` column raises
        ``KeyError``).
        '''
        df = pd.read_csv(self.data)

        X = df['text'].values
        Y = df['target'].values

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            X, Y, test_size=self.test_size, random_state=self.random_state
        )

    def prepare_tokens(self):
        '''
        Fit the tokenizer on the training texts only, so that the vocabulary
        is not influenced by the test partition. Requires ``load_data()``.
        '''
        self.tokens = Tokenizer(num_words=self.max_words)
        self.tokens.fit_on_texts(self.x_train)

    def sequence_to_token(self, x):
        '''
        Convert an iterable of texts into an array of token-index sequences,
        each padded/truncated to ``max_len``. Requires ``prepare_tokens()``.
        '''
        sequences = self.tokens.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.Module):
    '''
    Binary text classifier: embedding -> stacked LSTM -> two dense layers
    -> sigmoid.

    ``forward`` takes a ``(batch, seq_len)`` tensor of token indices and
    returns a ``(batch, 1)`` tensor of probabilities in ``[0, 1]``.
    '''

    def __init__(self, args):
        '''
        Build the layers from ``args``.

        Attributes read from ``args``: ``batch_size``, ``hidden_dim``,
        ``lstm_layers`` and ``max_words``.
        '''
        super().__init__()

        self.batch_size = args.batch_size
        self.hidden_dim = args.hidden_dim
        self.LSTM_layers = args.lstm_layers
        # Vocabulary size, i.e. number of rows of the embedding table
        # (the embedding *dimension* is ``hidden_dim``).
        self.input_size = args.max_words

        self.dropout = nn.Dropout(0.5)
        self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=self.hidden_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.LSTM_layers,
            batch_first=True,
        )
        self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=257)
        self.fc2 = nn.Linear(257, 1)

    def forward(self, x):
        '''
        Compute class-1 probabilities for a batch of token-index sequences.

        Args:
            x: integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with sigmoid probabilities.
        '''
        # Allocate the initial states on the same device/dtype as the model
        # weights so the module keeps working after ``.to(device)``.
        weight = self.embedding.weight
        state_shape = (self.LSTM_layers, x.size(0), self.hidden_dim)
        h = torch.zeros(state_shape, device=weight.device, dtype=weight.dtype)
        c = torch.zeros(state_shape, device=weight.device, dtype=weight.dtype)

        # Random initial states are kept as training-time noise only; in
        # eval mode the states stay at zero so inference is deterministic.
        if self.training:
            torch.nn.init.xavier_normal_(h)
            torch.nn.init.xavier_normal_(c)

        out = self.embedding(x)
        out, _ = self.lstm(out, (h, c))
        out = self.dropout(out)
        # Classify from the LSTM output at the last time step.
        out = torch.relu(self.fc1(out[:, -1, :]))
        out = self.dropout(out)

        return torch.sigmoid(self.fc2(out))

In [12]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


from torch.utils.data import Dataset
from torch.utils.data import DataLoader

#from src import parameter_parser


class DatasetMaper(Dataset):
    '''
    Map-style dataset that pairs each input sample with its label so a
    DataLoader can batch them together.
    '''

    def __init__(self, x, y):
        # x: indexable collection of samples; y: indexable collection of labels.
        self.x, self.y = x, y

    def __len__(self):
        # The dataset size is defined by the number of input samples.
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        # Return the (sample, label) pair stored at position idx.
        sample = self.x[idx]
        label = self.y[idx]
        return sample, label
		

class Execute:
    '''
    Class for execution. Initializes the preprocessing as well as the
    Tweet Classifier model, and provides the training / evaluation loops.
    '''

    def __init__(self, args):
        '''
        Prepare the data and build the model.

        ``args`` must provide everything ``Preprocessing`` and
        ``TweetClassifier`` read, plus ``batch_size``; ``train`` additionally
        reads ``learning_rate`` and ``epochs``.
        '''
        self.__init_data__(args)

        self.args = args
        self.batch_size = args.batch_size

        self.model = TweetClassifier(args)

    def __init_data__(self, args):
        '''
        Initialize preprocessing from raw dataset to dataset split into
        training and testing. After this call ``x_train`` / ``x_test`` hold
        padded token-index sequences and ``y_train`` / ``y_test`` the labels.
        '''
        self.preprocessing = Preprocessing(args)
        self.preprocessing.load_data()
        self.preprocessing.prepare_tokens()

        raw_x_train = self.preprocessing.x_train
        raw_x_test = self.preprocessing.x_test

        self.y_train = self.preprocessing.y_train
        self.y_test = self.preprocessing.y_test

        self.x_train = self.preprocessing.sequence_to_token(raw_x_train)
        self.x_test = self.preprocessing.sequence_to_token(raw_x_test)

    def train(self):
        '''
        Train the model for ``self.args.epochs`` epochs with RMSprop and
        binary cross-entropy, printing one summary line per epoch.

        The printed loss is the loss of the last training batch of the epoch.
        '''
        training_set = DatasetMaper(self.x_train, self.y_train)
        test_set = DatasetMaper(self.x_test, self.y_test)

        # The training loader is deliberately not shuffled: the predictions
        # collected below are compared position-by-position with y_train.
        self.loader_training = DataLoader(training_set, batch_size=self.batch_size)
        self.loader_test = DataLoader(test_set, batch_size=self.batch_size)

        # Use the configuration this instance was built with rather than a
        # module-level global that may not exist.
        optimizer = optim.RMSprop(self.model.parameters(), lr=self.args.learning_rate)
        for epoch in range(self.args.epochs):

            predictions = []

            self.model.train()

            for x_batch, y_batch in self.loader_training:

                x = x_batch.type(torch.LongTensor)
                y = y_batch.type(torch.FloatTensor)

                # Flatten (batch, 1) -> (batch,) so the prediction shape
                # matches the target; reshape(-1) also stays 1-D when the
                # batch holds a single sample.
                y_pred = self.model(x).reshape(-1)

                loss = F.binary_cross_entropy(y_pred, y)

                optimizer.zero_grad()

                loss.backward()

                optimizer.step()

                predictions.extend(y_pred.detach().cpu().numpy())

            test_predictions = self.evaluation()

            train_accuracy = self.calculate_accuray(self.y_train, predictions)
            test_accuracy = self.calculate_accuray(self.y_test, test_predictions)

            print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuracy, test_accuracy))

    def evaluation(self):
        '''
        Run the model over the test loader without gradient tracking.

        Returns:
            A flat list with one predicted probability per test sample, in
            loader order. The model is left in eval mode.
        '''
        # Allow evaluation before train() has created the loader.
        loader = getattr(self, 'loader_test', None)
        if loader is None:
            loader = DataLoader(DatasetMaper(self.x_test, self.y_test), batch_size=self.batch_size)
            self.loader_test = loader

        predictions = []
        self.model.eval()
        with torch.no_grad():
            for x_batch, _ in loader:
                x = x_batch.type(torch.LongTensor)

                y_pred = self.model(x).reshape(-1)
                predictions.extend(y_pred.detach().cpu().numpy())

        return predictions

    @staticmethod
    def calculate_accuray(grand_truth, predictions):
        '''
        Fraction of samples whose thresholded prediction matches the label.

        A prediction strictly above 0.5 counts as class 1, anything else as
        class 0. Labels other than 0 or 1 never count as correct. Returns
        0.0 when ``grand_truth`` is empty.
        '''
        total = len(grand_truth)
        if total == 0:
            return 0.0

        correct = 0
        for true, pred in zip(grand_truth, predictions):
            if (pred > 0.5) and (true == 1):
                correct += 1
            elif (pred <= 0.5) and (true == 0):
                correct += 1

        return correct / total
	
# if __name__ == "__main__":
	
# 	args = parameter_parser()
	
# 	execute = Execute(args)
# 	execute.train()

In [13]:
class args:
    # Plain namespace of run settings, used in place of a parsed CLI object.

    # Preprocessing
    max_words = 1000       # tokenizer num_words / embedding table rows
    max_len = 20           # length sequences are padded or truncated to
    test_size = 0.2        # forwarded to train_test_split

    # Model
    hidden_dim = 128       # embedding size and LSTM hidden size
    lstm_layers = 2        # number of stacked LSTM layers

    # Optimisation
    batch_size = 64
    learning_rate = 0.01   # RMSprop learning rate
    epochs = 10

In [14]:
# Run preprocessing (load, split, tokenize) and build the model from `args`.
execute = Execute(args)

In [16]:
# Train for args.epochs epochs; prints loss and train/test accuracy per epoch.
execute.train()



Epoch: 1, loss: 0.55636, Train accuracy: 0.55977, Test accuracy: 0.62968
Epoch: 2, loss: 0.45177, Train accuracy: 0.63990, Test accuracy: 0.68418
Epoch: 3, loss: 0.68899, Train accuracy: 0.75632, Test accuracy: 0.71963
Epoch: 4, loss: 0.12717, Train accuracy: 0.81921, Test accuracy: 0.75968
Epoch: 5, loss: 0.07534, Train accuracy: 0.85665, Test accuracy: 0.75181
Epoch: 6, loss: 0.01073, Train accuracy: 0.88506, Test accuracy: 0.75181
Epoch: 7, loss: 0.02390, Train accuracy: 0.91494, Test accuracy: 0.74655
Epoch: 8, loss: 0.01437, Train accuracy: 0.92397, Test accuracy: 0.73605
Epoch: 9, loss: 0.02795, Train accuracy: 0.92660, Test accuracy: 0.74327
Epoch: 10, loss: 0.04867, Train accuracy: 0.93777, Test accuracy: 0.75181


In [18]:
# Related repositories (presumably the sources this notebook draws on):
#https://github.com/FernandoLpz/Text-Generation-BiLSTM-PyTorch
#https://github.com/AnubhavGupta3377/Text-Classification-Models-Pytorch
# Scratch check: integer floor division.
5//4

1

In [27]:
# Scratch check: strip surrounding whitespace, then parse the string as a float.
float("3".strip())

3.0