In [1]:
import glob
import os
from random import shuffle
from nltk import TreebankWordTokenizer
import gensim
from bs4 import BeautifulSoup
import numpy as np
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D,GlobalMaxPooling1D,GlobalAveragePooling1D
from keras_tqdm import TQDMNotebookCallback


Using TensorFlow backend.


In [None]:
# Limit this process to 50% of the GPU's memory.
# Skip this cell if you want TensorFlow to be able to use more.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.5)
)
session = tf.compat.v1.Session(config=config)

In [None]:
def pre_process_data(filepath, seed=None):
    """Load the labelled review files under ``filepath`` into a shuffled list.

    Every ``*.txt`` file in ``<filepath>/pos`` is labelled 1 and every
    ``*.txt`` file in ``<filepath>/neg`` is labelled 0. Markup is stripped
    from each file with BeautifulSoup and only the remaining text is kept.

    Args:
        filepath: Directory that contains the ``pos`` and ``neg``
            sub-directories.
        seed: Optional seed for the shuffle. ``None`` (the default) shuffles
            with the global ``random`` state, exactly as before; any other
            value gives the same order on every call for the same files.

    Returns:
        A list of ``(label, text)`` tuples in shuffled order. The list is
        empty when neither sub-directory contains a ``.txt`` file.
    """
    # Local import: only needed for the seeded shuffle.
    from random import Random

    dataset = []
    for subdirectory, label in (('pos', 1), ('neg', 0)):
        pattern = os.path.join(filepath, subdirectory, "*.txt")
        # glob gives no ordering guarantee; sorting makes a seeded shuffle
        # reproducible across machines.
        for filename in sorted(glob.glob(pattern)):
            with open(filename, 'r', encoding='utf-8') as f:
                raw = f.read()
            # Name the parser explicitly: without it BeautifulSoup picks
            # whichever parser is installed and emits a warning.
            soup = BeautifulSoup(raw, 'html.parser')
            dataset.append((label, soup.get_text()))

    if seed is None:
        shuffle(dataset)
    else:
        Random(seed).shuffle(dataset)

    return dataset

    


In [None]:
# Load the labelled, shuffled reviews from the aclImdb/train directory.
dataset = pre_process_data('aclImdb/train')

In [None]:
# Load the pre-trained word2vec vectors from the binary GoogleNews file.
# Idea for later: try the same pipeline with characters instead of words.
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Sanity check: print the text, then the label, of the first sample.
print(dataset[0][1])
print(dataset[0][0])

In [None]:
def tokenize_and_vectorize(dataset, vectors=None, tokenizer=None):
    """Turn the text of every sample into a list of word vectors.

    Args:
        dataset: Iterable of ``(label, text)`` pairs; only the text
            (index 1) is read here.
        vectors: Token-to-vector lookup that raises ``KeyError`` for tokens
            it does not know. Defaults to the notebook-level
            ``word_vectors``.
        tokenizer: Object exposing ``tokenize(text)``. Defaults to a fresh
            ``TreebankWordTokenizer``.

    Returns:
        A list with one entry per sample. Each entry is the list of vectors
        for that sample's tokens, in order, with unknown tokens left out
        (so an entry can be empty).
    """
    if vectors is None:
        vectors = word_vectors
    if tokenizer is None:
        tokenizer = TreebankWordTokenizer()

    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(vectors[token])
            except KeyError:
                # Token has no vector: skipping it is intentional.
                continue
        vectorized_data.append(sample_vecs)
    return vectorized_data


def collect_expected(dataset):
    """Return the label (element 0) of every sample in ``dataset``, in order."""
    return [sample[0] for sample in dataset]


In [None]:
# Build the model inputs (one list of word vectors per sample) and the
# matching list of labels, both in the same order as `dataset`.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state=42 fixes the split
# for a given input order.
X_train, X_test, y_train, y_test = train_test_split(vectorized_data, expected, test_size=0.2, random_state=42)

In [None]:
# X_train / X_test are lists of variable-length lists of vectors. np.shape()
# would first convert them to an array, which is slow and raises ValueError
# for ragged input on recent NumPy, so report the sample count directly.
print((len(X_train),))
print(np.shape(y_train))
print((len(X_test),))
print(np.shape(y_test))


In [None]:
def pad_truncate(data, maxlen=400):
    """Return a copy of ``data`` in which every sample has exactly ``maxlen`` vectors.

    Samples longer than ``maxlen`` are cut to their first ``maxlen`` vectors;
    shorter samples are extended at the end with all-zero vectors.

    Args:
        data: Sequence of samples, each a sequence of equal-length vectors.
        maxlen: Number of vectors every returned sample will contain.

    Returns:
        A new list of new lists. Neither ``data`` nor its samples are
        modified, so calling this twice on the same input is safe.

    Note:
        The width of the zero vector is taken from the first sample that is
        not empty. If every sample is empty the width cannot be inferred and
        the padding vectors are empty lists.
    """
    # Use the first non-empty sample; data[0] itself may have no vectors.
    vector_size = next((len(sample[0]) for sample in data if len(sample) > 0), 0)
    zero_vector = [0.0] * vector_size

    new_data = []
    for sample in data:
        # Slicing + list() copies, so padding never touches the caller's list.
        resized = list(sample[:maxlen])
        # A non-positive count yields no padding at all.
        resized.extend([zero_vector] * (maxlen - len(resized)))
        new_data.append(resized)

    return new_data




In [None]:
# Number of vectors kept per sample, and the length of each vector.
# NOTE(review): 300 presumably matches the loaded word2vec vectors - confirm.
maxlen = 400
embedding_dimnsions = 300

# Pass maxlen explicitly so the padding always agrees with the reshape below,
# instead of relying on pad_truncate's default happening to be the same value.
x_train = pad_truncate(X_train, maxlen)
x_test = pad_truncate(X_test, maxlen)

x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dimnsions))
y_train = np.array(y_train)

x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dimnsions))
y_test = np.array(y_test)


In [2]:
# Load the four arrays from .npy files in the working directory, replacing any
# x_train / x_test / y_train / y_test already in memory.
# NOTE(review): nothing visible in this notebook writes these files -
# presumably they were saved from an earlier run of the preprocessing cells;
# confirm they match the current preprocessing before relying on them.
x_train = np.load('x_train.npy')
x_test = np.load('x_test.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

KeyboardInterrupt: 

In [None]:
# Training and architecture hyper-parameters.
batch_size = 16
filters = 250            # number of convolution filters
kernal_size = 3          # width of each convolution window
hidden_dimensions = 250  # units in the hidden dense layer
epochs = 2

model = Sequential()

# Take the per-sample input shape from the data itself. The maxlen /
# embedding_dimnsions globals only exist if the padding cell was run, which
# is not the case when the arrays come from the .npy files.
model.add(Conv1D(
    filters,
    kernal_size,
    padding='valid',
    activation='relu',
    input_shape=x_train.shape[1:],
))

# Keep the strongest response of every filter across the sequence.
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dimensions))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Single sigmoid unit: binary classification output.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)




In [None]:
# Train the model, evaluating on the held-out split after every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)