## Task 2: NLP task - Sentiment analysis 

In this NLP task, two ML models are trained and compared: a logistic regression and a CNN. The code for the task follows.

#### Importing libraries

In [1]:
import os
from pathlib import Path
import pickle
import random
import numpy as np
import gensim
import copy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import pandas as pd
import re
from keras.layers import Convolution1D, Flatten, Dropout, Dense
from keras.models import Sequential
from keras.callbacks import History
from keras.models import load_model

Using TensorFlow backend.


#### Filenames and ML models' settings:

In [2]:
# --- Training data: review folders and cache files written/read by the functions below ---
train_posRews_dir = '/home/ana/PycharmProjects/movieReviews/data/train/pos'
train_negRews_dir = '/home/ana/PycharmProjects/movieReviews/data/train/neg'
train_words_list_fname = "trainWords_list.pkl"  # pickled [words, ids] of the training reviews
d2v_model_fname = "d2v.model"                   # saved doc2vec model
CNN_model_fname = "CNNmodel.h5"                 # saved Keras model
CNN_history_fname = 'trainHistory'              # pickled Keras training history

# --- Test data: review folders, cache files and output folder ---
test_posRews_dir = '/home/ana/PycharmProjects/movieReviews/data/test/pos'
test_negRews_dir = '/home/ana/PycharmProjects/movieReviews/data/test/neg'
test_words_list_fname = "testWords_list.pkl"    # pickled [words, ids] of the test reviews
new_d2v_model_fname = "new_d2v.model"           # doc2vec model updated with the test reviews
predictions_folder = '/home/ana/PycharmProjects/movieReviews/data/predictions'

# --- doc2vec settings ---
# Every entry except 'train_epochs' is forwarded to the gensim Doc2Vec constructor;
# 'train_epochs' is the number of epochs passed to Doc2Vec.train().
d2v_settings = {
    'min_count': 1,
    'window': 8,
    'vector_size': 100,
    'sample': 1e-4,
    'negative': 5,
    'workers': 4,
    'train_epochs': 10,
}

# --- CNN settings (arguments of the Keras fit() call) ---
CNN_settings = {
    'batch_size': 128,
    'epochs': 10,
    'verbose': 0,
}

#### Functions' definitions:

First, the functions that process the dataset (the reviews) and extract its sentences, tokenized into words, are defined.

In [3]:
### Obtain a list of (tokenized into words, and pre-processed) documents (each one containing the sentences
#from the corresponding review) from the text files within directory
### A prefix is also added to identify the kind of document from the text files within directory
#(positive or negative review)
def getPreProcessedDocs(dir, prefix, labeled):
    """Collect the pre-processed ``.txt`` documents found under a directory.

    The directory tree rooted at ``dir`` is walked recursively and every file
    whose name ends in ``.txt`` is read and tokenized with
    ``gensim.utils.simple_preprocess``.

    Args:
        dir: root directory to scan (the name shadows the builtin, but it is
            kept because it is part of the function's interface).
        prefix: string used to build document tags of the form
            ``<prefix>_<n>``, where ``n`` counts the tagged documents from 0.
        labeled: when true, each document is wrapped in a gensim
            ``TaggedDocument`` and its file name is recorded; when false, the
            bare token list is stored and no id is recorded.

    Returns:
        ``[words, ids]``: ``words`` holds one entry per document; ``ids``
        holds the file names without the ``.txt`` suffix, aligned with
        ``words`` for labeled documents and empty otherwise.
    """
    words = []
    ids = []

    print("Extracting tokenized sentences...")
    for root, subdirs, files in os.walk(dir):
        # os.walk yields directory entries in arbitrary order; sorting (the
        # sub-directory list in place, as os.walk allows) makes the traversal,
        # and therefore the numbering of the tags, reproducible between runs.
        subdirs.sort()
        for fname in sorted(files):
            if not fname.endswith(".txt"):
                continue

            with open(os.path.join(root, fname), 'r') as infile:
                # pre-processing (word tokenization done by gensim)
                tokens = gensim.utils.simple_preprocess(infile.read())

            if not labeled:
                words.append(tokens)
                continue

            # pre-processing and tagging: ids grows by one per tagged
            # document, so its length is the running document index
            tag = prefix + '_' + str(len(ids))
            words.append(gensim.models.doc2vec.TaggedDocument(tokens, [tag]))
            # the name is known to end in ".txt", so slicing strips exactly
            # that suffix (replaces a regex written with an invalid '\.' escape)
            ids.append(fname[:-len(".txt")])

    return [words, ids]

###Return a randomly re-ordered copy of the tokenized sentences
def sentencePerm(sentences):
    """Return a new list holding the items of ``sentences`` in random order.

    The input is copied before shuffling, so the caller's sequence is left
    untouched. The shuffle uses the module-level ``random`` generator.
    """
    permuted = [*sentences]
    random.shuffle(permuted)
    return permuted

###Build (or reload from a pickle cache) the list of documents used as input of the doc2vec model
def rawText2Sentences_extraction(words_list_fname,posRews_dir,negRews_dir):
    """Return ``[words, ids]`` for the positive and negative review folders.

    If ``words_list_fname`` already exists it is treated as a pickle cache
    holding the ``[words, ids]`` pair and is simply loaded. Otherwise both
    folders are processed with ``getPreProcessedDocs`` (tagged with the
    ``'POS'`` and ``'NEG'`` prefixes respectively), the positive documents are
    placed before the negative ones, and the result is pickled to
    ``words_list_fname`` for later runs.
    """
    # fast path: reuse the cached extraction
    if Path(words_list_fname).exists():
        with open(words_list_fname, "rb") as cache_file:
            words, ids = pickle.load(cache_file)
        return [words, ids]

    # slow path: read the raw reviews, positive folder first
    words, ids = [], []
    for reviews_dir, tag_prefix in ((posRews_dir, 'POS'), (negRews_dir, 'NEG')):
        folder_words, folder_ids = getPreProcessedDocs(reviews_dir, tag_prefix, True)
        words.extend(folder_words)
        ids.extend(folder_ids)

    with open(words_list_fname, "wb") as cache_file:
        pickle.dump([words, ids], cache_file)

    return [words, ids]

Next, the functions that compute the *doc2vec models* are defined. A further function (__docVectors2feats_extraction()__) extracts, from a trained doc2vec model, the features in the format expected by the sentiment analysis classifier.

In [4]:
###Train a doc2vec model (or reload a previously saved one)
def doc2vec_extraction(d2v_model_fname, words_list_fname, train_posRews_dir, train_negRews_dir, d2v_settings):
    """Return ``[d2v_model, ids]`` for the training reviews.

    When both ``d2v_model_fname`` and ``words_list_fname`` exist, the model is
    loaded from disk and ``ids`` is taken from the second element of the
    pickled word list. Otherwise the documents are obtained through
    ``rawText2Sentences_extraction``, a gensim ``Doc2Vec`` model is built from
    ``d2v_settings`` and trained on a shuffled copy of the documents, and the
    model is saved to ``d2v_model_fname``.
    """
    have_cache = Path(d2v_model_fname).exists() and Path(words_list_fname).exists()

    if have_cache:
        # reuse the saved model; only the ids are needed from the word-list pickle
        d2v_model = gensim.models.Doc2Vec.load(d2v_model_fname)
        with open(words_list_fname, "rb") as cache_file:
            ids = pickle.load(cache_file)[1]
        return [d2v_model, ids]

    train_words, ids = rawText2Sentences_extraction(words_list_fname, train_posRews_dir, train_negRews_dir)

    print("Training doc2vec model...")

    # settings forwarded as-is to the Doc2Vec constructor
    constructor_keys = ('min_count', 'window', 'vector_size', 'sample', 'negative', 'workers')
    constructor_args = {key: d2v_settings[key] for key in constructor_keys}

    d2v_model = gensim.models.Doc2Vec(**constructor_args)
    d2v_model.build_vocab(train_words)

    shuffled_words = sentencePerm(train_words)
    d2v_model.train(shuffled_words, total_examples=len(train_words), epochs=d2v_settings['train_epochs'])

    d2v_model.save(d2v_model_fname)

    return [d2v_model, ids]

##Derive a new doc2vec model from an existing one, updating its vocabulary with new reviews
def updatedVocab_doc2vec_extraction(d2v_model, new_d2v_model_fname, new_words_list_fname, new_posRews_dir, new_negRews_dir, d2v_settings):
    """Return ``[new_d2v_model, ids]`` for a new set of reviews.

    When both ``new_d2v_model_fname`` and ``new_words_list_fname`` exist, the
    model is loaded from disk and ``ids`` is read from the second element of
    the pickled word list. Otherwise the new documents are obtained through
    ``rawText2Sentences_extraction``, a deep copy of ``d2v_model`` gets its
    vocabulary updated with them and is trained on a shuffled copy for
    ``d2v_settings['train_epochs']`` epochs, and the result is saved to
    ``new_d2v_model_fname``. ``d2v_model`` itself is not modified.
    """
    have_cache = Path(new_d2v_model_fname).exists() and Path(new_words_list_fname).exists()

    if have_cache:
        new_d2v_model = gensim.models.Doc2Vec.load(new_d2v_model_fname)
        with open(new_words_list_fname, "rb") as cache_file:
            ids = pickle.load(cache_file)[1]
        return [new_d2v_model, ids]

    # NOTE(review): rawText2Sentences_extraction tags these documents POS_<i> / NEG_<i>,
    # the same tag names used for the documents the base model was trained on.
    # Presumably the vectors stored under those tags are then shared between
    # both corpora - confirm this is intended.
    new_train_words, ids = rawText2Sentences_extraction(new_words_list_fname, new_posRews_dir, new_negRews_dir)

    print("Training new doc2vec model, with updated vocabulary...")

    # work on a copy so the caller's model stays as it was
    new_d2v_model = copy.deepcopy(d2v_model)
    new_d2v_model.build_vocab(new_train_words, update=True)

    shuffled_words = sentencePerm(new_train_words)
    new_d2v_model.train(shuffled_words, total_examples=len(new_train_words), epochs=d2v_settings['train_epochs'])

    new_d2v_model.save(new_d2v_model_fname)

    return [new_d2v_model, ids]

###Extract from the doc2vec vectors, vectors in a proper format to be used as input features for the classifier
def docVectors2feats_extraction(doc2vecModel):
    """Build the feature matrix and label vector from a doc2vec model.

    The model is expected to expose ``corpus_count`` (number of documents,
    ``N``), ``layer1_size`` (vector dimension) and item access by tag, with
    documents tagged ``POS_<i>`` and ``NEG_<i>`` for ``i`` in
    ``range(N // 2)``.

    Args:
        doc2vecModel: trained doc2vec model (or any object offering the three
            accesses listed above).

    Returns:
        ``[feats_X, feats_y]``: ``feats_X`` has shape ``(N, dim)`` with the
        ``N // 2`` positive vectors first and the negative ones after them;
        ``feats_y`` has shape ``(N, 1)`` with 1 for positive rows and 0 for
        negative rows. If ``N`` is odd a warning is printed and the last row
        of both arrays is left at zero.
    """
    N = doc2vecModel.corpus_count
    dim = doc2vecModel.layer1_size
    N_pos = N // 2  # number of positive reviews (assumed equal to the negatives)

    # The original `try: N % 2 == 0 / except:` could never trigger, so the
    # warning was dead code; test the condition explicitly instead.
    if N % 2 != 0:
        print("N variable is not an even number")

    feats_X = np.zeros((N, dim))
    feats_y = np.zeros((N, 1))

    # 1 for positive, 0 for negative (the negative rows keep their initial 0)
    feats_y[:N_pos] = 1

    for i in range(N_pos):
        feats_X[i, :] = doc2vecModel['POS_' + str(i)]
        feats_X[N_pos + i, :] = doc2vecModel['NEG_' + str(i)]

    return [feats_X, feats_y]

This next function creates the structure of the *CNN model* used as one of the classifiers checked in this task. This CNN stacks three 1-D convolutional layers followed by two dense layers, with dropout applied before each dense layer.

In [5]:
###Create CNN model
def create_CNN_model(input_dim, input_len):
    """Build the (uncompiled) Keras CNN used as sentiment classifier.

    Layer stack: three ``Convolution1D`` layers (64, 32 and 16 filters, kernel
    size 3, ``padding='same'``, no activation argument), ``Flatten``,
    ``Dropout(0.2)``, ``Dense(180)`` with sigmoid activation, ``Dropout(0.2)``
    and a final ``Dense(1)`` with sigmoid activation.

    Args:
        input_dim: first component of the ``input_shape`` given to the first
            convolution.
        input_len: second component of that ``input_shape``.

    Returns:
        The ``Sequential`` model; compiling and fitting are left to the caller.
    """
    CNN_model = Sequential()

    # convolutional part: only the first layer declares the input shape
    CNN_model.add(Convolution1D(64, 3, input_shape=(input_dim, input_len), padding='same'))
    for n_filters in (32, 16):
        CNN_model.add(Convolution1D(n_filters, 3, padding='same'))
    CNN_model.add(Flatten())
    CNN_model.add(Dropout(0.2))

    # dense part
    CNN_model.add(Dense(180, activation='sigmoid'))
    CNN_model.add(Dropout(0.2))

    # single sigmoid unit for the binary (positive / negative) output
    CNN_model.add(Dense(1, activation='sigmoid'))

    return CNN_model


#### Main:

#### doc2vec model training

First, a doc2vec model is trained from the training set. The function __doc2vec_extraction()__ performs the training. Within this function, the following is done:

1. Extract the sentences from the reviews into lists and pre-process them prior to doc2vec model training (word tokenization, lowercasing, etc.). A tag is added to each review document (using the Gensim object *TaggedDocument*), indicating whether the review is positive or negative. (functions used: __getPreProcessedDocs()__ and __rawText2Sentences_extraction()__)

2. Train the doc2vec model using the Gensim library (function: __doc2vec_extraction()__).


In [6]:
#Train the doc2vec model on the training reviews (or reload it from disk);
#only the model is kept here, the list of review ids returned with it is not needed
d2v_model = doc2vec_extraction(
    d2v_model_fname,
    train_words_list_fname,
    train_posRews_dir,
    train_negRews_dir,
    d2v_settings,
)[0]

Extracting tokenized sentences...
Extracting tokenized sentences...
Training doc2vec model...


Once the model is trained, this one can be checked to see if it works properly.

In [7]:
#checking that d2v model works: words close to a clearly positive and a clearly negative word
#(most_similar is called on the model's word vectors, `wv`; calling it on the model itself is deprecated in gensim)
w1='excellent'
print(w1 + ":")
print(d2v_model.wv.most_similar(positive=[w1]))

w2='awful'
print(w2 + ":")
print(d2v_model.wv.most_similar(positive=[w2]))

excellent:


  after removing the cwd from sys.path.


[('outstanding', 0.8744109869003296), ('superb', 0.8085336685180664), ('fantastic', 0.8015602231025696), ('fine', 0.783616840839386), ('terrific', 0.7822716236114502), ('brilliant', 0.7714279294013977), ('great', 0.7692869305610657), ('wonderful', 0.7537065744400024), ('stellar', 0.7373148202896118), ('exceptional', 0.7346345782279968)]
awful:
[('terrible', 0.8614509105682373), ('horrible', 0.8499147295951843), ('dreadful', 0.79315584897995), ('horrendous', 0.788529634475708), ('atrocious', 0.7601053714752197), ('bad', 0.7578982710838318), ('lousy', 0.7576748132705688), ('abysmal', 0.7467353343963623), ('laughable', 0.7288968563079834), ('sucks', 0.7165589928627014)]


  


#### Training of review classifier (sentiment analysis classifier)

For this, two ML classifiers are trained and compared using validation data, and the one that performs better is chosen for prediction of the test set. The two models are a logistic regression and a CNN.

1. Prior to training, features are extracted from the trained doc2vec model, such that those features can be used as input for the classifiers.

In [8]:
#Feature extraction for the review classifiers (training and prediction):

# training features and labels, read from the doc2vec model trained above
[X_train, y_train] = docVectors2feats_extraction(d2v_model)

# test features and labels, read from the doc2vec model returned by
# updatedVocab_doc2vec_extraction for the test reviews; ids_test holds the review ids
new_d2v_model, ids_test = updatedVocab_doc2vec_extraction(
    d2v_model,
    new_d2v_model_fname,
    test_words_list_fname,
    test_posRews_dir,
    test_negRews_dir,
    d2v_settings,
)
[X_test, y_test] = docVectors2feats_extraction(new_d2v_model)

Extracting tokenized sentences...




Extracting tokenized sentences...
Training new doc2vec model, with updated vocabulary...


2. Model selection between logistic regression and the CNN is done using validation data. So the models are trained and:
            * For logistic regression, the model is evaluated using crossvalidation (k-folding, with k=5)
            * In the case of CNN, to avoid computationally expensive computations, just plain split of the training set in two parts is used: 60% is used for training, while 40% of the training set is used for validation.           

In [9]:
#Model selection

# 1 - logistic regression (default settings), scored with 5-fold cross-validation
logRegr = LogisticRegression()

# labels are flattened to 1-D before being handed to scikit-learn
logRegr_scores = cross_val_score(logRegr, X_train, y=np.ravel(y_train), cv=5)
print(
    "Logistic Regression - Crossvalidation) Accuracy: %0.4f (+/- %0.4f)"
    % (logRegr_scores.mean(), logRegr_scores.std() * 2)
)

# 2 - CNN

#CNN training (or reload of a previously saved model together with its training history):

if Path(CNN_model_fname).exists() and Path(CNN_history_fname).exists():
    CNN_model = load_model(CNN_model_fname)
    with open(CNN_history_fname, 'rb') as fp:
        CNN_history = pickle.load(fp)
else:
    #reshape data for input on the CNN model: (n_samples, n_features) -> (n_samples, 1, n_features)
    X_train_resh = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))

    #split train data into new_train and validation sets
    #(cross-validation not used here, due to the computational cost)
    [X_train_new, X_val, y_train_new, y_val] = train_test_split(X_train_resh, y_train, test_size=0.4, random_state=0)

    col_len = X_train_new.shape[2]
    row_len = X_train_new.shape[1]

    CNN_model = create_CNN_model(row_len, col_len)
    CNN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    #training (the History callback collects the per-epoch metrics)
    history = History()

    print("Training CNN...")
    CNN_model.fit(X_train_new, y_train_new, batch_size=CNN_settings['batch_size'], epochs=CNN_settings['epochs'], verbose=CNN_settings['verbose'], validation_data=(X_val, y_val), callbacks=[history])

    CNN_history = history.history

    with open(CNN_history_fname, 'wb') as fp:
        pickle.dump(CNN_history, fp)

    CNN_model.save(CNN_model_fname)

#validation accuracy of the last epoch; depending on the Keras version the history
#stores it under 'val_acc' or 'val_accuracy', so accept either key
val_acc_key = 'val_acc' if 'val_acc' in CNN_history else 'val_accuracy'
val_acc = CNN_history[val_acc_key][-1]

print('CNN - Validation accuracy: %0.4f ' % val_acc)




Logistic Regression - Crossvalidation) Accuracy: 0.8458 (+/- 0.0173)
Training CNN...
CNN - Validation accuracy: 0.8461 


Both models have very similar accuracy scores, therefore logistic regression is chosen since it is computationally less expensive.

3. Next the logistic regression is re-trained using the whole training data.
4. The sentiment (positive or negative) of the reviews in the test set is then predicted using the trained classifier. (Predictions are formatted in this script such that: 1 - positive and 0 - negative.)
5. Finally, classification scores are obtained. Accuracy score is used.

In [10]:
#Final logistic regression: fit on the whole training set
train_labels = np.ravel(y_train)
logRegr.fit(X_train, train_labels)

#Predicted sentiment for every test review
y_predictd_test = logRegr.predict(X_test)

#Accuracy on the test set
test_accuracy = logRegr.score(X_test, np.ravel(y_test))
print("Logistic Regression - Test Accuracy: %0.4f " % test_accuracy)


Logistic Regression - Test Accuracy: 0.8951 


The predictions are then exported into a csv file using Pandas library.

In [11]:
#Export the test-set predictions to a csv file: one row per review id
pred_df = pd.DataFrame({
    'Review_ID': ids_test,
    'Sentiment': y_predictd_test,  # 1 positive, 0 negative
})

# create the output folder on first use
if not os.path.exists(predictions_folder):
    os.makedirs(predictions_folder)

predictions_csv = predictions_folder + '/movieReviews_preds.csv'
pred_df.to_csv(predictions_csv, index=False)
