# Experimentation with Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import glob
import json
import math
import os
import random
import statistics
import svmlight

## Load Pang et al. dataset (2000 reviews)

In [None]:
def get_features_from_file(file_path):
    """Return the list of unigram tokens found in the file at ``file_path``.

    The whole file is read as text and tokenised with ``simple_preprocess``.
    """
    with open(file_path, 'r') as review_file:
        text = review_file.read()
        return simple_preprocess(text)

def get_all_features():
    """Load every ``.tag`` review found under ``../POS`` and ``../NEG``.

    Returns:
        A list of triples ``(sentiment, file_name, list_of_unigrams)``: all
        "POS" reviews in file-name order, followed by all "NEG" reviews in
        file-name order.
    """
    sources = (("POS", "../POS"), ("NEG", "../NEG"))

    # List both directories up front, before any review file is opened.
    listings = []
    for sentiment, directory in sources:
        tag_files = sorted(name for name in os.listdir(directory) if name.endswith(".tag"))
        listings.append((sentiment, directory, tag_files))

    features = []
    for sentiment, directory, tag_files in listings:
        for name in tag_files:
            unigrams = get_features_from_file(directory + "/" + name)
            features.append((sentiment, name, unigrams))
    return features

## Index each word in a set starting from 1

In [None]:
def get_feature_indices(data_set):
    """Map every distinct word in ``data_set`` to an integer index.

    Args:
        data_set: iterable of ``(sentiment, file_name, words)`` triples.

    Returns:
        A dict ``word -> index``. Indices start at 1 and are handed out in
        order of first appearance.
    """
    feature_indices = {}
    for _sentiment, _file_name, words in data_set:
        for word in words:
            # setdefault only stores a value the first time a word is seen,
            # so the next free index is always "current size + 1".
            feature_indices.setdefault(word, len(feature_indices) + 1)
    return feature_indices

## SVM using unigrams + frequency/presence

In [None]:
def svm(training_set, test_set, presence):
    """Train an svmlight model on unigram features and classify ``test_set``.

    Args:
        training_set: list of ``(sentiment, file_name, words)`` triples.
        test_set: list of triples with the same shape.
        presence: if true every word that occurs gets the value 1; otherwise
            the value is the number of occurrences in the document.

    Returns:
        A list of triples ``(predicted_sentiment, actual_sentiment,
        file_name)``, one per element of ``test_set`` and in the same order.
    """
    # Index words over both sets so test-only words also have an index.
    feature_indices = get_feature_indices(training_set + test_set)

    def to_sparse_vector(words):
        # Build the (index, value) pairs of one document, ordered by index.
        weights = {}
        for word in words:
            weights[word] = 1 if presence else weights.get(word, 0) + 1
        pairs = [(feature_indices[word], weight) for word, weight in weights.items()]
        pairs.sort(key=lambda pair: pair[0])
        return pairs

    # Train the SVM model on the training set
    formatted_training_set = []
    for sentiment, _file_name, words in training_set:
        label = -1
        if sentiment == "POS":
            label = 1
        formatted_training_set.append((label, to_sparse_vector(words)))
    model = svmlight.learn(formatted_training_set)

    # Test the SVM model on the test set (the label 0 is a placeholder)
    formatted_test_set = [(0, to_sparse_vector(words)) for _s, _n, words in test_set]
    predictions = svmlight.classify(model, formatted_test_set)

    # Pair each prediction with the actual sentiment and file name
    return [
        ("POS" if predictions[idx] > 0 else "NEG", sentiment, file_name)
        for idx, (sentiment, file_name, _words) in enumerate(test_set)
    ]

## Preprocessing

In [None]:
# Glob patterns selecting every file in each sub-directory of ../aclImdb.
neg_train = '../aclImdb/train/neg/*'
pos_train = '../aclImdb/train/pos/*'
unsup_train = '../aclImdb/train/unsup/*'
neg_test = '../aclImdb/test/neg/*'
pos_test = '../aclImdb/test/pos/*'
# Placeholder; replaced further down by the list of TaggedDocuments.
train_corpus = None


def read_docs(paths_list):
    """Yield one ``TaggedDocument`` per path in ``paths_list``.

    This is a generator. Only the first line of each file is read and
    tokenised, and each document is tagged with the position of its path in
    ``paths_list``.
    """
    for tag, path in enumerate(paths_list):
        with open(path) as doc:
            first_line = doc.readline()
            tokens = simple_preprocess(first_line)

            # We tag each document with its position in the list
            yield TaggedDocument(tokens, [tag])


# Collect the paths matched by each pattern, keeping the pattern order
# neg_train, pos_train, unsup_train, neg_test, pos_test.
train_path_list = []
for _pattern in (neg_train, pos_train, unsup_train, neg_test, pos_test):
    train_path_list.extend(glob.glob(_pattern))

# Materialise the generator so the corpus can be iterated more than once.
train_corpus = list(read_docs(train_path_list))

## Create Doc2Vec model

In [None]:
def create_doc2vec_model(model_file, params):
    """Train a Doc2Vec model on ``train_corpus`` and save it to ``model_file``.

    Args:
        model_file: path the trained model is saved to.
        params: dict providing the keys 'dm', 'vector_size', 'min_count',
            'epochs', 'hs', 'window' and 'negative'.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    workers = 4

    print("Creating Doc2Vec model with:")
    print("Distributed memory: " + str(params['dm']))
    print("Vector size: " + str(params['vector_size']))
    print("Min count: " + str(params['min_count']))
    print("Epochs: " + str(params['epochs']))
    print("Workers: " + str(workers))
    print("Hierarchical softmax: " + str(params['hs']))
    print("Window size: " + str(params['window']))
    print("Negative sampling: " + str(params['negative']))
    print("...")

    # 'negative' used to be reported above but never handed to Doc2Vec, so
    # the value chosen by the search had no effect on training.
    model = Doc2Vec(seed=0, dbow_words=1, dm=params['dm'], vector_size=params['vector_size'],
                    min_count=params['min_count'], epochs=params['epochs'], workers=workers,
                    hs=params['hs'], window=params['window'], negative=params['negative'])
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Drop training-only state while keeping what infer_vector needs.
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    model.save(model_file)

    return model

## SVM using Doc2Vec

In [None]:
def svm_with_doc2vec(doc2vec_model_file, training_set, test_set):
    """Train and evaluate an svmlight classifier on Doc2Vec embeddings.

    Args:
        doc2vec_model_file: path of a saved Doc2Vec model to load.
        training_set: list of ``(sentiment, file_name, words)`` triples.
        test_set: list of triples with the same shape.

    Returns:
        A list of triples ``(predicted_sentiment, actual_sentiment,
        file_name)``, one per element of ``test_set`` and in the same order.
    """
    doc2vec_model = Doc2Vec.load(doc2vec_model_file)

    def embed(words):
        # Infer the document vector and number its components from 1.
        inferred = doc2vec_model.infer_vector(words)
        return [(position + 1, value) for position, value in enumerate(inferred)]

    # Train the SVM model on the training set using Doc2Vec embeddings
    formatted_training_set = []
    for sentiment, _file_name, words in training_set:
        label = -1
        if sentiment == "POS":
            label = 1
        formatted_training_set.append((label, embed(words)))
    svm_model = svmlight.learn(formatted_training_set, type='classification')

    # Test the SVM model on the test set (the label 0 is a placeholder)
    formatted_test_set = [(0, embed(words)) for _s, _n, words in test_set]
    predictions = svmlight.classify(svm_model, formatted_test_set)

    # Pair each prediction with the actual sentiment and file name
    return [
        ("POS" if predictions[idx] > 0 else "NEG", sentiment, file_name)
        for idx, (sentiment, file_name, _words) in enumerate(test_set)
    ]

## Round-Robin splitting

In [None]:
# Number of folds produced by round_robin_split.
fold_count = 10

def round_robin_split(features):
    """Deal ``features`` into ``fold_count`` folds in round-robin order.

    The first half of ``features`` is treated as the positive reviews and the
    second half as the negative ones; the i-th item of each half goes to fold
    ``i % fold_count``, so every fold receives positive and negative items in
    pairs. With an odd number of items the last item is left out.

    Args:
        features: list of items, positives first and negatives second.

    Returns:
        A list of ``fold_count`` lists.
    """
    splits = [[] for _ in range(fold_count)]

    # Floor division keeps the midpoint an int; plain "/" yields a float under
    # true division, which cannot be used as a slice bound or range limit.
    half = len(features) // 2
    pos_features = features[:half]
    neg_features = features[half:]

    for i in range(half):
        fold = splits[i % fold_count]
        fold.append(pos_features[i])
        fold.append(neg_features[i])

    return splits

## Train SVM model using the first split as the validation set and a given Doc2Vec model

In [None]:
def svm_training(model_file):
    """Return the validation accuracy of SVM + Doc2Vec for ``model_file``.

    The dataset from ``get_all_features`` is split with
    ``round_robin_split``; the first split is the validation set and all the
    other splits together form the training set.

    Returns:
        The fraction of validation reviews whose predicted sentiment equals
        the actual one, as a float.
    """
    splits = round_robin_split(get_all_features())

    validation_set = splits[0] if splits else []
    training_set = []
    for split in splits[1:]:
        training_set.extend(split)

    predictions = svm_with_doc2vec(model_file, training_set, validation_set)

    # Each prediction is (predicted_sentiment, actual_sentiment, file_name)
    right_predictions = sum(1 for outcome in predictions if outcome[0] == outcome[1])

    return float(right_predictions) / float(len(predictions))


## Grid search

In [None]:
def grid_search():
    """Train and score 20 randomly sampled Doc2Vec configurations.

    Despite the name, this does not enumerate the whole grid: each of the 20
    rounds draws one value per hyper-parameter at random. Models are saved as
    ``./models/model<N>.modelFile`` with N starting at 65, and each
    configuration and its accuracy is printed and appended to ``better.txt``.
    """
    def pick(values):
        # Uniformly choose one element of a sequence.
        return values[random.randint(0, len(values) - 1)]

    vector_sizes = range(110, 151)
    min_counts = range(0, 16)
    epoch_counts = range(10, 21)
    windows = range(10, 21)
    negatives = range(5, 21)

    model_number = 65
    with open("better.txt", "w+") as results_file:
        for _ in range(20):
            model_file = './models/model' + str(model_number) + '.modelFile'
            params = {
                'dm': 0,
                'vector_size': pick(vector_sizes),
                'min_count': pick(min_counts),
                'epochs': pick(epoch_counts),
                'hs': random.randint(0, 1),
                'window': pick(windows),
                'negative': pick(negatives)
            }
            create_doc2vec_model(model_file, params)
            accuracy = svm_training(model_file)

            header = "Model " + str(model_number)
            print(header + ": " + str(accuracy) + "\n\n")
            results_file.write(header + ":\n" + str(params) + "\nAccuracy: " + str(accuracy) + "\n\n")
            model_number += 1
            
#     with open("resultss.txt", "w+") as fp:
#         for dm in dms:
#             for vector_size in vector_sizes:
#                 for min_count in min_counts:
#                     for epochs in epochss:
#                         for hs in hss:
#                             for window in windows:
#                                 for negative in negatives:
#                                     model_file = './models/model' + str(i) + '.modelFile'
#                                     params = {
#                                         'dm': dm,
#                                         'vector_size': vector_size,
#                                         'min_count': min_count,
#                                         'epochs': epochs,
#                                         'hs': hs,
#                                         'window': window,
#                                         'negative': negative
#                                     }
#                                     create_doc2vec_model(model_file, params)
#                                     accuracy = svm_training(model_file)
#                                     print "Model " + str(i) + ": " + str(accuracy) + "\n\n"
#                                     fp.write("Model " + str(i) + ":\n" + str(params) + "\nAccuracy: " + str(accuracy) + "\n\n")
#                                     i -= 1

# grid_search()

## Remove validation set from the Pang et al. dataset

In [None]:
def remove_validation_set(data_set):
    """Return ``data_set`` without the items of the first round-robin split.

    The first split is the validation set, which is no longer used. The
    remaining items are returned with all "POS" items first and every other
    item after them.
    """
    remaining_pos = []
    remaining_neg = []

    # Skip splits[0]: it is the validation split.
    for split in round_robin_split(data_set)[1:]:
        for sentiment, file_name, features in split:
            target = remaining_pos if sentiment == 'POS' else remaining_neg
            target.append((sentiment, file_name, features))

    return remaining_pos + remaining_neg

# Dataset without the validation split, positives first then negatives.
remaining_data_set = remove_validation_set(get_all_features())
# Folds of the remaining data, used by the cross-validation functions below.
smaller_splits = round_robin_split(remaining_data_set)

## 10-fold cross-validation on the 1800 Pang et al. reviews using SVM with unigrams + frequencies/presence

In [None]:
def predictions_for_index_unigrams(splits, index, presence):
    """Run the unigram SVM with split ``index`` as the test set.

    Every other split is concatenated into the training set. Returns whatever
    ``svm`` returns for that training/test pair.
    """
    test_set = []
    training_set = []

    for position, split in enumerate(splits):
        if position != index:
            training_set.extend(split)
        else:
            test_set = split

    return svm(training_set, test_set, presence)

def aggregate_predictions_unigrams(splits, presence):
    """Concatenate the unigram-SVM predictions of every fold, in fold order."""
    results = []
    for fold_index, _fold in enumerate(splits):
        results += predictions_for_index_unigrams(splits, fold_index, presence)
    return results

def predictions_for_svm_unigrams(presence=False):
    """Return the unigram-SVM predictions over all folds of ``smaller_splits``."""
    all_predictions = aggregate_predictions_unigrams(smaller_splits, presence)
    return all_predictions

def cross_validation_for_index(splits, index, presence):
    """Return the unigram-SVM accuracy when split ``index`` is the test set.

    The accuracy is the fraction of predictions whose predicted sentiment
    equals the actual sentiment.
    """
    predictions = predictions_for_index_unigrams(splits, index, presence)

    # Each prediction is (predicted_sentiment, actual_sentiment, file_name)
    right_predictions = sum(1 for outcome in predictions if outcome[0] == outcome[1])

    return float(right_predictions) / float(len(predictions))

def cross_validation(splits, presence):
    """Return the list of unigram-SVM accuracies, one per fold, in fold order."""
    return [
        cross_validation_for_index(splits, fold_index, presence)
        for fold_index, _fold in enumerate(splits)
    ]

def svm_with_unigrams_cross_validate(presence=False):
    """Print the mean cross-validation accuracy of the unigram SVM.

    ``presence`` selects presence features (true) or frequency features
    (false); the folds come from ``smaller_splits``.
    """
    accuracies = cross_validation(smaller_splits, presence)
    label = ("SVM using unigrams & presence mean accuracy: " if presence
             else "SVM using unigrams & frequencies mean accuracy: ")
    print(label + str(statistics.mean(accuracies)))

## 10-fold cross-validation on the 1800 Pang et al. reviews using the best SVM + Doc2Vec

In [None]:
def predictions_for_index_doc2vec(splits, index):
    """Run SVM + Doc2Vec with split ``index`` as the test set.

    Every other split is concatenated into the training set. The Doc2Vec
    model is always loaded from ``./models/model73.modelFile``.
    """
    test_set = []
    training_set = []

    for position, split in enumerate(splits):
        if position != index:
            training_set.extend(split)
        else:
            test_set = split

    return svm_with_doc2vec('./models/model73.modelFile', training_set, test_set)

def aggregate_predictions_doc2vec(splits):
    """Concatenate the SVM + Doc2Vec predictions of every fold, in fold order."""
    results = []
    for fold_index, _fold in enumerate(splits):
        results += predictions_for_index_doc2vec(splits, fold_index)
    return results

def predictions_for_svm_doc2vec():
    """Return the SVM + Doc2Vec predictions over all folds of ``smaller_splits``."""
    all_predictions = aggregate_predictions_doc2vec(smaller_splits)
    return all_predictions

def cross_validation_for_index_doc2vec(splits, index):
    """Return the SVM + Doc2Vec accuracy when split ``index`` is the test set.

    The accuracy is the fraction of predictions whose predicted sentiment
    equals the actual sentiment.
    """
    predictions = predictions_for_index_doc2vec(splits, index)

    # Each prediction is (predicted_sentiment, actual_sentiment, file_name)
    right_predictions = sum(1 for outcome in predictions if outcome[0] == outcome[1])

    return float(right_predictions) / float(len(predictions))

def cross_validation_doc2vec(splits):
    """Return the list of SVM + Doc2Vec accuracies, one per fold, in fold order."""
    return [
        cross_validation_for_index_doc2vec(splits, fold_index)
        for fold_index, _fold in enumerate(splits)
    ]

def svm_with_doc2vec_cross_validate():
    """Print the mean cross-validation accuracy of SVM + Doc2Vec on ``smaller_splits``."""
    mean_accuracy = statistics.mean(cross_validation_doc2vec(smaller_splits))
    print("SVM using Doc2Vec mean accuracy: " + str(mean_accuracy))

## Permutation test

In [None]:
def compute_accuracy(predictions):
    """Return the fraction of correct predictions as a float.

    Args:
        predictions: sequence of triples ``(predicted_sentiment,
            actual_sentiment, file_name)``.

    Raises:
        ZeroDivisionError: if ``predictions`` is empty.
    """
    right_predictions = 0.0
    for prediction in predictions:
        if prediction[0] == prediction[1]:
            right_predictions += 1.0
    return right_predictions / len(predictions)

def perm_test(predictionsA, predictionsB, R):
    """Paired permutation test on the accuracy difference of two systems.

    Both prediction lists are matched by file name. In each of the ``R``
    rounds every pair of predictions is swapped between the two systems with
    probability 1/2, and the absolute accuracy difference of the permuted
    systems is compared with the observed one.

    Neither input list is modified: the function works on sorted copies.

    Args:
        predictionsA: list of ``(predicted_sentiment, actual_sentiment,
            file_name)`` triples from the first system.
        predictionsB: same, from the second system, covering the same files.
        R: number of random permutations to evaluate.

    Returns:
        The p-value ``(s + 1) / (R + 1)``, where ``s`` is the number of
        permutations whose absolute accuracy difference is at least the
        observed one. Returns ``None`` (after printing a message) if the two
        lists do not cover the same files.
    """
    # Sort copies by file name so the files match, leaving the inputs intact
    sorted_a = sorted(predictionsA, key=lambda x: x[2])
    sorted_b = sorted(predictionsB, key=lambda x: x[2])

    # Check once, up front, that the two systems predicted the same files
    if len(sorted_a) != len(sorted_b):
        print("Prediction lists have different lengths: "
              + str(len(sorted_a)) + " and " + str(len(sorted_b)))
        return None
    for prediction_a, prediction_b in zip(sorted_a, sorted_b):
        if prediction_a[2] != prediction_b[2]:
            print(prediction_a[2] + " and " + prediction_b[2] + " don't match!")
            return None

    # Calculate original mean difference
    mean_difference = abs(compute_accuracy(sorted_a) - compute_accuracy(sorted_b))

    s = 0.0
    for _ in range(R):
        # Start every round from fresh copies of the original pairing, so
        # swaps from earlier rounds do not accumulate.
        new_predictions_a = list(sorted_a)
        new_predictions_b = list(sorted_b)

        for i in range(len(new_predictions_a)):
            # Randomly decide whether to swap the 2 predictions or not
            if random.randint(0, 1):
                new_predictions_a[i], new_predictions_b[i] = (
                    new_predictions_b[i], new_predictions_a[i])

        # Calculate new mean difference
        new_mean_difference = abs(
            compute_accuracy(new_predictions_a) - compute_accuracy(new_predictions_b))

        if new_mean_difference >= mean_difference:
            s += 1.0

    # p-value, computed only after all R permutations have been evaluated
    return (s + 1.0) / (R + 1.0)

## Compare the 3 systems using permutation test

In [None]:
# svm_with_unigrams_cross_validate(presence=False)
# svm_with_unigrams_cross_validate(presence=True)
svm_with_doc2vec_cross_validate()

# Predictions of each system aggregated over all cross-validation folds.
predictions_svm_freqs = predictions_for_svm_unigrams(presence=False)
predictions_svm_pres = predictions_for_svm_unigrams(presence=True)
predictions_svm_doc2vec = predictions_for_svm_doc2vec()

# Pairwise permutation tests, 5000 permutations each.
for _banner, _preds_a, _preds_b in (
    ("Comparing SVM + frequencies with SVM + doc2vec...", predictions_svm_freqs, predictions_svm_doc2vec),
    ("Comparing SVM + presence with SVM + doc2vec...", predictions_svm_pres, predictions_svm_doc2vec),
    ("Comparing SVM + frequencies with SVM + presence...", predictions_svm_freqs, predictions_svm_pres),
):
    print(_banner)
    print("P-value: " + str(perm_test(_preds_a, _preds_b, 5000)))

## Load & preprocess AMAZON datasets

In [None]:
# Category names; categories[i] labels the reviews read from paths[i].
categories = ['beauty', 'food', 'music', 'instruments', 'videos', 'games']

# One file per category, in the same order as `categories`.
paths = [
    '../beauty.json',
    '../food.json',
    '../music.json',
    '../instruments.json',
    '../videos.json',
    '../games.json',
]

# Per-category review lists, filled in place by load_amazon_datasets() with
# (sentiment, review_name, features) triples.
beauty_pos = []
beauty_neg = []
food_pos = []
food_neg = []
music_pos = []
music_neg = []
instruments_pos = []
instruments_neg = []
videos_pos = []
videos_neg = []
games_pos = []
games_neg = []

def load_amazon_datasets():
    """Fill the per-category POS/NEG lists from the files listed in ``paths``.

    Each non-empty line of a file is parsed as one JSON object. A review with
    ``overall`` equal to 5 is stored as "POS" and one with ``overall`` equal
    to 1 as "NEG"; every other rating is skipped. Stored items are triples
    ``(sentiment, review_name, features)`` where ``review_name`` is the
    category name followed by the 1-based position of the review in its file
    (skipped reviews still consume a number) and ``features`` is the
    tokenised ``reviewText``.

    The module-level lists are appended to in place; nothing is returned.
    """
    # Destination lists per category: (positive reviews, negative reviews).
    buckets = {
        'beauty': (beauty_pos, beauty_neg),
        'food': (food_pos, food_neg),
        'music': (music_pos, music_neg),
        'instruments': (instruments_pos, instruments_neg),
        'videos': (videos_pos, videos_neg),
        'games': (games_pos, games_neg),
    }

    for category, path in zip(categories, paths):
        targets = buckets.get(category)

        with open(path) as f:
            idx = 1
            # Stream the file line by line instead of loading it whole.
            for line in f:
                json_obj = line.rstrip('\n')
                if json_obj == '':
                    continue

                data = json.loads(json_obj)
                review_name = category + str(idx)
                idx += 1

                rating = float(data['overall'])
                text = data['reviewText']

                if rating == 5.0:
                    sent, slot = 'POS', 0
                elif rating == 1.0:
                    sent, slot = 'NEG', 1
                else:
                    # Neutral review: not stored, so do not tokenise it.
                    continue

                if targets is None:
                    # Unknown category: nothing to store the review in.
                    continue

                targets[slot].append((sent, review_name, simple_preprocess(text)))

def get_balanced_test_sets(pos_set, neg_set, sample_size=100):
    """Return a balanced random test set drawn from two review lists.

    Args:
        pos_set: sequence of positive reviews.
        neg_set: sequence of negative reviews.
        sample_size: number of reviews drawn from each sequence (default 100,
            i.e. 200 reviews in total).

    Returns:
        A list with ``sample_size`` items sampled without replacement from
        ``pos_set`` followed by ``sample_size`` items sampled without
        replacement from ``neg_set``.

    Raises:
        ValueError: if either sequence holds fewer than ``sample_size`` items.
    """
    pos_sample = random.sample(pos_set, sample_size)
    neg_sample = random.sample(neg_set, sample_size)
    return pos_sample + neg_sample

# Populate the per-category POS/NEG review lists when this cell runs.
load_amazon_datasets()

## Train and predict phases on SVM + freq/pres & SVM + doc2vec

In [None]:
def get_trained_svm(training_set, presence):
    """Train an svmlight classifier on the unigram features of ``training_set``.

    Args:
        training_set: list of ``(sentiment, file_name, words)`` triples.
        presence: if true every word that occurs gets the value 1; otherwise
            the value is the number of occurrences in the document.

    Returns:
        The model returned by ``svmlight.learn``.
    """
    feature_indices = get_feature_indices(training_set)

    # Train the SVM model on the training set
    formatted_training_set = []
    for sentiment, _file_name, words in training_set:
        weights = {}
        for word in words:
            weights[word] = 1 if presence else weights.get(word, 0) + 1

        # (index, value) pairs ordered by feature index
        feature_vec = sorted(
            ((feature_indices[word], weight) for word, weight in weights.items()),
            key=lambda pair: pair[0])

        label = -1
        if sentiment == "POS":
            label = 1
        formatted_training_set.append((label, feature_vec))

    return svmlight.learn(formatted_training_set, type='classification')

def test_svm(svm_model, training_set, test_set, presence):
    """Classify ``test_set`` with an already trained unigram SVM.

    Word indices are rebuilt from ``training_set + test_set``, so words of
    the training set are numbered first and test-only words after them.

    Args:
        svm_model: model passed to ``svmlight.classify``.
        training_set: the triples the model was trained on.
        test_set: list of ``(sentiment, file_name, words)`` triples.
        presence: if true every word that occurs gets the value 1; otherwise
            the value is the number of occurrences in the document.

    Returns:
        A list of triples ``(predicted_sentiment, actual_sentiment,
        file_name)``, one per element of ``test_set`` and in the same order.
    """
    feature_indices = get_feature_indices(training_set + test_set)

    formatted_test_set = []
    for _sentiment, _file_name, words in test_set:
        weights = {}
        for word in words:
            weights[word] = 1 if presence else weights.get(word, 0) + 1

        # (index, value) pairs ordered by feature index
        feature_vec = sorted(
            ((feature_indices[word], weight) for word, weight in weights.items()),
            key=lambda pair: pair[0])
        # The label 0 is a placeholder
        formatted_test_set.append((0, feature_vec))

    predictions = svmlight.classify(svm_model, formatted_test_set)

    # Pair each prediction with the actual sentiment and file name
    return [
        ("POS" if predictions[idx] > 0 else "NEG", sentiment, file_name)
        for idx, (sentiment, file_name, _words) in enumerate(test_set)
    ]

def get_trained_svm_d2v(d2v_file, training_set):
    """Train an svmlight classifier on Doc2Vec embeddings of ``training_set``.

    Args:
        d2v_file: path of a saved Doc2Vec model to load.
        training_set: list of ``(sentiment, file_name, words)`` triples.

    Returns:
        A pair ``(svm_model, doc2vec_model)``.
    """
    doc2vec_model = Doc2Vec.load(d2v_file)

    # Train the SVM model on the training set using Doc2Vec embeddings
    formatted_training_set = []
    for sentiment, _file_name, words in training_set:
        inferred = doc2vec_model.infer_vector(words)
        # Number the vector components from 1
        vec = [(position + 1, value) for position, value in enumerate(inferred)]
        label = -1
        if sentiment == "POS":
            label = 1
        formatted_training_set.append((label, vec))

    svm_model = svmlight.learn(formatted_training_set, type='classification')
    return svm_model, doc2vec_model

def test_svm_d2v(d2v_model, svm_model, test_set):
    """Classify ``test_set`` with a trained SVM over Doc2Vec embeddings.

    Args:
        d2v_model: Doc2Vec model used to infer one vector per review.
        svm_model: model passed to ``svmlight.classify``.
        test_set: list of ``(sentiment, file_name, words)`` triples.

    Returns:
        A list of triples ``(predicted_sentiment, actual_sentiment,
        file_name)``, one per element of ``test_set`` and in the same order.
    """
    formatted_test_set = []
    for _sentiment, _file_name, words in test_set:
        inferred = d2v_model.infer_vector(words)
        # Number the vector components from 1; the label 0 is a placeholder
        vec = [(position + 1, value) for position, value in enumerate(inferred)]
        formatted_test_set.append((0, vec))

    predictions = svmlight.classify(svm_model, formatted_test_set)

    # Pair each prediction with the actual sentiment and file name
    return [
        ("POS" if predictions[idx] > 0 else "NEG", sentiment, file_name)
        for idx, (sentiment, file_name, _words) in enumerate(test_set)
    ]

In [None]:
# Train the three classifiers once on the data left after removing the
# validation split; they are reused for every test set below.
svm_freqs = get_trained_svm(remaining_data_set, presence=False)
svm_pres = get_trained_svm(remaining_data_set, presence=True)
# The Doc2Vec model is returned too because it is needed to embed test reviews.
svm_d2v, d2v_model = get_trained_svm_d2v('./models/model73.modelFile', remaining_data_set)
print "Trained all 3 classifiers!"

## Test the 3 systems on 200 balanced reviews of each category

In [None]:
def test_3_systems(data_set_pos, data_set_neg):
    """Evaluate the three trained classifiers on random balanced test sets.

    Ten times, a balanced test set is drawn from ``data_set_pos`` and
    ``data_set_neg`` and classified by the SVM + frequencies, SVM + presence
    and SVM + Doc2Vec systems. The ten accuracies of each system and the ten
    permutation-test p-values of each pair of systems are printed.

    Args:
        data_set_pos: list of positive ``(sentiment, name, words)`` triples.
        data_set_neg: list of negative triples of the same shape.
    """
    acc1 = []
    acc2 = []
    acc3 = []
    freqs_pres = []
    freqs_doc = []
    pres_doc = []

    for _ in range(10):
        test_set = get_balanced_test_sets(data_set_pos, data_set_neg)
        preds_svm_freqs = test_svm(svm_freqs, remaining_data_set, test_set, presence=False)
        preds_svm_pres = test_svm(svm_pres, remaining_data_set, test_set, presence=True)
        preds_svm_doc2vec = test_svm_d2v(d2v_model, svm_d2v, test_set)

        # Measure the accuracies before running any permutation test, so they
        # always describe the systems' own, untouched predictions.
        acc1.append(compute_accuracy(preds_svm_freqs))
        acc2.append(compute_accuracy(preds_svm_pres))
        acc3.append(compute_accuracy(preds_svm_doc2vec))

        # Hand copies to perm_test: each comparison then starts from the
        # original predictions even if perm_test reorders or swaps the
        # lists it receives.
        freqs_pres.append(perm_test(list(preds_svm_freqs), list(preds_svm_pres), 5000))
        freqs_doc.append(perm_test(list(preds_svm_freqs), list(preds_svm_doc2vec), 5000))
        pres_doc.append(perm_test(list(preds_svm_pres), list(preds_svm_doc2vec), 5000))

    print("SVM+freqs: " + str(acc1))
    print("SVM+pres: " + str(acc2))
    print("SVM+doc: " + str(acc3))
    print("freqs vs. pres: " + str(freqs_pres))
    print("freqs vs. doc: " + str(freqs_doc))
    print("pres vs. doc: " + str(pres_doc))
    print("\n")

# Evaluate the three systems on each Amazon category in turn.
for _title, _pos_reviews, _neg_reviews in (
    ("Beauty:", beauty_pos, beauty_neg),
    ("Grocery and Gourmet Food:", food_pos, food_neg),
    ("Digital music:", music_pos, music_neg),
    ("Musical Instruments:", instruments_pos, instruments_neg),
    ("Amazon Instant Video:", videos_pos, videos_neg),
    ("Video games:", games_pos, games_neg),
):
    print(_title)
    test_3_systems(_pos_reviews, _neg_reviews)