In [2]:
import re
import os
import math
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import random
import numpy
import pandas as pd
from gensim.models import KeyedVectors
from numpy import array

In [8]:
# Load the pretrained word2vec embeddings from the binary GoogleNews file.
# `limit` is gensim's cap on how many word vectors are read from the file.
vec_file = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=50000,
)

In [11]:
import pickle

# Persist the loaded embeddings. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
with open('vectors.sav', 'wb') as vectors_out:
    pickle.dump(vec_file, vectors_out)

In [9]:
def clean_and_tokenize_file(file_name):
    """Read a text file and return its purely alphanumeric tokens.

    The file is decoded as latin-1 and split on whitespace. A token is kept
    only if it consists entirely of word characters (``\\w``: letters,
    digits, underscore), so anything with attached punctuation such as
    ``"world!"`` or ``"don't"`` is dropped rather than stripped.

    Args:
        file_name: Path of the file to read.

    Returns:
        list[str]: The surviving tokens, in file order.
    """
    # Context manager so the handle is closed promptly; this function is
    # called once per review file, i.e. tens of thousands of times.
    with open(file_name, encoding='latin-1') as file:
        raw_text = file.read()
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    word_pattern = re.compile(r'\w+')
    return [token for token in raw_text.split() if word_pattern.fullmatch(token)]


def word2vec_vector(word2vec, review, tag):
    """Represent one review file as the mean of its word embeddings.

    Args:
        word2vec: Embedding lookup supporting ``word in word2vec`` and
            ``word2vec[word]``. Its ``vector_size`` attribute is read only
            when the review has no in-vocabulary tokens.
        review: Path of the review file to tokenize.
        tag: Label for the review. ``0`` means "unlabelled".

    Returns:
        The mean embedding alone when ``tag == 0``, otherwise the tuple
        ``(mean_embedding, tag)``.
    """
    tokenized_review = clean_and_tokenize_file(review)
    # Out-of-vocabulary tokens are skipped.
    vec_list = [word2vec[word] for word in tokenized_review if word in word2vec]
    if vec_list:
        mean_vector = numpy.mean(numpy.array(vec_list), axis=0)
    else:
        # numpy.mean over an empty array returns a scalar NaN (with a
        # RuntimeWarning) instead of a vector; use a zero vector of the
        # embedding width so every review yields a same-shaped feature.
        mean_vector = numpy.zeros(word2vec.vector_size, dtype=numpy.float32)
    if tag == 0:
        return mean_vector
    return mean_vector, tag


def get_all_word2vec_vectors(pos_dir, neg_dir, word2vec, seed=None):
    """Build shuffled ``(mean_embedding, label)`` pairs for two review folders.

    Every entry of ``pos_dir`` is labelled ``1`` and every entry of
    ``neg_dir`` is labelled ``-1``; entries named ``.DS_Store`` are ignored.

    Args:
        pos_dir: Directory of positive reviews (trailing separator optional).
        neg_dir: Directory of negative reviews (trailing separator optional).
        word2vec: Embedding lookup forwarded to ``word2vec_vector``.
        seed: Optional seed for a private RNG so the shuffle is repeatable.
            When ``None`` (the default) the global ``random`` state is used.

    Returns:
        list: Shuffled list of ``(mean_embedding, label)`` tuples.
    """
    def _labelled_vectors(directory, tag):
        # os.path.join works with or without a trailing separator, unlike
        # plain string concatenation. Sorting makes the pre-shuffle order
        # independent of the filesystem's listing order.
        return [
            word2vec_vector(word2vec, os.path.join(directory, name), tag)
            for name in sorted(os.listdir(directory))
            if name != '.DS_Store'
        ]

    feature_vectors = _labelled_vectors(pos_dir, 1) + _labelled_vectors(neg_dir, -1)
    if seed is None:
        random.shuffle(feature_vectors)
    else:
        random.Random(seed).shuffle(feature_vectors)
    return feature_vectors



In [12]:
# Labelled, shuffled training features built from the train/pos and
# train/neg review folders.
vectors = get_all_word2vec_vectors(
    pos_dir='train/pos/',
    neg_dir='train/neg/',
    word2vec=vec_file,
)

In [13]:
def train_mlp(vectors, first_layer, alpha_value, activation_function,
              second_layer=10, random_state=None):
    """Fit a two-hidden-layer MLPClassifier on labelled feature vectors.

    Args:
        vectors: Sequence of ``(feature_vector, label)`` pairs.
        first_layer: Number of units in the first hidden layer.
        alpha_value: Value passed to MLPClassifier as ``alpha``.
        activation_function: Value passed to MLPClassifier as ``activation``.
        second_layer: Number of units in the second hidden layer. Defaults
            to 10, the width that was previously hard-coded.
        random_state: Forwarded to MLPClassifier; pass an int to make the
            fit repeatable. ``None`` keeps the classifier's default.

    Returns:
        The fitted MLPClassifier.
    """
    x = [feature[0] for feature in vectors]
    y = [feature[1] for feature in vectors]
    model = MLPClassifier(
        hidden_layer_sizes=(first_layer, second_layer),
        alpha=alpha_value,
        activation=activation_function,
        random_state=random_state,
    )
    model.fit(x, y)
    return model

def test_mlp(model, test):
    """Score a fitted model on labelled samples.

    Args:
        model: Object exposing ``predict``.
        test: Sequence of ``(feature_vector, label)`` pairs.

    Returns:
        The value of ``accuracy_score`` for the true labels versus the
        model's predictions.
    """
    def _column(position):
        # Collect one component (0 = features, 1 = label) of every sample.
        return array([sample[position] for sample in test])

    inputs = _column(0)
    expected = _column(1)
    return accuracy_score(expected, model.predict(inputs))

def cross_validate(features, num_chunks, first_layer, alpha_value, activation_function):
    """Run k-fold cross-validation and return the mean fold accuracy.

    ``features`` is cut into ``num_chunks`` consecutive folds of
    ``floor(len(features) / num_chunks)`` samples; the last fold also takes
    any leftover samples. For each fold a fresh model is trained on the
    other samples and scored on the held-out fold. The number of samples is
    printed once before the folds run.

    Args:
        features: Sequence of ``(feature_vector, label)`` pairs.
        num_chunks: Number of folds.
        first_layer, alpha_value, activation_function: Forwarded to
            ``train_mlp``.

    Returns:
        The unweighted mean of the per-fold accuracies.
    """
    total = len(features)
    chunk_size = math.floor(total / num_chunks)
    accuracy_sum = 0
    print(total)
    last_fold = num_chunks - 1
    for fold in range(num_chunks):
        start = fold * chunk_size
        # Only a non-first final fold stretches to the end of the data.
        stop = total if 0 < fold == last_fold else start + chunk_size
        held_out = features[start:stop]
        before, after = features[:start], features[stop:]
        if fold == 0:
            training = after
        elif fold == last_fold:
            training = before
        else:
            training = before + after
        fold_model = train_mlp(training, first_layer, alpha_value, activation_function)
        accuracy_sum += test_mlp(fold_model, held_out)
    return accuracy_sum / num_chunks

def grid_search(output_file, features, first_layer_options, alpha_value_options,
                activation_function_options, num_chunks=10):
    """Cross-validate every hyperparameter combination and log the results.

    Each combination's mean accuracy is printed and also written to
    ``output_file`` (one line per combination, each preceded by a newline).

    Args:
        output_file: Path of the results file; it is overwritten.
        features: Sequence of ``(feature_vector, label)`` pairs.
        first_layer_options: Candidate first-hidden-layer sizes.
        alpha_value_options: Candidate ``alpha`` values.
        activation_function_options: Candidate activation names.
        num_chunks: Number of cross-validation folds. Defaults to 10, the
            value that was previously hard-coded.
    """
    # `with` closes the results file even if a fold raises mid-search.
    with open(output_file, 'w') as file:
        for first_layer in first_layer_options:
            for alpha_value in alpha_value_options:
                for activation_function in activation_function_options:
                    accuracy = cross_validate(features, num_chunks, first_layer, alpha_value, activation_function)
                    message = 'ACCURACY IS {0} FOR FIRST LAYER {1}, ALPHA VALUE {2}, ACTIVATION FUNCTION {3}'.format(accuracy, first_layer, alpha_value, activation_function)
                    print(message)
                    file.write('\n' + message)
                    # Flush so finished results are on disk if a later
                    # combination fails or the run is interrupted.
                    file.flush()



In [8]:
# Hyperparameter grid: every combination of first-layer size, alpha and
# activation below is cross-validated by grid_search.
hidden_layers = [5, 10, 15, 20]
alphas = [0.01, 0.1, 1]
activations = ['tanh', 'relu', 'logistic']

grid_search(
    output_file='grid_search_results.txt',
    features=vectors,
    first_layer_options=hidden_layers,
    alpha_value_options=alphas,
    activation_function_options=activations,
)

25000
ACCURACY IS 0.8417200000000001 FOR FIRST LAYER 5, ALPHA VALUE 0.01, ACTIVATION FUNCTION tanh
25000




ACCURACY IS 0.8423200000000002 FOR FIRST LAYER 5, ALPHA VALUE 0.01, ACTIVATION FUNCTION relu
25000




ACCURACY IS 0.84152 FOR FIRST LAYER 5, ALPHA VALUE 0.01, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8402000000000001 FOR FIRST LAYER 5, ALPHA VALUE 0.1, ACTIVATION FUNCTION tanh
25000
ACCURACY IS 0.84084 FOR FIRST LAYER 5, ALPHA VALUE 0.1, ACTIVATION FUNCTION relu
25000




ACCURACY IS 0.7738 FOR FIRST LAYER 5, ALPHA VALUE 0.1, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8346 FOR FIRST LAYER 5, ALPHA VALUE 1, ACTIVATION FUNCTION tanh
25000
ACCURACY IS 0.8354800000000001 FOR FIRST LAYER 5, ALPHA VALUE 1, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.4936 FOR FIRST LAYER 5, ALPHA VALUE 1, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.84016 FOR FIRST LAYER 10, ALPHA VALUE 0.01, ACTIVATION FUNCTION tanh
25000




ACCURACY IS 0.8418800000000001 FOR FIRST LAYER 10, ALPHA VALUE 0.01, ACTIVATION FUNCTION relu
25000




ACCURACY IS 0.84052 FOR FIRST LAYER 10, ALPHA VALUE 0.01, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8428799999999999 FOR FIRST LAYER 10, ALPHA VALUE 0.1, ACTIVATION FUNCTION tanh
25000
ACCURACY IS 0.84016 FOR FIRST LAYER 10, ALPHA VALUE 0.1, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.8093999999999999 FOR FIRST LAYER 10, ALPHA VALUE 0.1, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8354799999999999 FOR FIRST LAYER 10, ALPHA VALUE 1, ACTIVATION FUNCTION tanh
25000
ACCURACY IS 0.83476 FOR FIRST LAYER 10, ALPHA VALUE 1, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.49440000000000006 FOR FIRST LAYER 10, ALPHA VALUE 1, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8402 FOR FIRST LAYER 15, ALPHA VALUE 0.01, ACTIVATION FUNCTION tanh
25000




ACCURACY IS 0.8411199999999999 FOR FIRST LAYER 15, ALPHA VALUE 0.01, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.84128 FOR FIRST LAYER 15, ALPHA VALUE 0.01, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8407199999999999 FOR FIRST LAYER 15, ALPHA VALUE 0.1, ACTIVATION FUNCTION tanh
25000




ACCURACY IS 0.8453999999999999 FOR FIRST LAYER 15, ALPHA VALUE 0.1, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.80404 FOR FIRST LAYER 15, ALPHA VALUE 0.1, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8346 FOR FIRST LAYER 15, ALPHA VALUE 1, ACTIVATION FUNCTION tanh
25000
ACCURACY IS 0.8311200000000001 FOR FIRST LAYER 15, ALPHA VALUE 1, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.4973599999999999 FOR FIRST LAYER 15, ALPHA VALUE 1, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8420400000000001 FOR FIRST LAYER 20, ALPHA VALUE 0.01, ACTIVATION FUNCTION tanh
25000




ACCURACY IS 0.8398399999999999 FOR FIRST LAYER 20, ALPHA VALUE 0.01, ACTIVATION FUNCTION relu
25000




ACCURACY IS 0.842 FOR FIRST LAYER 20, ALPHA VALUE 0.01, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.8398 FOR FIRST LAYER 20, ALPHA VALUE 0.1, ACTIVATION FUNCTION tanh
25000




ACCURACY IS 0.84368 FOR FIRST LAYER 20, ALPHA VALUE 0.1, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.8386800000000001 FOR FIRST LAYER 20, ALPHA VALUE 0.1, ACTIVATION FUNCTION logistic
25000
ACCURACY IS 0.82944 FOR FIRST LAYER 20, ALPHA VALUE 1, ACTIVATION FUNCTION tanh
25000
ACCURACY IS 0.83536 FOR FIRST LAYER 20, ALPHA VALUE 1, ACTIVATION FUNCTION relu
25000
ACCURACY IS 0.49832 FOR FIRST LAYER 20, ALPHA VALUE 1, ACTIVATION FUNCTION logistic


In [14]:
# Final model, trained on the full training set with the combination that
# scored highest in the recorded grid-search output (first layer 15,
# alpha 0.1, relu: about 0.8454 cross-validated accuracy).
model = train_mlp(
    vectors,
    first_layer=15,
    alpha_value=0.1,
    activation_function='relu',
)

In [15]:
# Labelled features for the held-out reviews in test/pos and test/neg.
test_features = get_all_word2vec_vectors(
    pos_dir='test/pos/',
    neg_dir='test/neg/',
    word2vec=vec_file,
)

In [16]:
# Accuracy of the final model on the held-out test features.
test_scores = test_mlp(model=model, test=test_features)
print(test_scores)

0.83724


In [12]:
import pickle

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
with open('model.sav', 'wb') as model_out:
    pickle.dump(model, model_out)