In [1]:
import numpy as np
import sys
import csv
import re
import scipy.sparse as sp
# method = sys.argv[1]
# learning_rate = int(sys.argv[2])
# iterations = int(sys.argv[3])
# batch_size = int(sys.argv[4])
# training_data_path = sys.argv[5]
# vocabulary_path = sys.argv[6]
# testing_data_path = sys.argv[7]
# output_path = sys.argv[8]

method = 1
learning_rate = 0.1
iterations = 100
batch_size = 128
training_data_path = '../data/imdb_train.csv'
vocabulary_path = '../data/imdb_vocab'
testing_data_path = '../data/imdb_test.csv'
output_path = 'out/imdb_output_a.txt'

In [62]:
def load(data_path,vocab_map,num_features):
    Y = []
    S = []
    regex = re.compile('[^a-zA-Z ]')
    with open(data_path, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        spamreader = list(spamreader)
        n = len(spamreader)
        S = sp.dok_matrix((n,num_features), dtype=np.int8)
        #X = np.zeros((n,num_features))
        Y = np.zeros((n,1))
        i = 0
        for row in spamreader:
            cleaned_text = regex.sub('',row[1])
            words = cleaned_text.split(' ')
            S[i,0] = 1
            for word in words:
                if word in vocab_map:
                    S[i,vocab_map[word]] += 1
            Y[i] = int(row[0])
            i += 1
    return S.tocsr(),Y

def load_data(data_path,vocab_map,num_features):
    X_sparse,Y = load(data_path,vocab_map,num_features)
    return X_sparse,Y

def load_vocab(data_path):
    vocab_map = {}
    with open(data_path, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')        
        i = 0
        for row in spamreader:
            if row[0] not in vocab_map:
                vocab_map[row[0]] = i + 1
                i += 1
    return vocab_map, i+1

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(t * -1))

def get_log_likelihood(W,X,Y):
    X_W = X.dot(W)
    Predictions = sigmoid(X_W)
    return np.sum(Y.T.dot(np.log(Predictions)) + (1-Y.T).dot(np.log(1-Predictions)))/Y.shape[0]

def get_accuracy(W,X,Y):
    X_W = X.dot(W)
    Predictions = np.rint(sigmoid(X_W))
    return np.sum(Predictions == Y)/Y.shape[0]


def get_optimal_learning_rate(W,X,Y):
    return 0.01

def train_model(X,Y,iterations,learning_rate,lam,learning_rate_mode = 0):
    m = X.shape[1]
    n= X.shape[0]
    W = np.zeros((m,1))
    XT = X.transpose()
    for i in range(iterations):
        g_val = sigmoid(X.dot(W))
#         print(get_accuracy(W,X,Y))
        if (learning_rate_mode == 0):
            W = W + (XT.dot(Y-g_val) - W.dot(lam)).dot(learning_rate).dot(1/n)
        elif (learning_rate_mode == 1):
            W = W + (XT.dot(Y-g_val) - W.dot(lam)).dot(learning_rate / np.sqrt(i+1)).dot(1/n)
        else:
            lr = get_optimal_learning_rate(W,X,Y)
            W = W + (XT.dot(Y-g_val) - W.dot(lam)).dot(lr).dot(1/n)
    return W

def kFold_cross_validation(X,Y,lambdas,folds,iterations,learning_rate,learning_rate_mode):
    
    fold_size = int(X.shape[0]/folds)
    
    sums = []
    for lam in lambdas:
        sums.append(0.0)
        
    for i in range(folds):
        X_test = X[i*fold_size:(i+1)*fold_size]
        X_train = sp.vstack((X[:i*fold_size],X[(i+1)*fold_size:]))
        Y_test = Y[i*fold_size:(i+1)*fold_size]
        Y_train = np.vstack((Y[:i*fold_size],Y[(i+1)*fold_size:]))
        for i in range(len(lambdas)):
            W= train_model(X_train,Y_train,iterations,learning_rate,lambdas[i],learning_rate_mode)
#             sums[i] += get_log_likelihood(W,X_test,Y_test)
            sums[i] += get_accuracy(W,X_test,Y_test)
        print(sums)
    for i in range(0,len(sums)):
        sums[i] /= folds
    return sums

In [3]:
vocab_map, m = load_vocab(vocabulary_path)

In [4]:
# print(vocab_map)
# print(m)

In [5]:
X,Y = load_data(testing_data_path,vocab_map,m)

In [6]:
#print(X_train_sparse)
#print(Y_train)

In [7]:
print(X.shape)

(25000, 89528)


In [8]:
print(Y.shape)

(25000, 1)


In [63]:
# W = train_model(X,Y,iterations,learning_rate,0.01)
lambdas = [0,1,5,10,50,100,500,1000]
folds = 10
# W = train_model(X,Y,iterations,learning_rate,0.01,1)
accuracy = kFold_cross_validation(X,Y,lambdas,10,iterations,learning_rate,1)

[0.7228, 0.7228, 0.7228, 0.7228, 0.7224, 0.7224, 0.7228, 0.722]
[1.4716, 1.4716, 1.4716, 1.4716, 1.4712, 1.4712, 1.4712, 1.4704]
[2.2044, 2.2044, 2.2044, 2.2044, 2.204, 2.204, 2.2036000000000002, 2.2032]
[2.9468, 2.9468, 2.9468, 2.9468, 2.9464, 2.9464, 2.9464, 2.9452]
[3.6932, 3.6932, 3.6932, 3.6932, 3.6928, 3.6928, 3.6928, 3.69]
[4.4315999999999995, 4.4315999999999995, 4.4315999999999995, 4.4315999999999995, 4.4312000000000005, 4.4312000000000005, 4.430400000000001, 4.4276]
[5.17, 5.17, 5.17, 5.17, 5.169600000000001, 5.170000000000001, 5.1684, 5.1644]
[5.9064, 5.9064, 5.9064, 5.9064, 5.906000000000001, 5.9064000000000005, 5.9048, 5.8999999999999995]
[6.652799999999999, 6.652799999999999, 6.652799999999999, 6.652799999999999, 6.6524, 6.652800000000001, 6.6508, 6.6464]
[7.381599999999999, 7.381599999999999, 7.381599999999999, 7.381999999999999, 7.3816, 7.382000000000001, 7.38, 7.3748]


In [64]:
max = accuracy[0]
lam = lambdas[0]

for i in range (0,len(lambdas)):
    if (errors[i] > max):
        lam = lambdas[i]
        max = accuracy[i]
        
print(lam)

100
