In [1]:
import numpy as np
import sys
import csv
import re
import scipy.sparse as sp
# method = sys.argv[1]
# learning_rate = int(sys.argv[2])
# iterations = int(sys.argv[3])
# batch_size = int(sys.argv[4])
# training_data_path = sys.argv[5]
# vocabulary_path = sys.argv[6]
# testing_data_path = sys.argv[7]
# output_path = sys.argv[8]

method = 1
learning_rate = 0.1
iterations = 100
batch_size = 128
training_data_path = '../data/imdb_train.csv'
vocabulary_path = '../data/imdb_vocab'
testing_data_path = '../data/imdb_test.csv'
output_path = 'out/imdb_output_a.txt'

In [90]:
def load(data_path,vocab_map,num_features):
    Y = []
    S = []
    regex = re.compile('[^a-zA-Z ]')
    with open(data_path, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        spamreader = list(spamreader)
        n = len(spamreader)
        S = sp.dok_matrix((n,num_features), dtype=np.int8)
        #X = np.zeros((n,num_features))
        Y = np.zeros((n,1))
        i = 0
        for row in spamreader:
            cleaned_text = regex.sub('',row[1])
            words = cleaned_text.split(' ')
            S[i,0] = 1
            for word in words:
                if word in vocab_map:
                    S[i,vocab_map[word]] += 1
            Y[i] = int(row[0])
            i += 1
    return S.tocsr(),Y

def load_data(data_path,vocab_map,num_features):
    X_sparse,Y = load(data_path,vocab_map,num_features)
    return X_sparse,Y

def load_vocab(data_path):
    vocab_map = {}
    with open(data_path, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')        
        i = 0
        for row in spamreader:
            if row[0] not in vocab_map:
                vocab_map[row[0]] = i + 1
                i += 1
    return vocab_map, i+1

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(t * -1))

def get_log_likelihood(W,X,Y):
    X_W = X.dot(W)
    Predictions = sigmoid(X_W)
    return np.sum(Y.T.dot(np.log(Predictions)) + (1-Y.T).dot(np.log(1-Predictions)))

def get_optimal_learning_rate(W,X,Y):
    return 0.01

def train_model(X,Y,iterations,learning_rate,lam,learning_rate_mode = 0):
    m = X.shape[1]
    n= X.shape[0]
    W = np.zeros((m,1))
    XT = X.transpose()
    for i in range(iterations):
        g_val = sigmoid(X.dot(W))
        print(np.sum(Y-g_val))
#         print(get_log_likelihood(W,X,Y))
        if (learning_rate_mode == 0):
            W = W + (XT.dot(Y-g_val) - W.dot(lam)).dot(learning_rate).dot(1/n)
        elif (learning_rate_mode == 1):
            W = W + (XT.dot(Y-g_val) - W.dot(lam)).dot(learning_rate / np.sqrt(i+1)).dot(1/n)
        else:
            lr = get_optimal_learning_rate(W,X,Y)
            W = W + (XT.dot(Y-g_val) - W.dot(lam)).dot(lr).dot(1/n)
    return W

def kFold_cross_validation(X,Y,lambdas,folds,iterations,learning_rate,learning_rate_mode):
    
    fold_size = int(X.shape[0]/folds)
    
    sums = []
    
    CV_test_X, CV_train_X = X[:fold_size,:].copy(), X[fold_size:,:].copy()
    CV_test_Y, CV_train_Y = Y[:fold_size,:].copy(), Y[fold_size:,:].copy()
#     CV_test_X, CV_train_X = np.split(X.copy(), [fold_size], axis=0)
#     CV_test_Y, CV_train_Y = np.split(Y.copy(), [fold_size], axis=0)
    
#     print(CV_test_X,CV_test_Y)
    for lam in lambdas:
        W= train_model(CV_train_X,CV_train_Y,iterations,learning_rate,learning_rate_mode)
        sums.append(get_log_likelihood(W,CV_test_X,CV_test_Y))

    for i in range (0,folds-1):
        if i==folds-2:
            CV_train_X[i*fold_size:(i+1)*fold_size],CV_test_X = CV_test_X, CV_train_X[i*fold_size:]
            CV_train_Y[i*fold_size:(i+1)*fold_size],CV_test_Y = CV_test_Y, CV_train_Y[i*fold_size:]
        else:
            CV_train_X[i*fold_size:(i+1)*fold_size],CV_test_X = CV_test_X, CV_train_X[i*fold_size:(i+1)*fold_size]
            CV_train_Y[i*fold_size:(i+1)*fold_size],CV_test_Y = CV_test_Y, CV_train_Y[i*fold_size:(i+1)*fold_size]

        for i in range(0, len(lambdas)):
            W= train_model(CV_train_X,CV_train_Y,iterations,learning_rate,lam,learning_rate_mode)
            sums[i] += get_log_likelihood(W,CV_test_X,CV_test_Y)
#         print(sums)
    for i in range(0,len(sums)):
        sums[i] /= folds
    return sums


In [3]:
vocab_map, m = load_vocab(vocabulary_path)

In [4]:
# print(vocab_map)
# print(m)

In [5]:
X,Y = load_data(testing_data_path,vocab_map,m)

In [6]:
#print(X_train_sparse)
#print(Y_train)

In [7]:
print(X.shape)

(25000, 89528)


In [8]:
print(Y.shape)

(25000, 1)


In [None]:
# W = train_model(X,Y,iterations,learning_rate,0.01)
lambdas = [0,0.01,0.1,0.5,1,10,100]
folds = 10
# W = train_model(X,Y,iterations,learning_rate,0.01,1)
print(kFold_cross_validation(X,Y,lambdas,10,iterations,learning_rate,1))

31.0
-1955.4555847472616
9880.566218402428
-11071.69099246346
10000.544399961542
-11059.528778932403
10038.694041940922
-11045.481864385361
10040.248422416611
-11027.935707784296
10024.136410251296
-11005.224659979744
10001.39836846183
-10975.375105426287
9979.130845845102
-10936.021400962196
9960.741517238077
-10884.550326565986
9945.762465883869
-10818.53083311071
9930.853828722917
-10736.317488448436
9911.81433018949
-10637.481942215083
9885.33306142874
-10522.75997001286
9849.582327064832
-10393.880206826469
9804.180864944454
-10253.587940928777
9750.072961519198
-10105.051998380328
9688.58035775926
-9951.229118404928
9620.560490311109
-9794.903434530519
9546.833454116146
-9639.973837665417
9471.180391583814
-9489.68779046196
9395.43967044013
-9346.549030331764
9320.500771382161
-9212.154374037185
9247.212252449244
-9087.359317687635
9176.367449732967
-8972.471843264455
9108.55225635289
-8867.370592295338
9044.006432046326
-8771.648766805964
8982.78084391449
-8684.783909430982
8925

-8972.471843264455
9108.55225635289
-8867.370592295338
9044.006432046326
-8771.648766805964
8982.78084391449
-8684.783909430982
8925.00786502442
-8606.236942602502
8870.948821616985
-8535.44420019093
8820.808911320672
-8471.779144836499
8774.589316460504
-8414.543998811412
8732.053529979281
-8362.995419871251
8692.781118375471
-8316.391450838835
8656.275457981388
-8274.0396690522
8622.058420540794
-8235.326507786835
8589.716708715234
-8199.724306719207
8558.914574803817
-8166.786350217435
8529.398427621436
-8136.140549975209
8501.004146974205
-8107.486207429654
8473.658333473886
-8080.592775919005
8447.355592475815
-8055.295040323594
8422.108106074107
-8031.472920462727
8397.892691191526
-8009.012132498319
8374.629352662176
-7987.771101283077
8352.195736908463
-7967.579762448489
8330.453302088132
-7948.260255644244
8309.266424082565
-7929.646841168369
8288.512126000296
-7911.59640202429
8268.084619816931
-7893.991775169037
8247.898272296137
-7876.741742250286
31.0
-1955.4555847472616
9