In [1]:
########################################################
# this imports all the functions in logistic regression
# you should be able to run this cell at any time to 
# "reload" the functions
########################################################

from logistic_regression import *

In [2]:
########################################################
# Load the training set and turn it into a feature
# matrix plus integer labels (spam -> 1, ham -> 0).
# All the heavy lifting is done by helpers that come
# from the logistic_regression star import.
########################################################

# If the stopword corpus cannot be found, download it once with:
# import nltk
# nltk.download('stopwords')

train_data = 'train.csv'
# test_data = 'test.csv'
stop_words = stopwords.words('english')

messages, text_labels = read_spam_data(train_data)
all_train_data = create_train_data(messages, stop_words)
int_labels = create_spam_ham_labels(text_labels, spam=1, ham=0)
print(all_train_data.shape)

# Sanity check: the feature matrix must have exactly one row per
# message and per label, otherwise rows and labels are misaligned.
for per_message in (messages, int_labels, text_labels):
    assert all_train_data.shape[0] == len(per_message)

# Width of the feature matrix; later used to size the weight vector.
n_features = all_train_data.shape[1]

(3000, 6179)


In [3]:
########################################################
# Hyper parameters for the regression.
########################################################

# Passed straight through to logistic_regression() on every step.
eta_0 = 0.1
alpha = 0.9

# The regularizer values (lambdas) form a geometric grid:
# lambda_base ** k for every integer k from lambda_exp_min up to and
# including lambda_exp_max.
lambda_base = 8 #np.e
lambda_exp_min = -4
lambda_exp_max = 1

# l is the lambda (regularizer)
list_of_lambdas = [
    pow(lambda_base, exponent)
    for exponent in range(lambda_exp_min, lambda_exp_max + 1)
] #np.linspace(0, .07, 5)
print(list_of_lambdas)

[0.000244140625, 0.001953125, 0.015625, 0.125, 1, 8]


In [4]:
########################################################
# divide the data into a training part and a held-out
# validation part.
#
# NOTE: this is a single fixed hold-out split, not the
# 10-fold cross validation that was originally planned.
########################################################

# TODO: replace the fixed split below with 10-fold cross validation

X = all_train_data

# Rows [0, n_train) are used for fitting, rows [n_train, end) for
# validation. Keeping the boundary in one place guarantees that the
# features and the labels are always cut at the same index.
n_train = 2000

train = X[:n_train]
train_labels = int_labels[:n_train]
val = X[n_train:]
val_labels = int_labels[n_train:]

# Fail early if the data set is too small to leave any validation rows,
# and make sure no label was dropped or duplicated by the split.
assert len(val_labels) > 0, "validation split is empty; lower n_train"
assert len(train_labels) + len(val_labels) == len(int_labels)

In [5]:
########################################################
# run with the current set of lambdas to test
#
# For every lambda: draw a fresh random weight vector,
# call logistic_regression() n_steps times (feeding the
# returned weights back in each time) and remember the
# lowest validation loss seen along the way.
########################################################
import sys
import traceback

n_steps = 321     # steps per lambda, t = 0 .. 320
log_every = 16    # print the losses every this many steps

# l and t are pre-set so the except handler can still report them if the
# failure happens before either loop has assigned a value.
l, t = None, None
# One [lambda, lowest validation loss] pair per lambda that finished.
bests = []
try:
    for l in list_of_lambdas:
        print("\nLAMBDA: {}".format(l))
        # NOTE(review): no random seed is set in this cell, so unless one is
        # set elsewhere the starting weights (and the results) vary per run.
        weights = np.random.normal(0, 0.2, n_features)
        # Start from +inf rather than an arbitrary number: any finite loss is
        # then recorded, and a run that only ever produces NaN/inf losses is
        # reported as inf instead of a made-up value.
        low_val_loss = float("inf")
        for t in range(n_steps):
            weights = logistic_regression(train, train_labels, weights, l, alpha, eta_0, t)
            val_loss = square_loss(val, val_labels, weights=weights)
            if t % log_every == 0:
                # The training loss is only needed for the progress line, so
                # it is computed on logged steps only.
                train_loss = square_loss(train, train_labels, weights=weights)
                print("{}:\t#{}\ttrain {}  \t\tvalidate {}".format(l, t, train_loss, val_loss))
            # NaN compares False here, so a diverged step never replaces the best.
            if val_loss < low_val_loss:
                low_val_loss = val_loss
        bests.append([l, low_val_loss])
except Exception as e:
    # Report which lambda/step failed on stderr, then show the traceback;
    # the summary below is still printed for the lambdas that completed.
    print("\n{} #{}: {}".format(l, t, e), file=sys.stderr)
    traceback.print_exc()

print("\nBest Values: ")
for best in bests:
    print("\tLambda:{}\tError:{}".format(best[0], best[1]))
print("Overall Best:")
# Last expression of the cell, so the notebook displays it. Guarded because
# min() raises ValueError on an empty list (e.g. if the first lambda failed).
min(bests, key=lambda x: x[1]) if bests else None


LAMBDA: 0.000244140625
0.000244140625:	#0	train 250.48026711675016  		validate 126.40091458659214
0.000244140625:	#16	train 2.8068480093914543  		validate 13.82272832135285
0.000244140625:	#32	train 0.8384750328269709  		validate 11.993340714386788
0.000244140625:	#48	train 0.6768217916261067  		validate 11.431372739486017
0.000244140625:	#64	train 0.6444004183551983  		validate 11.16495495444631
0.000244140625:	#80	train 0.6343724518098279  		validate 11.013565044162604
0.000244140625:	#96	train 0.6303723237392387  		validate 10.918903861893746
0.000244140625:	#112	train 0.6284866233204475  		validate 10.856349756350882
0.000244140625:	#128	train 0.6274876196739667  		validate 10.813646555633442
0.000244140625:	#144	train 0.6269118704887062  		validate 10.783958357867306
0.000244140625:	#160	train 0.6265590720886458  		validate 10.763134032953237
0.000244140625:	#176	train 0.6263331227333511  		validate 10.748480280851402
0.000244140625:	#192	train 0.6261838694430802  		validate 10.7

  new_weights = weights*(1-(eta*l)) - (eta*np.matmul(np.array(y_hat-labels), inputs))


8:	#208	train nan  		validate nan
8:	#224	train nan  		validate nan
8:	#240	train nan  		validate nan
8:	#256	train nan  		validate nan
8:	#272	train nan  		validate nan
8:	#288	train nan  		validate nan
8:	#304	train nan  		validate nan
8:	#320	train nan  		validate nan

Best Values: 
	Lambda:0.000244140625	Error:10.710101796153054
	Lambda:0.001953125	Error:11.145609911811919
	Lambda:0.015625	Error:10.566790054302702
	Lambda:0.125	Error:18.168692686506464
	Lambda:1	Error:37.88205385617447
	Lambda:8	Error:72.59097630096764
Overall Best:


[0.015625, 10.566790054302702]