In [1]:
########################################################
# this imports all the functions in logistic regression
# you should be able to run this cell at any time to 
# "reload" the functions
########################################################

from logistic_regression import *

In [2]:
########################################################
# prepare and load the training data.  this involves
# reading in the data and finding the best features
########################################################

# if the stopwords corpus can't be found, download it with:
# import nltk
# nltk.download('stopwords')

# inputs: the training file and the English stop-word list
train_data = 'train.csv'
# test_data = 'test.csv'
stop_words = stopwords.words('english')

# parse the raw file, then derive the 2-D feature array and the 1/0 labels
messages, text_labels = read_spam_data(train_data)
all_train_data = create_train_data(messages, stop_words)
int_labels = create_spam_ham_labels(text_labels, spam=1, ham=0)

# dimensions reused by the later cells
n_messages, n_features = len(messages), all_train_data.shape[1]

# sanity check: every message must have exactly one feature row,
# one integer label and one text label
print(all_train_data.shape)
assert all_train_data.shape[0] == len(messages) == len(int_labels) == len(text_labels)

(3000, 6179)


In [3]:
########################################################
# this is the definition of the hyper parameters for
# the regression
########################################################

# regularisation strengths to sweep: lambda_base raised to every integer
# exponent from lambda_exp_min to lambda_exp_max (both inclusive)
lambda_base = 8 #np.e
lambda_exp_min, lambda_exp_max = -4, 1
list_of_lambdas = [
    pow(lambda_base, exponent)
    for exponent in range(lambda_exp_min, lambda_exp_max + 1)
]
lambda_report = "LAMBDAS:\n\tbase: {}   log_min: {}   log_max: {}\n\t{}"
print(lambda_report.format(lambda_base, lambda_exp_min, lambda_exp_max, list_of_lambdas))
del lambda_report

# eta_0 / alpha: same names and values as the run_regression defaults
# (presumably the initial step size and its decay exponent - TODO confirm
# against logistic_regression)
eta_0, alpha = 0.1, 0.9

LAMBDAS:
	base: 8   log_min: -4   log_max: 1
	[0.000244140625, 0.001953125, 0.015625, 0.125, 1, 8]


In [4]:
########################################################
# divide the data for 10-fold cross validation
########################################################

# identifiers: dictionary keys used for the pieces of each train/validate split
TRAIN_DATA = "t_data"
TRAIN_LABELS = "t_labels"
VALIDATE_DATA = "v_data"
VALIDATE_LABELS = "v_labels"

# prep
number_of_buckets = 10
# nominal bucket size; when n_messages is not a multiple of number_of_buckets
# some buckets hold one extra row so that no message is left out
size_of_bucket = n_messages // number_of_buckets
all_train_buckets = dict()

# divide into buckets
# Each bucket ends at a proportional boundary, so the buckets tile
# [0, n_messages) exactly.  (Using a fixed int(n_messages / number_of_buckets)
# stride would silently drop the trailing n_messages % number_of_buckets rows.)
idx = 0
for b in range(number_of_buckets):
    bucket_end = ((b + 1) * n_messages) // number_of_buckets
    data = all_train_data[idx:bucket_end]
    labels = int_labels[idx:bucket_end]
    all_train_buckets[b] = [data, labels]
    idx = bucket_end
    
    
# how to create train and validation data sets
def get_train_data_set(idx):
    """Build one train/validate split from the buckets in ``all_train_buckets``.

    The bucket whose key equals ``idx`` becomes the validation set; every
    other bucket is stacked (in dictionary order) into the training set.

    Returns a dict keyed by TRAIN_DATA, TRAIN_LABELS, VALIDATE_DATA and
    VALIDATE_LABELS.  If no bucket key equals ``idx`` the two validation
    entries are None.
    """
    held_out_data = held_out_labels = None
    stacked_data = []
    stacked_labels = []

    for bucket_key, bucket in all_train_buckets.items():
        if bucket_key != idx:
            # bucket is [data, labels]
            stacked_data.append(bucket[0])
            stacked_labels.append(bucket[1])
            continue
        held_out_data, held_out_labels = bucket[0], bucket[1]

    split = dict()
    split[TRAIN_DATA] = np.vstack(stacked_data)      # rows of all training buckets
    split[TRAIN_LABELS] = np.hstack(stacked_labels)  # labels joined end to end
    split[VALIDATE_DATA] = held_out_data
    split[VALIDATE_LABELS] = held_out_labels
    return split

# one train/validate split per bucket, each bucket taking a turn as the
# validation set
all_training_datasets = list(map(get_train_data_set, range(number_of_buckets)))

# validation: right number of splits, and in every split both the train and
# the validate parts have n_features columns and one label per row
assert len(all_training_datasets) == number_of_buckets
for ds in all_training_datasets:
    train_part, validate_part = ds[TRAIN_DATA], ds[VALIDATE_DATA]
    assert train_part.shape[1] == n_features
    assert train_part.shape[0] == len(ds[TRAIN_LABELS])
    assert validate_part.shape[1] == n_features
    assert validate_part.shape[0] == len(ds[VALIDATE_LABELS])
del train_part, validate_part

In [5]:
########################################################
# definition of our regression function
########################################################


def run_regression(lamda, train, train_labels, validate, validate_labels, 
                   eta_0=0.1, alpha=0.9, iterations=321, verbose=False):
    """Train on one fold and return the lowest validation loss observed.

    Weights start as normal samples (mean 0, standard deviation 0.2), one per
    column of ``train``.  ``logistic_regression`` is then called
    ``iterations`` times, each call receiving the current weights and
    returning the next ones; after every call ``square_loss`` is evaluated on
    the validation set.

    Args:
        lamda: regularisation value forwarded to ``logistic_regression``.
        train, train_labels: training rows and their labels.
        validate, validate_labels: held-out rows and their labels.
        eta_0, alpha: forwarded unchanged to ``logistic_regression``
            (presumably the learning-rate schedule - confirm there).
        iterations: number of update steps.
        verbose: when True, print train/validate loss roughly 16 times
            over the run.

    Returns:
        The smallest validation loss seen (``sys.maxsize`` if no loss was
        ever below it), or ``False`` if any exception was raised; in that
        case the error and traceback are written to stderr.
    """
    import traceback

    # never let the interval reach 0: with fewer than 16 iterations the
    # verbose check below would otherwise take a modulo by zero
    report_frequency = max(1, int(iterations / 16.0))
    t = None

    # run regression
    try:
        # size the weights from the data itself rather than a notebook global
        weights = np.random.normal(0, 0.2, train.shape[1])
        low_val_loss = sys.maxsize
        for t in range(iterations):
            weights = logistic_regression(train, train_labels, weights, lamda, alpha, eta_0, t)
            val_loss = square_loss(validate, validate_labels, weights=weights)
            if verbose and t % report_frequency == 0:
                # the training loss is only needed for this report, so it is
                # not computed on the other iterations
                train_loss = square_loss(train, train_labels, weights=weights)
                print("{}:\t#{}\ttrain {}  \t\tvalidate {}".format(lamda, t, train_loss, val_loss))
            if val_loss < low_val_loss:
                low_val_loss = val_loss
    except Exception as e:
        # report with this call's own lambda, and on stderr
        print("\nlambda {} #{}: {}".format(lamda, t, e), file=sys.stderr)
        traceback.print_exc()
        return False

    # return best
    return low_val_loss

In [6]:
########################################################
# run on each of our k-folded datasets
########################################################

# prep
lambda_to_errors = dict()

# calculate for our lambdas
for l in list_of_lambdas:
    print("\nLAMBDA: {}\n\t".format(l),end='')
    errors = list()
    for dataset in all_training_datasets:
        e = run_regression(l, dataset[TRAIN_DATA], dataset[TRAIN_LABELS], 
                       dataset[VALIDATE_DATA], dataset[VALIDATE_LABELS])
        errors.append(e)
        print('.', end='')
    print("\n\terrors:    {}".format(errors))
    print("\terror avg: {}".format(np.mean(errors)))
    print("\terror std: {}".format(np.std(errors)))


LAMBDA: 0.000244140625
	..........
	errors:    [3.7355176893339728, 5.269783583164283, 4.558848168374628, 4.469990543346688, 5.077990762524534, 4.987165129508705, 3.0488114855260116, 3.410443527928658, 3.2449748800933036, 2.6816344710054514]
	error avg: 4.048516024080623
	error std: 0.8882010854248966

LAMBDA: 0.001953125
	..........
	errors:    [3.825441291364687, 5.349313735793666, 4.450388432297149, 4.3081310495184475, 5.086269106955666, 4.941075156542861, 3.135591986157624, 3.4347068019070974, 3.454821895318585, 2.7119899727013848]
	error avg: 4.0697729428557174
	error std: 0.8478667219485675

LAMBDA: 0.015625
	..........
	errors:    [3.4201302163685106, 4.841267378780757, 4.659595267762334, 4.482733399099036, 5.003941121606217, 4.845880996699068, 3.482296261424133, 2.6593485288962055, 1.972034617212043, 2.8702568342404122]
	error avg: 3.823748462208872
	error std: 1.0285509395842205

LAMBDA: 0.125
	..........
	errors:    [5.3748025811574704, 5.525629315928193, 5.74515952529075, 5

  new_weights = weights*(1-(eta*l)) - (eta*np.matmul(np.array(y_hat-labels), inputs))


..........
	errors:    [20.586098017995305, 19.729096328849796, 17.862322939955888, 17.475216469670702, 19.552802565609475, 18.682246593415172, 16.659846953034688, 18.852758454994675, 20.13153334733673, 16.0]
	error avg: 18.553192167086245
	error std: 1.4441299064518551
