In [11]:
import numpy as np
from collections import Counter
import random
import sys
from chp13 import *
np.random.seed(12345)

# dataset from http://www2.aueb.gr/users/ion/data/enron-spam/

import codecs
# Read the spam corpus, one email per line; bytes that are not valid UTF-8
# are silently dropped (errors='ignore').
with codecs.open('data/spam.txt', "r", encoding='utf-8', errors='ignore') as fdata:
    raw = fdata.readlines()

# Every distinct token seen in either corpus ends up in here.
vocab = set()

# Each email is reduced to the *set* of its space-separated tokens, so word
# order and repeat counts are discarded.
# NOTE(review): row[:-2] removes the last two characters of every line --
# presumably a two-character line terminator; confirm against the data file.
spam = [set(row[:-2].split(" ")) for row in raw]
for tokens in spam:
    vocab.update(tokens)
    
import codecs
# Read the ham (non-spam) corpus the same way the spam corpus was read.
with codecs.open('data/ham.txt', "r", encoding='utf-8', errors='ignore') as fdata:
    raw = fdata.readlines()

# One set of space-separated tokens per email.
# NOTE(review): row[:-2] removes the last two characters of every line --
# presumably a two-character line terminator; confirm against the data file.
ham = [set(row[:-2].split(" ")) for row in raw]
for tokens in ham:
    vocab.update(tokens)

# "<unk>" doubles as the padding token used by to_indices().
vocab.add("<unk>")

# Freeze the vocabulary into a list so every word gets a stable position,
# then build the word -> index lookup from those positions.
vocab = list(vocab)
w2i = {word: position for position, word in enumerate(vocab)}
    
def to_indices(input, l=500):
    """Turn token collections into fixed-length rows of vocabulary indices.

    Every entry of ``input`` that has fewer than ``l`` tokens is padded with
    "<unk>" until it holds exactly ``l`` tokens, then each token is looked up
    in the module-level ``w2i`` mapping.  Entries that already have ``l`` or
    more tokens are skipped (not truncated), so the result can contain fewer
    rows than ``input``.

    Args:
        input: iterable of sized token collections (the notebook passes sets).
        l: length of every returned row.

    Returns:
        A list of rows; each row is a list of ``l`` values taken from ``w2i``.

    Raises:
        KeyError: if a token has no entry in ``w2i``.
    """
    encoded = list()
    for tokens in input:
        missing = l - len(tokens)
        if missing <= 0:
            # At or over the target length: dropped so all rows stay l wide.
            continue
        padded = list(tokens) + ["<unk>"] * missing
        encoded.append([w2i[token] for token in padded])
    return encoded
            
# Number of examples per class held out (from the end of each list) for testing.
TEST_SIZE = 1000


def _interleave_classes(spam_rows, ham_rows):
    """Build a dataset that alternates spam (label [1]) and ham (label [0]) rows.

    The loop runs for as many steps as the longer of the two lists; the
    shorter list is cycled with a modulo index, so both classes contribute
    the same number of rows.  Rows are appended by reference, not copied.

    Args:
        spam_rows: list of index rows for the spam class.
        ham_rows: list of index rows for the ham class.

    Returns:
        (data, target): two lists of equal length, spam first in each pair.

    Raises:
        ZeroDivisionError: if exactly one of the two lists is empty.
    """
    data, target = list(), list()
    for i in range(max(len(spam_rows), len(ham_rows))):
        data.append(spam_rows[i % len(spam_rows)])
        target.append([1])

        data.append(ham_rows[i % len(ham_rows)])
        target.append([0])
    return data, target


spam_idx = to_indices(spam)
ham_idx = to_indices(ham)

# Everything except the last TEST_SIZE rows of each class is used for training.
train_spam_idx = spam_idx[0:-TEST_SIZE]
train_ham_idx = ham_idx[0:-TEST_SIZE]

test_spam_idx = spam_idx[-TEST_SIZE:]
test_ham_idx = ham_idx[-TEST_SIZE:]

train_data, train_target = _interleave_classes(train_spam_idx, train_ham_idx)
test_data, test_target = _interleave_classes(test_spam_idx, test_ham_idx)

In [12]:
def train(model, input_data, target_data, batch_size=500, iterations=5):
    """Fit ``model`` in place with mini-batch SGD on an MSE loss.

    A fresh ``MSELoss`` and ``SGD`` (alpha=0.01) are created on every call.
    Only whole batches are used: the trailing ``len(input_data) % batch_size``
    examples are never visited.  Before each batch the weight row of the
    "<unk>" padding token is multiplied by zero.  A running loss is written
    to stdout after every batch, followed by a newline per iteration.

    Args:
        model: object exposing ``weight.data``, ``get_parameters()`` and
            ``forward()`` (the notebook passes an ``Embedding``).
        input_data: list of index rows.
        target_data: list of targets aligned with ``input_data``.
        batch_size: number of examples per optimisation step.
        iterations: number of passes over the data.

    Returns:
        The same ``model`` object that was passed in.
    """
    criterion = MSELoss()
    optim = SGD(parameters=model.get_parameters(), alpha=0.01)

    bs = batch_size
    n_batches = int(len(input_data) / batch_size)

    for epoch in range(iterations):
        running_loss = 0
        for b_i in range(n_batches):
            lo = b_i * bs
            hi = lo + bs

            # padding token should stay at 0
            model.weight.data[w2i['<unk>']] *= 0

            batch_x = Tensor(input_data[lo:hi], autograd=True)
            batch_y = Tensor(target_data[lo:hi], autograd=True)

            # NOTE(review): .sum(1) presumably adds up the per-token
            # embeddings of each row before the sigmoid -- confirm in chp13.
            pred = model.forward(batch_x).sum(1).sigmoid()
            loss = criterion.forward(pred, batch_y)
            loss.backward()
            optim.step()

            running_loss += loss.data[0] / bs

            # "\r" rewrites the same console line with the mean loss so far.
            sys.stdout.write("\r\tLoss:" + str(running_loss / (b_i+1)))
        print()
    return model

def test(model, test_input, test_output):
    """Score ``model`` on a labelled set.

    The weight row of the "<unk>" padding token is multiplied by zero in
    place before predicting.  A prediction counts as positive when the
    sigmoid output is greater than 0.5.

    Args:
        model: object exposing ``weight.data`` and ``forward()``.
        test_input: list of index rows.
        test_output: list of targets aligned with ``test_input``.

    Returns:
        The mean of the element-wise comparison between thresholded
        predictions and targets (presumably a fraction in [0, 1] when the
        underlying data are numpy arrays -- confirm in chp13).
    """
    pad_row = w2i['<unk>']
    model.weight.data[pad_row] *= 0

    features = Tensor(test_input, autograd=True)
    labels = Tensor(test_output, autograd=True)

    probabilities = model.forward(features).sum(1).sigmoid()
    is_correct = (probabilities.data > 0.5) == labels.data
    return is_correct.mean()

In [13]:
# One scalar weight per vocabulary entry, all starting from zero.
model = Embedding(vocab_size=len(vocab), dim=1)
model.weight.data *= 0

# train() builds its own loss and optimiser on every call; these two
# module-level objects are not passed to it in this cell.
criterion = MSELoss()
optim = SGD(parameters=model.get_parameters(), alpha=0.01)

# Three single-iteration training passes, scoring the test set after each.
for epoch in range(3):
    model = train(model, train_data, train_target, iterations=1)
    accuracy = test(model, test_data, test_target) * 100
    print("% Correct on Test Set: " + str(accuracy))

	Loss:0.037140416860871446
% Correct on Test Set: 98.65
	Loss:0.011258669226059114
% Correct on Test Set: 99.15
	Loss:0.008068268387986223
% Correct on Test Set: 99.45


# Let's make it federated

In [17]:
# Split the training set into three contiguous shards, one per participant.
# Each shard is an (inputs, targets) pair; sue receives everything from
# position 2000 onward.
bob_rows = slice(0, 1000)
alice_rows = slice(1000, 2000)
sue_rows = slice(2000, None)

bob = (train_data[bob_rows], train_target[bob_rows])
alice = (train_data[alice_rows], train_target[alice_rows])
sue = (train_data[sue_rows], train_target[sue_rows])
import copy

In [18]:
# Three rounds of federated averaging: each participant trains a private deep
# copy of the shared model on its own shard, then the shared weights are
# replaced by the unweighted mean of the three trained copies.
for fed_round in range(3):
    print("Starting Training Round")

    print("\tStep 1: send the model to bob")
    bob_x, bob_y = bob
    bob_model = train(copy.deepcopy(model), bob_x, bob_y, iterations=1)

    print("\n\tStep 2: send the model to alice")
    alice_x, alice_y = alice
    alice_model = train(copy.deepcopy(model), alice_x, alice_y, iterations=1)

    print("\n\tStep 3: send the model to sue")
    sue_x, sue_y = sue
    sue_model = train(copy.deepcopy(model), sue_x, sue_y, iterations=1)

    print("\n\tAverage everyones new models")
    summed_weights = (bob_model.weight.data + alice_model.weight.data +
                      sue_model.weight.data)
    model.weight.data = summed_weights / 3

    accuracy = test(model, test_data, test_target) * 100
    print("\t% Correct on Test set: " + str(accuracy))

Starting Training Round
	Step 1: send the model to bob
	Loss:0.007924345075144994

	Step 2: send the model to alice
	Loss:0.007990960300900699

	Step 3: send the model to sue
	Loss:0.0063886639830050015

	Average everyones new models
	% Correct on Test set: 99.5
Starting Training Round
	Step 1: send the model to bob
	Loss:0.007331416633817908

	Step 2: send the model to alice
	Loss:0.007292494659027588

	Step 3: send the model to sue
	Loss:0.0060201301162778244

	Average everyones new models
	% Correct on Test set: 99.6
Starting Training Round
	Step 1: send the model to bob
	Loss:0.006884940668687032

	Step 2: send the model to alice
	Loss:0.006776291525625332

	Step 3: send the model to sue
	Loss:0.0056944688964336195

	Average everyones new models
	% Correct on Test set: 99.65
