## Part 2 - Federated Learning NLP 
### Split Daily Mail data into topics


In [1]:
import numpy as np
import json
import pickle

import os
#os.chdir("../..")
!!pwd

['/Users/austin.bellibm.com/Documents/FederatedLearning/Part 3 - Applied NLP']

In [13]:
# read in daily mail data - using prepocessed versions
with open('./RawData/data/train.json') as f:
    train = [json.loads(line) for line in f]
    train = np.array(train)
    train_docs, train_labels = zip(*[(line['doc'], line['labels']) for line in train])
    
with open('./RawData/data/test.json') as f:
    test = [json.loads(line) for line in f]
    test = np.array(test)
    test_docs, test_labels = zip(*[(line['doc'], line['labels']) for line in test])
    
# read in nb-svm model
with open('./Models/20ng_topicModel', 'rb') as model:
    clf20 = pickle.load(model)

In [14]:
len(train), len(test)

(193983, 10350)

In [15]:
# The downloaded dataset contains wrongly encoded characters which appear as "â\x80\x98" - I will remove these
train_docs = list(map(lambda train: train.encode('ascii', errors='ignore'), train_docs))
test_docs = list(map(lambda test: test.encode('ascii', errors = 'ignore'), test_docs))

Run Topic model on Daily Mail Data (need to do for both train and test - turn into a function to save memory)

In [16]:
def get_labels(clf20, docs, score):
    # get predictions
    preds = clf20.run_nbsvm(docs) # all topics

    # identify max
    max_labels, threshold_vec = clf20.select_max_labels(preds, ret_score_threshold = True, score_threshold = score)

    # convert labels that do not meet threshold to other category
    max_labels = np.where(threshold_vec == 0, len(clf20.topics)-1, max_labels)
    
    return max_labels

train_max_labels = get_labels(clf20, train_docs, .5)
test_max_labels = get_labels(clf20, test_docs, .5)


...Generating TF-IDF Matrices...

...Fitting models...
alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc

...Generating TF-IDF Matrices...

...Fitting models...
alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc


In [17]:
# let's see some of our topics
uniques, counts = np.unique(test_max_labels, return_counts=True)
print(dict(zip(uniques, counts)))

# example of categorized topic
idx13 = np.where(test_max_labels == 13)[0][0]
print(idx13)
print(clf20.topics[13])
print(test_docs[int(idx13)])
print("\n")

idx10 = np.where(test_max_labels == 17)[0][0]
print(clf20.topics[17])
print(test_docs[idx10])

{7: 96, 8: 17, 9: 63, 10: 312, 11: 6, 12: 2, 13: 372, 14: 97, 15: 32, 16: 71, 17: 29, 18: 35, 20: 9218}
17
sci.med
b"breast cancer is the second most common cause of death from cancer among american women the american spends $ 4billion a year on unnecessary medical costs due to mammograms generating false alarms and on treatment for breast tumors unlikely to cause problems , a new report has revealed\nthe study , published in the health affairs journal on monday , has estimated the figure for women aged 40 to 59\nit is made up of $ 2.8bn resulting from false - positive mammograms and another $ 1.2bn attributed to breast cancer overdiagnosis - treatment of tumors that grow slowly or not at all and are unlikely to develop into life - threatening disease\nbreast cancer is the second most common cause of death from cancer among american women , claiming nearly 41,000 lives a year\nannual mammograms starting at 40 years old have long been considered standard for preventive care , because ca

Split and Separate the train and test data into their respective folder

In [23]:
split = {'Vendor':[0,5], # Aggregator will also include Other - 20
         'Buyer1':[6,11],
         'Buyer2':[12,19]}

def split_parties(data, max_labels, split):
    # vendor
    vendor_vec = np.where((max_labels >= split['Vendor'][0]) & (max_labels <= split['Vendor'][1]) | 
                          max_labels == len(clf20.topics)-1) # Other
    vendor = data[vendor_vec]
    
    # buyers
    buyer1_vec = np.where((max_labels >= split['Buyer1'][0]) & (max_labels <= split['Buyer1'][1]))
    buyer1 = data[buyer1_vec]
    
    buyer2_vec = np.where((max_labels >= split['Buyer2'][0]) & (max_labels <= split['Buyer2'][1]))
    buyer2 = data[buyer2_vec]
    
    return vendor, buyer1, buyer2

train_vendor, train_buyer1, train_buyer2 = split_parties(train, train_max_labels, split)
test_vendor, test_buyer1, test_buyer2 = split_parties(test, test_max_labels, split)

print(len(train_vendor), len(train_buyer1), len(train_buyer2))
print(len(test_vendor), len(test_buyer1), len(test_buyer2))

173287 6373 14257
9218 494 638


In [24]:
# export data
with open('./Vendor/Test_Vendor', 'wb') as f:
    pickle.dump(test_vendor, f)
with open('./Vendor/Train_Vendor', 'wb') as f:
    pickle.dump(train_vendor, f)


with open('./Buyer1/Test_Buyer1', 'wb') as f:
    pickle.dump(test_buyer1, f)
with open('./Buyer1/Train_Buyer1', 'wb') as f:
    pickle.dump(train_buyer1, f)
    
with open('./Buyer2/Test_Buyer2', 'wb') as f:
    pickle.dump(test_buyer2, f)
with open('./Buyer2/Train_Buyer2', 'wb') as f:
    pickle.dump(train_buyer2, f)