## Part 2 - Federated Learning NLP 
### Split Daily Mail data into topics


In [1]:
import numpy as np
import json
import pickle

import os
#os.chdir("../..")
!!pwd

['/Users/austin.bellibm.com/Documents/FederatedLearning/Part 3 - Applied NLP']

In [9]:
# read in daily mail data - using prepocessed versions
with open('./RawData/data/train.json') as f:
    train = [json.loads(line) for line in f]
    train = np.array(train)
    train_docs, train_labels = zip(*[(line['doc'], line['labels']) for line in train])
    
with open('./RawData/data/test.json') as f:
    test = [json.loads(line) for line in f]
    test = np.array(test)
    test_docs, test_labels = zip(*[(line['doc'], line['labels']) for line in test])
    
# read in nb-svm model
with open('./Models/20ng_topicModel', 'rb') as model:
    clf20 = pickle.load(model)

In [10]:
len(train), len(test)

(4000, 2000)

In [12]:
# The downloaded dataset contains wrongly encoded characters which appear as "â\x80\x98" - I will remove these
train_docs = list(map(lambda train: train.encode('ascii', errors='ignore'), train_docs))
test_docs = list(map(lambda test: test.encode('ascii', errors = 'ignore'), test_docs))

b'with the trademark curly hair and giant personality , it is difficult for jones to blend into the background\nthe uninformed supporter still stops the welsh prop in the street to commiserate with him on his country s latest defeat , unaware that he is no longer in the squad\njones has been a household feature in wales teams over the last decade and is one of the most recognisable and popular figures to wear the dragon red jersey\nthe 33 - year - old has experienced some tough autumn evenings at the millennium stadium , but none quite as difficult as this november , having been left out of the squad for the first time in his career\njonathan davies ( left ) , jones ( centre ) and ian evans sing the welsh national anthem wales coach warren gatland has opted for young prop samson lee over veteran tighhead jones jones ( right ) moved to the cardiff blues from the ospreys during the summer the frontrower admits it has been  tough  and  odd  as an outsider looking in , but has still not gi

Run Topic model on Daily Mail Data (need to do for both train and test - turn into a function to save memory)

In [8]:
def get_labels(clf20, docs, score):
    # get predictions
    preds = clf20.run_nbsvm(docs) # all topics

    # identify max
    max_labels, threshold_vec = clf20.select_max_labels(preds, ret_score_threshold = True, score_threshold = score)

    # convert labels that do not meet threshold to other category
    max_labels = np.where(threshold_vec == 0, len(clf20.topics)-1, max_labels)
    
    return max_labels

train_max_labels = get_labels(clf20, train_docs, .5)
test_max_labels = get_labels(clf20, test_docs, .5)


...Generating TF-IDF Matrices...


KeyboardInterrupt: 

In [None]:
# let's see some of our topics
uniques, counts = np.unique(test_max_labels, return_counts=True)
print(dict(zip(uniques, counts)))

# example of categorized topic
idx13 = np.where(test_max_labels == 13)[0][0]
print(idx13)
print(clf20.topics[13])
print(test_docs[int(idx13)])
print("\n")

idx10 = np.where(test_max_labels == 17)[0][0]
print(clf20.topics[17])
print(test_docs[idx10])

Split and Separate the train and test data into their respective folder

In [None]:
split = {'Vendor':[0,5], # Aggregator will also include Other - 20
         'Buyer1':[6,11],
         'Buyer2':[12,19]}

def split_parties(data, max_labels, split):
    # vendor
    vendor_vec = np.where((max_labels >= split['Vendor'][0]) & (max_labels <= split['Vendor'][1]) | 
                          max_labels == len(clf20.topics)-1) # Other
    vendor = data[vendor_vec]
    
    # buyers
    buyer1_vec = np.where((max_labels >= split['Buyer1'][0]) & (max_labels <= split['Buyer1'][1]))
    buyer1 = data[buyer1_vec]
    
    buyer2_vec = np.where((max_labels >= split['Buyer2'][0]) & (max_labels <= split['Buyer2'][1]))
    buyer2 = data[buyer2_vec]
    
    return vendor, buyer1, buyer2

train_vendor, train_buyer1, train_buyer2 = split_parties(train, train_max_labels, split)
test_vendor, test_buyer1, test_buyer2 = split_parties(test, test_max_labels, split)

print(len(train_vendor), len(train_buyer1), len(train_buyer2))
print(len(test_vendor), len(test_buyer1), len(test_buyer2))

In [None]:
# export data
with open('./Vendor/Test_Vendor', 'wb') as f:
    pickle.dump(test_vendor, f)
with open('./Vendor/Train_Vendor', 'wb') as f:
    pickle.dump(train_vendor, f)


with open('./Buyer1/Test_Buyer1', 'wb') as f:
    pickle.dump(test_buyer1, f)
with open('./Buyer1/Train_Buyer1', 'wb') as f:
    pickle.dump(train_buyer1, f)
    
with open('./Buyer2/Test_Buyer2', 'wb') as f:
    pickle.dump(test_buyer2, f)
with open('./Buyer2/Train_Buyer2', 'wb') as f:
    pickle.dump(train_buyer2, f)