In [None]:
# This code is based on https://github.com/roeeaharoni/unsupervised-domain-clusters,
# the implementation accompanying the paper "Unsupervised Domain Clusters in Pretrained Language Models" (https://arxiv.org/pdf/2004.02105.pdf).

# Creating pseudo-labels in an unsupervised manner

This notebook performs the following steps:

- Read text files that represent different users

- Encode each line in the text files with a pretrained model (e.g., RoBERTa)

- Cluster and visualize the encodings (using a GMM)

- Save pseudo-labels

In [None]:
# HF version 3.0.2
# PT version 1.6.0
import torch
import time
from transformers import *
import numpy as np
from collections import Counter

# (model class, tokenizer class, pretrained checkpoint name) triples.
# encode_with_transformers walks this list and encodes with every entry whose
# checkpoint name was requested, so each checkpoint is listed exactly once
# (a repeated entry would make a selected model load and encode twice).
MODELS = [
        (RobertaModel, RobertaTokenizer, 'roberta-base'),
        (RobertaModel, RobertaTokenizer, 'roberta-large'),
        (BertModel, BertTokenizer, 'bert-base-uncased'),
        (BertModel, BertTokenizer, 'bert-large-cased'),
        (BertModel, BertTokenizer, 'bert-base-multilingual-cased'),
        (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
        (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
        (GPT2Model, GPT2Tokenizer, 'gpt2'),
        (CTRLModel,       CTRLTokenizer,       'ctrl'),
        (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
        (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
        (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
        (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
        (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-large')
    ]


def encode_with_transformers(corpus, models_to_use = ['roberta-large']):
    """
    Encodes the corpus using the models in models_to_use.

    Each sentence is tokenized (special tokens added, truncated to at most
    300 tokens), run through the model without gradients, and represented by
    the average of the last hidden states over all of its tokens.

    Args:
        corpus: iterable of sentences (strings); iterated once per selected model.
        models_to_use: checkpoint names to encode with; only names that also
            appear in MODELS are used.

    Returns:
        A dictionary from model name to a dictionary with two entries:
        'sents', the list of encoded sentences in corpus order, and
        'states', a numpy array with one average-pooled vector per sentence
        (row i belongs to sents[i]).
    """
    # Use the GPU when one is present, otherwise fall back to the CPU instead of crashing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_to_states = {}
    for model_class, tokenizer_class, model_name in MODELS:
        # Skip models that were not requested, and never encode the same
        # checkpoint twice even if MODELS happens to list it more than once.
        if model_name not in models_to_use or model_name in model_to_states:
            continue
        print('encoding with {}...'.format(model_name))

        # Load pretrained model/tokenizer
        tokenizer = tokenizer_class.from_pretrained(model_name)
        model = model_class.from_pretrained(model_name)
        model.to(device)
        model.eval()  # inference only: make sure dropout is disabled

        sents = []
        states = []

        # Encode text
        start = time.time()
        with torch.no_grad():
            for sentence in corpus:
                # add_special_tokens inserts [CLS], [SEP], <s>... in the right way for
                # each model; truncation is requested explicitly so that max_length
                # is applied without relying on the tokenizer's deprecated fallback.
                token_ids = tokenizer.encode(sentence, add_special_tokens=True,
                                             max_length=300, truncation=True)
                input_ids = torch.tensor([token_ids], device=device)

                last_hidden_states = model(input_ids)[0]

                # avg pool last hidden layer over the token axis (batch size is 1)
                avg_pooled = last_hidden_states.squeeze(dim=0).mean(dim=0)

                sents.append(sentence)
                states.append(avg_pooled.cpu().numpy())

        end = time.time()
        print('encoded with {} in {} seconds'.format(model_name, end - start))

        if states:
            stacked = np.stack(states)
        else:
            # Empty corpus: return an empty matrix rather than failing in np.stack.
            # NOTE(review): assumes the model config exposes `hidden_size`; falls back to 0 columns otherwise.
            stacked = np.empty((0, getattr(model.config, 'hidden_size', 0)), dtype=np.float32)

        model_to_states[model_name] = {'states': stacked, 'sents': sents}
    return model_to_states

## Plot GMM Clusters

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np
import gensim

from sklearn import datasets
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA

# Echo the docstring of the current (interactive) namespace.
print(__doc__)

# HSV colormap; fit_gmm samples it to colour its scatter plots.
hsv = plt.get_cmap('hsv')

# Module-level list of named colours (fit_gmm builds its own local `colors`).
colors = [
    'red',
    'green',
    'blue',
    'yellow',
    'purple',
    'orange',
    'pink',
    'brown',
    'gray',
    'black',
]

import pdb
def fit_gmm(name_to_embeddings, class_names, first_principal_component_shown=0, 
            last_principal_component_shown=1, clusters=5, header='', plot=True, pca=True, examples_per_class = 388):
    """
    Fits a GMM to the embeddings in name_to_embeddings where each name represents a dataset.

    The embeddings of all classes are concatenated in the order of class_names,
    optionally reduced with PCA, and clustered with a full-covariance Gaussian
    mixture that is trained and evaluated on the same data (unsupervised setting).

    Args:
        name_to_embeddings: dict from class name to a dict holding 'states'
            (array with one embedding per row) and 'sents' (the matching sentences).
        class_names: names of the classes (e.g., users) to include, in output order.
        first_principal_component_shown: index of the first PCA component that is kept.
        last_principal_component_shown: index of the last PCA component that is kept
            (inclusive); must be larger than first_principal_component_shown.
        clusters: number of GMM components; if not positive, the number of
            distinct classes is used instead.
        header: title put on the figures.
        plot: if true, draw two scatter plots over the first two kept dimensions,
            one coloured by class and one coloured by predicted cluster.
        pca: if true, apply PCA before clustering; otherwise cluster the raw embeddings.
        examples_per_class: unused; kept so that existing callers keep working.

    Returns:
        Array with the predicted cluster id of every sentence, ordered class by
        class following class_names.

    Raises:
        Exception: if last_principal_component_shown <= first_principal_component_shown.
    """
    if last_principal_component_shown <= first_principal_component_shown:
        raise Exception('first PCA component must be smaller than the 2nd')

    num_classes = len(class_names) # number of domains (e.g., users)

    # Concatenate the data to one matrix (row order follows class_names)
    all_states = [name_to_embeddings[label]['states'] for label in class_names]
    concat_all_embs = np.concatenate(all_states)

    # Compute PCA and keep only the requested range of components
    if pca:
        reducer = PCA(n_components=1+last_principal_component_shown)
        kept_components = list(range(first_principal_component_shown, last_principal_component_shown+1))
        pca_data = reducer.fit_transform(concat_all_embs)[:, kept_components]
    else:
        pca_data = concat_all_embs

    # Class index of every row: class i is repeated once per sentence it owns
    sents_per_class = [len(name_to_embeddings[label]['sents']) for label in class_names]
    pca_labels = np.repeat(np.arange(num_classes), sents_per_class)

    # Number of clusters: the argument if positive, else the number of classes that have data
    n_classes = len(np.unique(pca_labels))
    n_clusters = clusters if clusters > 0 else n_classes

    # Can try GMMs using different types of covariances, we use full.
    # Do not split the data - train=test=all (unsupervised evaluation)
    estimator = GaussianMixture(n_components=n_clusters, covariance_type='full',
                                max_iter=150, random_state=0)
    estimator.fit(pca_data)
    y_train_pred = estimator.predict(pca_data)

    if plot:
        cmap = plt.get_cmap('hsv')
        # One colour per class and one per cluster. The palettes are sized
        # independently so that every class is drawn even when there are more
        # classes than clusters; endpoint=False keeps the first and last colour
        # apart because the hsv map wraps around to its starting hue.
        class_colors = cmap(np.linspace(0, 1.0, num_classes, endpoint=False))
        cluster_colors = cmap(np.linspace(0, 1.0, n_clusters, endpoint=False))

        # Figure 1: points coloured by their class
        plt.figure(figsize=(8, 8))
        plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, left=.01, right=.99)
        plt.subplot(1, 1, 1)
        for n, color in enumerate(class_colors):
            data = pca_data[pca_labels == n]
            plt.scatter(data[:, 0], data[:, 1], s=20, marker='o', color=color, 
                        label=class_names[n].replace('_dev',''), alpha = 0.3)
        plt.suptitle(header)

        # Figure 2: points coloured by their predicted cluster
        plt.figure(figsize=(8, 8))
        plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, left=.01, right=.99)
        plt.subplot(1, 1, 1)
        for n, color in enumerate(cluster_colors):
            data = pca_data[y_train_pred == n]
            plt.scatter(data[:, 0], data[:, 1], s=20, marker='o', color=color, 
                        alpha = 0.3)
        plt.suptitle(header)
        plt.show()

    return y_train_pred

## Encode each user

In [None]:
from collections import defaultdict
from collections import Counter


# Expected directory layout, e.g.:
#/home/users/user_109/train.src.tok
#/home/users/user_109/train.mt.tok
#/home/users/user_109/train.pe.tok
#/home/users/user_109/valid.*
#/home/users/user_109/test.*

###############
#CHANGE PATH
###############
base_path_new = 'PATH_FOR_USERS'

users = [
    'user_109', 'user_129', 'user_150', 'user_159', 'user_29', 'user_50', 'user_76', 'user_95',
    'user_120', 'user_130', 'user_151', 'user_160', 'user_34', 'user_56', 'user_81', 'user_97',
    'user_121', 'user_144', 'user_152', 'user_18', 'user_43', 'user_59', 'user_84',
    'user_122', 'user_148', 'user_156', 'user_19', 'user_44', 'user_64', 'user_94',
]

# Training source file of every user, keyed by user name (insertion order = order of `users`)
file_paths_new = {}

for user in users:
    file_paths_new[user] = base_path_new + '/' + user + "/train.src.tok"

models_to_use = ['roberta-large']

# model name -> user (domain) name -> {'states': ..., 'sents': ...}
model_to_domain_to_encodings_new = defaultdict(dict)
for domain_name in file_paths_new:
    print('encoding {} with transformers...'.format(domain_name))
    file_path = file_paths_new[domain_name]
    # Read the file once and close it right away
    with open(file_path) as f:
        lines = f.readlines()
    # Show the most frequent (duplicated) lines of this user
    counter = Counter(lines)
    print(counter.most_common(10))
    print('found {} lines'.format(len(lines)))
    res = encode_with_transformers(lines, models_to_use)
    for model_name in models_to_use:
        model_to_domain_to_encodings_new[model_name][domain_name] = res[model_name]

In [None]:
# Clustering configuration
domains = users
first_principal = 1
last_principal = 50
num_clusters = 10
num_experiments = 1
use_pca = True

model_to_accuracies = defaultdict(list)
for i in range(num_experiments):
    # Figures are drawn for the final experiment only
    plot = (i == num_experiments - 1)

    for model_name in model_to_domain_to_encodings_new:
        y_train_pred = fit_gmm(
            model_to_domain_to_encodings_new[model_name],
            domains,
            first_principal_component_shown=first_principal,
            last_principal_component_shown=last_principal,
            clusters=num_clusters,
            header=model_name,
            plot=plot,
            pca=use_pca,
            examples_per_class=388,
        )

# Save pseudo-labels

In [None]:
# y_train_pred holds one cluster id per sentence, concatenated domain by domain.
# Cut it back into per-domain pieces and write each piece next to its source file.
sent_by_domains = {}
accumulate_sent = 0  # offset of the current domain's first sentence in y_train_pred
num_sent = 0         # number of sentences of the current domain
for domain_name in file_paths_new:
    file_path = file_paths_new[domain_name]
    with open(file_path) as f:
        lines = f.readlines()
    # Move the offset past the previous domain before measuring the current one
    accumulate_sent += num_sent
    num_sent = len(lines)
    with open(base_path_new+'/'+domain_name+'/train_domain.10', 'w') as f:
        for label in y_train_pred[accumulate_sent:accumulate_sent+num_sent]:
            f.write(str(label)+'\n')

# The per-domain slices must cover the predictions exactly; otherwise the
# labels written above are not aligned with the sentences.
assert accumulate_sent + num_sent == len(y_train_pred), \
    'number of source lines does not match the number of predicted labels'

In [None]:
# For every user, write a file with one line per training sentence holding the user's index.
for user, domain_name in enumerate(file_paths_new):
    with open(base_path_new+'/'+domain_name+'/train.src.tok') as f:
        lines = f.readlines()
    num_sent = len(lines)
    with open(base_path_new+'/'+domain_name+'/train.USER', 'w') as f:
        f.write((str(user)+'\n') * num_sent)

In [None]:
# For every user, write a file with one line per validation sentence holding the user's index.
for user, domain_name in enumerate(file_paths_new):
    with open(base_path_new+'/'+domain_name+'/valid.src.tok') as f:
        lines = f.readlines()
    num_sent = len(lines)
    with open(base_path_new+'/'+domain_name+'/valid.USER', 'w') as f:
        f.write((str(user)+'\n') * num_sent)

In [None]:
# For every user, write a file with one line per test sentence holding the user's index.
for user, domain_name in enumerate(file_paths_new):
    with open(base_path_new+'/'+domain_name+'/test.src.tok') as f:
        lines = f.readlines()
    num_sent = len(lines)
    with open(base_path_new+'/'+domain_name+'/test.USER', 'w') as f:
        f.write((str(user)+'\n') * num_sent)

# Data shuffle and save for train, valid, and test

In [None]:
from sklearn.utils import shuffle
import pandas as pd

In [None]:
import unicodedata

In [None]:
# Per user: the lines of the training src/mt/pe files plus the pseudo-labels and user ids.
train_df = []
for user, domain_name in enumerate(file_paths_new):
    temp = {}
    for type_ in ['src','mt','pe']:
        with open(base_path_new+'/'+domain_name+'/train.' + type_ + '.tok') as f:
            temp[type_] = f.readlines()
    # Stored under 'domain_10', the key the later cell reads back
    with open(base_path_new+'/'+domain_name+'/train_domain.10') as f:
        temp['domain_10'] = f.readlines()
    with open(base_path_new+'/'+domain_name+'/train.USER') as f:
        temp['USER'] = f.readlines()
    train_df.append(temp)

In [None]:
# Per user: the lines of the validation src/mt/pe files plus the user ids.
valid_df = []
for user, domain_name in enumerate(file_paths_new):
    temp = {}
    for type_ in ['src','mt','pe']:
        with open(base_path_new+'/'+domain_name+'/valid.' + type_ + '.tok') as f:
            temp[type_] = f.readlines()
    with open(base_path_new+'/'+domain_name+'/valid.USER') as f:
        temp['USER'] = f.readlines()
    valid_df.append(temp)

In [None]:
# Per user: the lines of the test src/mt/pe files plus the user ids.
test_df = []
for user, domain_name in enumerate(file_paths_new):
    temp = {}
    for type_ in ['src','mt','pe']:
        with open(base_path_new+'/'+domain_name+'/test.' + type_ + '.tok') as f:
            temp[type_] = f.readlines()
    with open(base_path_new+'/'+domain_name+'/test.USER') as f:
        temp['USER'] = f.readlines()
    test_df.append(temp)

In [None]:
# Regroup the training data by field: one list per field, one entry (list of lines) per user.
train_src = [per_user['src'] for per_user in train_df]
train_mt = [per_user['mt'] for per_user in train_df]
train_pe = [per_user['pe'] for per_user in train_df]
train_10 = [per_user['domain_10'] for per_user in train_df]
train_user = [per_user['USER'] for per_user in train_df]

In [None]:
# Regroup the validation data by field: one list per field, one entry (list of lines) per user.
valid_src = [per_user['src'] for per_user in valid_df]
valid_mt = [per_user['mt'] for per_user in valid_df]
valid_pe = [per_user['pe'] for per_user in valid_df]
valid_user = [per_user['USER'] for per_user in valid_df]

In [None]:
# Regroup the test data by field: one list per field, one entry (list of lines) per user.
test_src = [per_user['src'] for per_user in test_df]
test_mt = [per_user['mt'] for per_user in test_df]
test_pe = [per_user['pe'] for per_user in test_df]
test_user = [per_user['USER'] for per_user in test_df]

In [None]:
def create_dataframe(data):
    """
    Stack a sequence of sub-lists into one DataFrame.

    Every sub-list becomes its own DataFrame, and the frames are concatenated
    row-wise in the given order with a fresh 0..n-1 index.

    Args:
        data: iterable of sub-lists (e.g., one list of lines per user).

    Returns:
        The concatenated DataFrame; an empty DataFrame if data has no sub-lists.
    """
    frames = [pd.DataFrame(sublist) for sublist in data]
    if not frames:
        # pd.concat refuses an empty list, so handle "no sub-lists" explicitly
        return pd.DataFrame()
    # A single concat over all frames avoids re-copying the accumulated rows on every step
    return pd.concat(frames, ignore_index=True)

In [None]:
def train_shuffle_and_clean(src,mt,pe,_10,USER, random_state=None):
    """
    Shuffle the training fields jointly so that their rows stay aligned.

    The five single-column frames are placed side by side, the rows are
    shuffled as a whole, and the result is split back into one frame per field.

    Args:
        src, mt, pe: frames with the source, MT and post-edited lines.
        _10: frame with the pseudo-label of every line.
        USER: frame with the user id of every line.
        random_state: optional seed forwarded to the shuffle, for a reproducible order.

    Returns:
        Tuple of five single-column DataFrames named 'src', 'mt', 'pe', '10' and
        'user', all in the same shuffled row order.
    """
    shuffle_ = pd.concat([src, mt, pe, _10, USER], axis=1)
    shuffle_ = shuffle(shuffle_, random_state=random_state)
    # The pseudo-label column is named '10', the name it is selected by below
    column_names = ['src', 'mt', 'pe', '10', 'user']
    shuffle_.columns = column_names
    return tuple(shuffle_[[name]] for name in column_names)

In [None]:
def shuffle_and_clean(src,mt,pe,USER, random_state=None):
    """
    Shuffle the validation/test fields jointly so that their rows stay aligned.

    The four single-column frames are placed side by side, the rows are
    shuffled as a whole, and the result is split back into one frame per field.

    Args:
        src, mt, pe: frames with the source, MT and post-edited lines.
        USER: frame with the user id of every line.
        random_state: optional seed forwarded to the shuffle, for a reproducible order.

    Returns:
        Tuple of four single-column DataFrames named 'src', 'mt', 'pe' and
        'user', all in the same shuffled row order.
    """
    shuffle_ = pd.concat([src, mt, pe, USER], axis=1)
    shuffle_ = shuffle(shuffle_, random_state=random_state)
    column_names = ['src', 'mt', 'pe', 'user']
    shuffle_.columns = column_names
    return tuple(shuffle_[[name]] for name in column_names)

In [None]:
# Flatten every per-user list of training lines into a single one-column DataFrame.
train_src, train_mt, train_pe, train_10, train_user = [
    create_dataframe(per_user_lines)
    for per_user_lines in (train_src, train_mt, train_pe, train_10, train_user)
]

In [None]:
# Flatten every per-user list of validation lines into a single one-column DataFrame.
valid_src, valid_mt, valid_pe, valid_user = [
    create_dataframe(per_user_lines)
    for per_user_lines in (valid_src, valid_mt, valid_pe, valid_user)
]

In [None]:
# Flatten every per-user list of test lines into a single one-column DataFrame.
test_src, test_mt, test_pe, test_user = [
    create_dataframe(per_user_lines)
    for per_user_lines in (test_src, test_mt, test_pe, test_user)
]

In [None]:
# Shuffle all training fields together so that their rows stay aligned.
(
    shuffle_train_src,
    shuffle_train_mt,
    shuffle_train_pe,
    shuffle_train_10,
    shuffle_train_user,
) = train_shuffle_and_clean(train_src, train_mt, train_pe, train_10, train_user)

In [None]:
# Shuffle all validation fields together so that their rows stay aligned.
(
    shuffle_valid_src,
    shuffle_valid_mt,
    shuffle_valid_pe,
    shuffle_valid_user,
) = shuffle_and_clean(valid_src, valid_mt, valid_pe, valid_user)

In [None]:
# Shuffle all test fields together so that their rows stay aligned.
(
    shuffle_test_src,
    shuffle_test_mt,
    shuffle_test_pe,
    shuffle_test_user,
) = shuffle_and_clean(test_src, test_mt, test_pe, test_user)

In [None]:
def save_data(data, path):
    """
    Write every cell of a DataFrame to a text file, one cell per line.

    Cells are written row by row after Unicode NFC normalization. A cell that
    does not already end with a newline gets one appended, so that it cannot
    be merged with the following cell (which would break the line-by-line
    alignment between the files saved for the same split).

    Args:
        data: DataFrame whose cells are strings.
        path: path of the file to (over)write.
    """
    with open(path, 'w') as f:
        for row in data.values:
            for sample in row:
                sample = unicodedata.normalize('NFC', sample)
                if not sample.endswith('\n'):
                    sample += '\n'
                f.write(sample)

In [None]:
# Output location prefix. The file names are appended to it by plain string
# concatenation (e.g. save_path + 'train.src'), so include a trailing path
# separator if this is meant to be a directory.
save_path = 'WHERE_TO_SAVE'

In [None]:
# Write the shuffled training fields, one output file per field.
for _part, _file_name in [
    (shuffle_train_src, 'train.src'),
    (shuffle_train_mt, 'train.mt.tok'),
    (shuffle_train_pe, 'train.pe.tok'),
    (shuffle_train_10, 'train.10'),
    (shuffle_train_user, 'train.USER'),
]:
    save_data(_part, save_path + _file_name)

In [None]:
# Write the shuffled validation fields, one output file per field.
# Pseudo-labels are not used during validation and test, but a 'valid.10'
# file is still written as dummy data (a copy of the user ids) so that the
# downstream code can treat every split the same way.
for _part, _file_name in [
    (shuffle_valid_src, 'valid.src'),
    (shuffle_valid_mt, 'valid.mt.tok'),
    (shuffle_valid_pe, 'valid.pe.tok'),
    (shuffle_valid_user, 'valid.10'),
    (shuffle_valid_user, 'valid.USER'),
]:
    save_data(_part, save_path + _file_name)

In [None]:
save_data(shuffle_test_src, save_path + 'test.src')
save_data(shuffle_test_mt, save_path + 'test.mt.tok')
save_data(shuffle_test_pe, save_path + 'test.pe.tok')
#########
# Although we don't use pseudo-labels during the validation and test phases,
# we need these as dummy data so that we can simplify the code.
# The dummy labels are taken from the test user ids so that 'test.10' has
# exactly one line per test sentence, in the same shuffled order.
save_data(shuffle_test_user, save_path + 'test.10')
#########
save_data(shuffle_test_user, save_path + 'test.USER')