### Please run this notebook in the Google Colab environment. In the Runtime menu, select "Run all"; once execution finishes, the notebook reports the accuracy of MVAE-BM on the TwitterHate dataset.

In [None]:
%tensorflow_version 1.x
import tensorflow as tf 
import numpy as np
from google_drive_downloader import GoogleDriveDownloader as gdd

# Download the tweets archive from Google Drive; unzip=True asks the downloader
# to extract it after the download completes.
gdd.download_file_from_google_drive(
    dest_path='./tweets.zip',
    file_id='1UcsXn-hO7APQpDWBcgCwbFbduUePd6e3',
    unzip=True,
)

TensorFlow 1.x selected.


In [None]:

from sklearn.model_selection import train_test_split

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which is
# only safe if the downloaded archive comes from a trusted source — confirm.
train = np.load("tweets.npy", allow_pickle=True)
labels = np.load("labels-tweets.npy", allow_pickle=True)

# Hold out 33% of the data for testing, then split 10% of what remains off as
# a validation set. Both splits use the same fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    train, labels, test_size=0.33, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Join the pieces of every sample with single spaces so that each sample
# becomes one string (presumably the samples are token lists — verify).
X_train = list(map(' '.join, X_train))
X_test = list(map(' '.join, X_test))
X_valid = list(map(' '.join, X_valid))


In [None]:
#@title Bag of Words Encode
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from random import shuffle

# Fit the vocabulary on the training split only, then reuse it for the
# validation and test splits so all three matrices share the same columns.
vectorizer = CountVectorizer()

trainX = vectorizer.fit_transform(X_train)
validX = vectorizer.transform(X_valid)
testX = vectorizer.transform(X_test)

# Term -> column-index mapping of the fitted vectorizer.
voc = vectorizer.vocabulary_

# Shuffled visiting order of the training rows, used to slice mini-batches.
# A seeded NumPy generator (same seed as the data splits) replaces the
# unseeded stdlib shuffle so that the batch order is reproducible across runs.
indices = np.arange(trainX.shape[0])  # one index per training row
np.random.RandomState(42).shuffle(indices)

print('train shape:',trainX.shape,"test shape:",testX.shape)

train shape: (10194, 17915) test shape: (5580, 17915)


In [None]:
# Model dimensions.
h_len = 2000  # size of h
cluster = 1  # size of c
c_s = 1000  # NOTE(review): not referenced in the visible cells — confirm it is still needed

# Optimisation settings.
epochs = 400
batch_size = 32
eta = 2.3e-4  # learning rate handed to the Adam optimizer

In [None]:
#@title MVAE-BM model Build
ep = 1e-20  # small constant added inside the inner log when the Gumbel noise is built
temp = 0.1  # temperature dividing the perturbed cluster scores before the softmax

# Bag-of-words input: any number of rows, one column per vocabulary term.
x = tf.placeholder(dtype=tf.float32, shape=(None, trainX.shape[1]))
# Scalar holding the number of rows in the current feed; it sizes the noise tensors.
batch = tf.placeholder(dtype=tf.int32, shape=[])


def encoder_h(x,name="classDecoder"):
    """Build the inference network for the continuous latent variable h.

    Two sigmoid hidden layers (1000 and 750 units) feed two linear heads of
    width ``h_len``. All variables are created inside the variable scope
    ``name``, matching ``encoder_c`` and ``decoder``.

    Args:
        x: float tensor with ``trainX.shape[1]`` columns (the bag-of-words input).
        name: variable scope under which the layer variables are created.

    Returns:
        Tuple ``(mu, log_sigma)`` holding the outputs of the two linear heads,
        each with ``h_len`` columns.
    """
    vocab_size = trainX.shape[1]

    with tf.variable_scope(name):
        # Hidden layer 1: vocab_size -> 1000. Bounds are given as (minval, maxval).
        W1 = tf.Variable(tf.random.uniform([vocab_size,1000],-0.05,0.05),name='W1',dtype=tf.float32)
        b1 = tf.Variable(tf.zeros([1000]),name='bias1',dtype=tf.float32)

        layer1 = tf.nn.sigmoid(tf.matmul(x,W1) + b1)

        # Hidden layer 2: 1000 -> 750.
        W2 = tf.Variable(tf.random.uniform([1000,750],-0.05,0.05),name='W2')
        b2 = tf.Variable(tf.zeros([750]),name='bias2',dtype=tf.float32)

        layer2 = tf.nn.sigmoid(tf.matmul(layer1,W2) + b2)

        # Linear head for mu.
        W3 = tf.Variable(tf.random.uniform([750,h_len],-0.05,0.05),name='WMu')
        b3 = tf.Variable(tf.zeros([h_len]),name='biasMu',dtype=tf.float32)

        # Linear head for log_sigma, zero-initialised so log_sigma starts at 0.
        W4 = tf.Variable(tf.zeros([750,h_len]),name='WSigma')
        b4 = tf.Variable(tf.zeros([h_len]),name='biasSigma',dtype=tf.float32)

        mu = tf.matmul(layer2,W3) + b3
        log_sigma = tf.matmul(layer2,W4) + b4

    return mu,log_sigma

def encoder_c(hs,name="C"):
    """Build the network that scores the clusters of the discrete variable c.

    A tanh hidden layer of 1000 units is followed by a tanh output layer with
    ``cluster`` units; every variable lives in the variable scope ``name``.

    Args:
        hs: float tensor with ``trainX.shape[1]`` columns.
        name: variable scope for the layer variables.

    Returns:
        Tensor with ``cluster`` columns. The values are tanh outputs, so they
        lie in [-1, 1].
    """
    hidden_units = 1000

    with tf.variable_scope(name):
        hidden_w = tf.Variable(
            tf.random.uniform([trainX.shape[1], hidden_units], -0.05, 0.05),
            name='W1',
        )
        hidden_b = tf.Variable(tf.zeros([hidden_units]), name='bias1', dtype=tf.float32)
        hidden = tf.nn.tanh(tf.matmul(hs, hidden_w) + hidden_b)

        out_w = tf.Variable(
            tf.random.uniform([hidden_units, cluster], -0.05, 0.05),
            name='W2',
        )
        out_b = tf.Variable(tf.zeros([cluster]), name='bias2', dtype=tf.float32)
        scores = tf.nn.tanh(tf.matmul(hidden, out_w) + out_b)

    return scores


def decoder(x,h,name='ClassEncoder'):
    """Build one decoder component: a log-softmax over the vocabulary.

    Args:
        x: bag-of-words tensor with ``trainX.shape[1]`` columns.
        h: latent tensor with ``h_len`` columns.
        name: variable scope for this component's variables.

    Returns:
        Tuple ``(un_probs, R, h_R)`` where ``h_R`` is ``log_softmax(h @ R + b)``,
        ``un_probs`` is the row-wise sum of ``h_R * x`` and ``R`` is the
        ``[h_len, trainX.shape[1]]`` projection variable.
    """
    vocab_size = trainX.shape[1]

    with tf.variable_scope(name):
        projection = tf.Variable(
            tf.random.uniform([h_len, vocab_size], -0.05, 0.05),
            name='RWord',
        )
        word_bias = tf.Variable(tf.zeros([vocab_size]), name='biasWord')

        # Log-probability of every vocabulary term given h.
        word_log_probs = tf.nn.log_softmax(tf.matmul(h, projection) + word_bias)

        # Weight each term's log-probability by its count and sum per document.
        doc_scores = tf.reduce_sum(tf.multiply(word_log_probs, x), axis=1)

    return doc_scores, projection, word_log_probs


# --- Continuous latent h: mu plus exp(log_sigma)-scaled standard normal noise ---
eps = tf.random_normal((batch,h_len), 0,1)
mu,log_sigma = encoder_h(x)
h = mu+tf.multiply(tf.exp(log_sigma),eps)

# --- Discrete latent c: relaxed sample via Gumbel noise and a tempered softmax ---
# -log(-log(U)) with U ~ Uniform(0, 1) is Gumbel noise; `ep` keeps the inner
# log away from log(0).
eps2 = tf.random_uniform((batch,cluster), 0,1)
eps2 = -tf.log(-tf.log(eps2+ep))
C = encoder_c(x)
C2 = C+eps2
C2 = tf.nn.softmax(C2/temp,axis=1)


# --- Mixture of decoders: one component per cluster ---
logits = []          # per-component document scores
word_embedding = []  # per-component projection variables R
topic = []           # per-component log-softmax over the vocabulary

for l in range(0,int(cluster)):

    un_probs,R,h_R = decoder(x,h,name="Class_"+str(l))

    logits.append(un_probs)
    word_embedding.append(R)
    topic.append(h_R)

# The list `logits` is converted to a [cluster, batch] tensor, weighted by the
# transposed cluster assignments and summed over the cluster axis.
decoder_probs = tf.multiply(logits,tf.transpose(C2))
decoder_probs = tf.reduce_sum(decoder_probs,axis=0)

# --- ELBO ---

# KL between softmax(C) and a uniform distribution over the clusters.
# The builtin float is used because the np.float alias was removed from NumPy.
q_c = tf.nn.softmax(C,axis=1)
kld_c = tf.reduce_sum(q_c*tf.log(q_c/(1.0/float(cluster))),axis=1)
# KL between N(mu, exp(log_sigma)^2) and the standard normal, summed over h.
kld_h = -0.5 * tf.reduce_sum(1 - tf.square(mu) + 2 * log_sigma - tf.exp(2 * log_sigma), 1)

ELBO = decoder_probs - kld_c - kld_h
# The optimizer minimises, so the batch-mean ELBO is negated.
mean_ELBO = -tf.reduce_mean(ELBO)

optimizer = tf.train.AdamOptimizer(learning_rate=eta)

step = optimizer.minimize(mean_ELBO)

sess = tf.Session()
print('Initializing...')

init = tf.global_variables_initializer()
sess.run(init)


Initializing...


In [None]:
#@title Train MVAE-BM
from IPython.display import HTML, display
import time

def progress(value, max=100):
    """Return an IPython ``HTML`` object that renders a full-width progress bar.

    Args:
        value: current progress; also emitted as the element's fallback text.
        max: value at which the bar is full. The name shadows the builtin but
            is kept so existing keyword callers continue to work.

    Returns:
        ``HTML`` wrapping a ``<progress>`` element.
    """
    # Attributes are separated by whitespace only; a comma after the max
    # attribute is not valid HTML.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

# Progress bar that is updated in place once per epoch.
out = display(progress(0, epochs), display_id=True)

# One [mean validation ELBO, 0] entry per epoch; the second slot is always 0.
keep_elbo_ppr = []

n_train = trainX.shape[0]

for ll in range(0,epochs):

    # Walk the shuffled row order in mini-batches. The last batch may be
    # smaller than batch_size, so the `batch` placeholder is fed the actual
    # number of rows and no training row is left out.
    for begin in range(0, n_train, batch_size):

        select = indices[begin:begin + batch_size]

        feed_dict = {x:trainX[select].toarray(),batch:len(select)}
        sess.run(step,feed_dict=feed_dict)

    # Per-document ELBO on the whole validation split.
    feed_dict = {x:validX.toarray(),batch:validX.shape[0]}
    elbo = sess.run(ELBO,feed_dict=feed_dict)

    # Drop the reference to the dense validation matrix so it can be freed;
    # this matters for large datasets.
    del feed_dict

    keep_elbo_ppr.append([np.mean(elbo),0])

    # ll + 1 epochs are complete, so the bar is full after the last epoch.
    out.update(progress(ll + 1, epochs))

In [None]:
#@title MVAE-BM Encode Train and Test
def average_word_embeddings(doc_term_matrix, cluster_ids, embeddings):
    """Average the embedding columns of the distinct terms of every document.

    Args:
        doc_term_matrix: sparse document-term count matrix, one row per document.
        cluster_ids: for every document, the index of the component whose
            embedding matrix is used.
        embeddings: sequence of arrays, one per component, whose columns are
            indexed by vocabulary column.

    Returns:
        2-D float64 array with one row per document. A document with no
        vocabulary terms gets an all-zero row.
    """
    averaged = []
    for row, cluster_id in zip(doc_term_matrix, cluster_ids):
        component = embeddings[cluster_id]
        # Columns with a non-zero count, i.e. the distinct terms of this document.
        term_cols = row.nonzero()[1]
        if term_cols.size > 0:
            rep = np.sum(component[:, term_cols], axis=1, dtype=np.float64) / float(term_cols.size)
        else:
            # Nothing to average: use a zero vector instead of a stale value.
            rep = np.zeros(component.shape[0])
        averaged.append(rep)
    return np.array(averaged)


# Encode both splits: relaxed cluster assignment, sampled h and its mean mu.
feed_dict = {x:trainX.toarray(),batch:trainX.shape[0]}
c_train,h_train,mu_train = sess.run([C2,h,mu],feed_dict=feed_dict)

feed_dict = {x:testX.toarray(),batch:testX.shape[0]}
c_test,h_test,mu_test = sess.run([C2,h,mu],feed_dict=feed_dict)

# The projection matrices are variables, so no input needs to be fed.
words_embedding = sess.run(word_embedding)

# Most likely component per document. Separate names keep the integer
# hyper-parameter `cluster` intact, so earlier cells can still be re-run.
cluster_train = np.argmax(c_train,axis=1)
cluster_test = np.argmax(c_test,axis=1)

r_train = average_word_embeddings(trainX, cluster_train, words_embedding)
r_test = average_word_embeddings(testX, cluster_test, words_embedding)

In [None]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
    

# The recorded run stopped at the solver's default iteration limit
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the budget is raised to give
# the solver room to converge.
LR_MAX_ITER = 1000

# Classifier on the mean of the continuous latent h.
clf = LR(max_iter=LR_MAX_ITER).fit(mu_train,y_train)
h_preds = clf.predict(mu_test)

# Classifier on the averaged word-embedding representation.
clf = LR(max_iter=LR_MAX_ITER).fit(r_train,y_train)
r_preds = clf.predict(r_test)

print ('Accuracy for H Representation:',accuracy_score(y_test,h_preds))
print ('Accuracy for merge R Representation:',accuracy_score(y_test,r_preds))
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy for H Representation: 0.8125448028673835
Accuracy for merge R Representation: 0.8130824372759856


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
