In [74]:
import numpy as np
from numpy.random import seed
import tensorflow as tf
import re


from tensorflow import set_random_seed
seed(42)
set_random_seed(42)

In [2]:
BATCH_SIZE = 64
LEARNING_RATE = 0.01
EPOCHS = 100

N_HIDDEN_HL1 = 10
RANDOM_STATE = 42

seed(RANDOM_STATE)
set_random_seed(RANDOM_STATE)

In [3]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

In [4]:
X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

### Preprocess

In [5]:
# Truncate the vector by len = 80
# k = 40

word_list = []
connotation = {}
path = r'./imdb-unigrams.txt'

with open(path, 'r', encoding='utf8') as f:
    for line in f:
        word_list.append(line.strip())


In [6]:
len(word_list)

83

In [7]:
len(X_train_original)

25000

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

token_pattern = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 5, token_pattern=token_pattern, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

In [9]:
# make sure all the 'human-term' exists

def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2)) 

words = intersection(cv.get_feature_names(), word_list)
len(words)

83

In [10]:
cv_ht = CountVectorizer(token_pattern=token_pattern, vocabulary=word_list)
X_train_ht = cv.fit_transform(X_train_original)

### Placeholder here

In [11]:
# calculate log-ratio

'''
Count the negative and positive frequency
'''
def negative_positive_counts(X, y, word_index):
    neg_count = np.sum(X[y==0, word_index])
    pos_count = np.sum(X[y==1, word_index])    
    return neg_count, pos_count

'''
Count the ratio : log(#pos/#neg)
'''
def log_ratio_positive_negative(X, y, word_index):
    neg_count, pos_count = negative_positive_counts(X,y, word_index)
    log_ratio = np.log(pos_count+1)-np.log(neg_count+1)
    return log_ratio, neg_count, pos_count

'''
Sort top words w.r.t log ratio and write into file
'''
def sort_top_words_with_count(X, y, words,filename, top_k=10):
    log_ratio = []
    neg_count = []
    pos_count = []
    
    for i in range(0,len(words)):
        log_ratio_, neg_count_, pos_count_ = log_ratio_positive_negative(X, y, i)
        log_ratio.append(log_ratio_)
        neg_count.append(neg_count_)
        pos_count.append(pos_count_)
    
    sorted_indices_descending_abs = np.argsort(np.absolute(log_ratio))[::-1]
    
    filename = filename + '.txt'
    with open(filename, mode='w', encoding='utf8') as w:
        for i in sorted_indices_descending_abs[: top_k]:
#             print("%s\t%0.2f" %(words[i], weights[i]))
#             n_p=negative_positive_counts(X, y, i)
            w.write("%s\t%0.2f\t%d\t%d" %(str(words[i]), log_ratio[i], pos_count[i], neg_count[i]))
            w.write('\n')
        w.close()

In [12]:
sort_top_words_with_count(X_train_ht, y_train_original, word_list, 'human-terms-log-ratio', top_k=len(words))

In [13]:
def load_list(filename, split_delimiter):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip().split(split_delimiter))
    return np.asarray(vocabulary)

log_ratio_list = load_list('human-terms-log-ratio.txt', '\t')

In [14]:
log_ratio_list

array([['2/10', '-3.87', '0', '47'],
       ['annoying', '-3.15', '9', '233'],
       ['badly', '-2.30', '0', '9'],
       ['best', '2.27', '328', '33'],
       ['funny', '1.99', '21', '2'],
       ['solid', '1.95', '6', '0'],
       ['waste', '1.95', '13', '1'],
       ['fantastic', '1.79', '5', '0'],
       ['awful', '-1.54', '2', '13'],
       ['subtle', '1.39', '27', '6'],
       ['8/10', '1.39', '7', '1'],
       ['obnoxious', '-1.25', '1', '6'],
       ['wasted', '1.25', '6', '1'],
       ['worse', '1.25', '6', '1'],
       ['1/10', '-1.15', '36', '116'],
       ['insult', '-1.05', '20', '59'],
       ['worst', '0.98', '7', '2'],
       ['6/10', '0.92', '14', '5'],
       ['7/10', '0.92', '4', '1'],
       ['brilliant', '-0.92', '1', '4'],
       ['forgettable', '-0.92', '1', '4'],
       ['refreshing', '0.92', '4', '1'],
       ['10/10', '0.92', '4', '1'],
       ['disappointing', '0.92', '4', '1'],
       ['unfortunately', '-0.92', '1', '4'],
       ['5/10', '0.88', '11', '4'],

In [15]:
X_tr = np.expand_dims(X_train, axis=0)
X_te = np.expand_dims(X_test, axis=0)

In [16]:
y_tr = np.reshape(y_train_original, (len(y_train_original), 1))
y_te = np.reshape(y_test_original, (len(y_test_original), 1))

In [17]:
y_ht_tr = np.zeros([X_train.shape[0], X_train.shape[1]])
y_ht_te = np.zeros([X_test.shape[0], X_train.shape[1]])

In [18]:
y_ht_tr.shape

(25000, 26266)

In [19]:
# maybe get intermediate layer of tanh h from Mitchell code.

def load_unigrams(path, X, y):
    word_list = []
    connotation = {}
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            word_list.append(line.strip())
            
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(X):
            if word in doc.lower():
                
                if (y[i] == 1):
                    pos_count += 1
                else:
                    neg_count += 1
                    
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return word_list, connotation

In [20]:
# get function index

vocabulary_ = cv.get_feature_names()
vocab_index = {}

for i,ht in enumerate(word_list):
    for j, voc in enumerate(vocabulary_):
        if voc == ht:
            vocab_index[ht] = j
            
inv_voc = {v: k for k, v in vocab_index.items()}

In [21]:
vocab_index

{'1/10': 11,
 '2/10': 185,
 '3/10': 225,
 '4/10': 250,
 '5/10': 268,
 '6/10': 285,
 '7/10': 302,
 '8/10': 320,
 '9/10': 338,
 '10/10': 18,
 'amazing': 1065,
 'annoying': 1262,
 'avoid': 1858,
 'awful': 1890,
 'bad': 1976,
 'badly': 1984,
 'beautiful': 2276,
 'beautifully': 2277,
 'best': 2474,
 'bland': 2656,
 'boring': 2942,
 'brilliant': 3096,
 'cheap': 3956,
 'disappointed': 6575,
 'disappointing': 6576,
 'disappointment': 6579,
 'dreadful': 7107,
 'dull': 7239,
 'enjoyable': 7836,
 'enjoyed': 7838,
 'excellent': 8205,
 'fails': 8481,
 'fantastic': 8551,
 'fascinating': 8589,
 'favorite': 8638,
 'forgettable': 9221,
 'fun': 9496,
 'funny': 9518,
 'funniest': 9516,
 'gem': 9709,
 'great': 10202,
 'horrible': 11280,
 'incredible': 11863,
 'insult': 12168,
 'lacks': 13259,
 'lame': 13292,
 'laughable': 13399,
 'lousy': 13996,
 'loved': 14007,
 'mediocre': 14778,
 'mess': 14905,
 'mst3k': 15498,
 'noir': 16018,
 'obnoxious': 16251,
 'pathetic': 17057,
 'perfect': 17246,
 'perfectly': 17

In [22]:
len(vocab_index)

83

In [23]:
def generate_appearance(X_train, X_test, word_index, connotation):
    y_train_agreement = []
    for i in range(X_train.shape[0]):
        doc_agreement = []
        for w,j in word_index.items():
            if X_train[i,j] == 1:
                if connotation[w] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_train_agreement.append(doc_agreement)
        
    y_test_agreement = []
    for i in range(X_test.shape[0]):
        doc_agreement = []
        for w,j in word_index.items():
            if X_test[i,j] == 1:
                if connotation[w] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_test_agreement.append(doc_agreement)
        
    return np.array(y_train_agreement), np.array(y_test_agreement)

In [24]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [25]:
y_train_agreement, y_test_agreement = generate_appearance(X_train, X_test, word_index=vocab_index, connotation=connotation)

In [42]:
pos_ratio = {}
for i in range(len(log_ratio_list)):
    pos_ratio[log_ratio_list[i, 0]] = float(log_ratio_list[i, 1])

In [78]:
def get_ht_sum(y_agreement):
    tr_ge2 = np.zeros(y_agreement.shape[0])

    #X_reject_indices = np.squeeze(np.where(np.sum(np.absolute(y_agreement), axis=1)==0))
    X_ge2_indices = np.squeeze(np.where(np.sum(np.absolute(y_agreement), axis=1)>1))
    X_1_indices = np.squeeze(np.where(np.sum(np.absolute(y_agreement), axis=1)==1))

    tr_ge2[X_ge2_indices] = 2
    tr_ge2[X_1_indices] = 1
    
    return tr_ge2

def where_sample_ht_index(y_agreement, pos_ratio, word_list):
    
    ht_ge_one = get_ht_sum(y_agreement)
    
    ht_sample_index = []
    
    for i,y in enumerate(ht_ge_one):
        if y == 2:
            indices = np.squeeze(np.where(y_agreement[i, :] != 0))

            list_ = []
            for j in indices:
                try:
                    list_.append(pos_ratio[word_list[j]])
                except KeyError:
                    continue

            list_ = np.asarray(np.absolute(list_))
            ht_sample_index.append(indices[np.argmax(list_)])
        elif y == 1:
            ht_sample_index.append(np.where(y_agreement[i,:] != 0)[0][0])
        else:
            ht_sample_index.append(-1)
            
    return np.asarray(ht_sample_index)

In [79]:
def generate_sequence_sample(X_, y_agreement, pos_ratio, word_list, token_pattern=r"(?u)\b[\w\'/]+\b", input_seq_k=40, output_seq_t=2):
    # get which human terms to be extracted
    
    ht_sample_index = where_sample_ht_index(y_agreement, 
                                            pos_ratio, 
                                            word_list)
    X_sample = []
    y_target = []

    for idx, doc in enumerate(X_):
        if ht_sample_index[idx] == -1:
    #         X_sample.append('--NONE--')
    #         y_target.append('--NONE--')
    # discard doc with no human-terms
            continue

        join = '  '
        target = '  ' 
        token = re.findall(token_pattern, doc)

        for i,tok in enumerate(token):

            if tok==word_list[ht_sample_index[idx]]:
                # check if the length of document less than k
                # then just use the whole document
                if len(token) < input_seq_k and len(token)>(2*output_seq_t+1):
                    join = ' '.join(token)
                    target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])
                    break

                # less than k, less than EOF
                elif i < input_seq_k-1 and i<len(token)-1-input_seq_k:
                    join = ' '.join(token[:i+input_seq_k+1])

                    # define target
                    if output_seq_t > i:
                        target = ' '.join(token[:i+output_seq_t+1])
                    else:
                        target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])

                    break
                # more than k, more than EOF
                elif i>input_seq_k-1 and i>=len(token)-1-input_seq_k:
                    join = ' '.join(token[i-input_seq_k:])

                    #define target
                    if output_seq_t >= len(token)-1-output_seq_t:
                        target = ' '.join(token[i-output_seq_t:])

                    else:
                        target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])

                    break
                else:
                    join = ' '.join(token[i-input_seq_k:i+input_seq_k+1])
                    target = ' '.join(token[i-output_seq_t:i+output_seq_t+1])
                    break

        X_sample.append(join)
        y_target.append(target)
    
    return X_sample, y_target

In [80]:
X_tr_sample, y_tr_target = generate_sequence_sample(X_train_original, y_train_agreement, pos_ratio, word_list)

In [81]:
X_te_sample, y_te_target = generate_sequence_sample(X_test_original, y_test_agreement, pos_ratio, word_list)

In [82]:
len(X_tr_sample)

22752

In [83]:
len(X_te_sample)

22701

### Sample generated
#### Test on Seq2Seq architecture. 

Implement first in Keras <br>
Preprocess the sequence using one-hot representation (omit the embedding for this stage) <br> <br>

train, val, test : 25%, 25%, 50% <br> <br>

<b>DO NOT MODIFY TEST SAMPLES</b>

In [86]:
# vectorize the data

# https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/

# count number of words with sets
# or simply use the tokenizer update in Keras

from keras.preprocessing.text import text_to_word_sequence, one_hot

test_result = text_to_word_sequence(X_tr_sample[0])

train_sequence = []
for sample in X_tr_sample:
    train_sequence.append(text_to_word_sequence(sample))
    

AttributeError: 'list' object has no attribute 'lower'

In [85]:
test_result

['even',
 'kind',
 'of',
 'a',
 'happy',
 'ending',
 'of',
 'sort',
 'whee',
 'a',
 'step',
 'up',
 'from',
 'part',
 '4',
 'but',
 'not',
 'much',
 'of',
 'one',
 'again',
 'brian',
 'yuzna',
 'is',
 'involved',
 'and',
 'screaming',
 'mad',
 'george',
 'so',
 'some',
 'decent',
 'special',
 'effect',
 'but',
 'not',
 'enough',
 'to',
 'make',
 'this',
 'great',
 'a',
 'few',
 'leftover',
 'from',
 'part',
 '4',
 'are',
 'hanging',
 'around',
 'too',
 'like',
 'clint',
 'howard',
 'and',
 'neith',
 'hunter',
 'but',
 'that',
 'does',
 'not',
 'really',
 'make',
 'any',
 'difference',
 'anyway',
 'i',
 'now',
 'have',
 'seeing',
 'the',
 'whole',
 'series',
 'out',
 'of',
 'my',
 'system',
 'now',
 'if',
 'i',
 'could']