In [1]:
from keras.layers import Input, Dense, TimeDistributed, Embedding
from keras.layers import Concatenate, Reshape, Lambda, Multiply, multiply, concatenate
from keras.models import Model
from keras import backend as K

import os

import tensorflow as tf
import numpy as np

# load data
# make sure that the first shape is the IMDB training data. 

def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytest.pickle')

def load_unigrams(path, X, y):
    word_list = []
    connotation = {}
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            word_list.append(line.strip())
            
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(X):
            if word in doc.lower():
                if (y[i] == 1):
                    pos_count += 1
                else:
                    neg_count += 1
                    
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return word_list, connotation

def generate_appearance(X_train_corpus, X_test_corpus, word_list, connotation):
    y_train_agreement = []
    for i in range(len(X_train_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_train_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_train_agreement.append(doc_agreement)
        
    y_test_agreement = []
    for i in range(len(X_test_corpus)):
        doc_agreement = []
        for word in word_list:
            if word in X_test_corpus[i]:
                if connotation[word] == 1:
                    doc_agreement.append(1)
                else:
                    doc_agreement.append(-1)
            else:
                doc_agreement.append(0)
        y_test_agreement.append(doc_agreement)
        
    return np.array(y_train_agreement), np.array(y_test_agreement)

# 'imdb-unigrams.txt'

####################################################################

# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

######################################################################



Using TensorFlow backend.


In [2]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [3]:
words = cv.get_feature_names()

In [9]:
np.where(words == word_list[0])

AttributeError: 'tuple' object has no attribute 'shape'

In [8]:
for word in words:
    if word == word_list[0]:
        print(word)

1/10


In [12]:
words = np.asarray(words)

In [13]:
words.shape

(3686,)

In [14]:
word_list = np.asarray(word_list)

In [15]:
word_list.shape

(83,)

In [17]:
np.where((words == word_list[0]) == True)

(array([3], dtype=int64),)

In [73]:
term_list = dict()

In [74]:
for word in word_list:
    if len(np.where((words==word)==True)[0]) == 1:
        term_list[word] = np.where((words==word)==True)[0][0]
    else:
        term_list[word] = None

In [88]:
X_train[:,term_list['1/10']]

<25000x1 sparse matrix of type '<class 'numpy.int64'>'
	with 242 stored elements in Compressed Sparse Row format>

In [None]:
def load_unigrams(path, X, y, cv):
    human_term_list = []
    connotation = {}
    term_index = dict()
    vocab = np.asarray(cv.get_feature_names())
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            human_term_list.append(line.strip())
    
    human_term_list = np.asarray(human_term_list)
    
    for word in human_term_list:
        if len(np.where((vocab==word)==True)[0]) == 1:
            term_index[word] = np.where((words==word)==True)[0][0]
        else:
            term_index[word] = None
            connotation[word] = None
            continue
            
        pos_count = 0
        neg_count = 0
        
        for i,doc in enumerate(X):
            if doc[0,term_index[word]] == 1:
                if y[i] == 1:
                    pos_count += 1
                else:
                    neg_count += 1
        
        if pos_count > neg_count:
            connotation[word] = 1
        else:
            connotation[word] = 0
    
    return human_term_list, connotation

In [None]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train, y_train_original, cv)