# Polarity Classification

Movie Review Polarity Dataset (review polarity.tar.gz, 3MB). <br>
http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

# 1. CNN with Embedding Layer

## Make Vocabulary Set

In [1]:
from nltk.corpus import stopwords
import string
import re

def load_doc(filename):
    file = open(filename, 'r')
    text = file.read() # read all
    file.close()
    return text

def clean_doc_for_voca(doc):
    tokens = doc.split()
    
    # remove punctuations
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    
    # remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    
    # remove stop-words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    
    # remove non-freq words (주의: document범위내에서만으로 한정)
    tokens = [word for word in tokens if len(word) > 1]
    
    return tokens

### Explore a single example and cleaned one

In [2]:
# load the document
filename = 'txt_sentoken/pos/cv132_5618.txt'
text = load_doc(filename)
print(text)

quaid stars as a man who has taken up the proffesion of dragonslayer after he feels he is betrayed by a dragon early in the movie . 
he runs into the last dragon in existence , and there is a genuinely amusing battle between the two which results in a standoff where quaid is in the dragons mouth , but has his sword pointed at the dragons brain . 
eventually , they decide to call a truce , and they work out a deal . 
since he is the last dragon , he will pretend to die and quaid will be able to get paid for it . 
their scam works at first , until they come to a town without any money . 
instead the town sacrifices a girl to the dragon , but of course , draco is a nice droagon , so he won't eat her . there is however a very amusing scene where draco is hitting on the young girl . 
of course , as you can probably tell by the plot , this is a silly movie , but it does know when to take itself seriously at the right times , unlike eddie , which was serious all the time . 
you could probably

In [3]:
tokens = clean_doc_for_voca(text)
print(tokens)

['quaid', 'stars', 'man', 'taken', 'proffesion', 'dragonslayer', 'feels', 'betrayed', 'dragon', 'early', 'movie', 'runs', 'last', 'dragon', 'existence', 'genuinely', 'amusing', 'battle', 'two', 'results', 'standoff', 'quaid', 'dragons', 'mouth', 'sword', 'pointed', 'dragons', 'brain', 'eventually', 'decide', 'call', 'truce', 'work', 'deal', 'since', 'last', 'dragon', 'pretend', 'die', 'quaid', 'able', 'get', 'paid', 'scam', 'works', 'first', 'come', 'town', 'without', 'money', 'instead', 'town', 'sacrifices', 'girl', 'dragon', 'course', 'draco', 'nice', 'droagon', 'wont', 'eat', 'however', 'amusing', 'scene', 'draco', 'hitting', 'young', 'girl', 'course', 'probably', 'tell', 'plot', 'silly', 'movie', 'know', 'take', 'seriously', 'right', 'times', 'unlike', 'eddie', 'serious', 'time', 'could', 'probably', 'call', 'medieval', 'comedy', 'laughs', 'eddie', 'spy', 'hard', 'combined', 'dennis', 'quaid', 'makes', 'fine', 'hero', 'pete', 'posthlewaite', 'provides', 'ghreat', 'comedy', 'monk', 

In [4]:
from os import listdir

def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc_for_voca(doc)
    vocab.update(tokens) # vocab이 Counter()이므로.. (주의) Counter의 입력은 list형태이다.
    
def process_docs_for_voca(directory, vocab): # 데이터 파일이 폴더 안에 여러개가 있으므로...
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        add_doc_to_vocab(path, vocab)

document를 represent해야 되는데 vocabulary size를 어떻게 설정하느냐가 성능에 큰 영향을 끼친다 (최대한 상관성이 없는 단어들은 제외해야 한다) it is important to constrain the words to only those believed to be predictive. 

In [5]:
from collections import Counter

vocab = Counter()
# add all docs in the folder to vocab
process_docs_for_voca('txt_sentoken/pos', vocab)
process_docs_for_voca('txt_sentoken/neg', vocab)

print(len(vocab))
print(vocab.most_common(30))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238)]


data cleaning 파트에서는 document 범위내에서 한정해서 빈도수가 낮은 단어들을 삭제했지만, vocabulary를 구축하고 난 다음에는 전체 corpus 범위내에서 빈도수가 낮은 단어들을 삭제할 수 있다. (따라서 2가지 범위에 따라 빈도수가 낮은 단어들을 삭제할 수 있다)

In [6]:
min_occurance = 2 
tokens = [k for k,c in vocab.items() if c >= min_occurance]
print(len(tokens))

25767


In [7]:
def save_vocab(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

# save vocab
#save_vocab(tokens, 'voca.txt')

# 여기선 save하지않고 그대로 계속 이어가자
vocab = set(tokens)

## Train CNN

In [8]:
def clean_doc(doc, vocab):
    tokens = doc.split()
    # remove punctuations
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

def process_doc_with_split(directory, vocab, is_train):
    documents = list()
    for filename in listdir(directory):
        # 데이터가 cv001~cv999까지 존재하는데, cv9xx 이상부터는 test set으로 사용하자 (10%)
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        doc = load_doc(path)
        tokens = clean_doc(doc, vocab)
        documents.append(tokens)
    return documents

from numpy import array
def load_clean_dataset(vocab, is_train):
    neg = process_doc_with_split('txt_sentoken/neg', vocab, is_train)
    pos = process_doc_with_split('txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels

from keras.preprocessing.text import Tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines) # fit_on_text함수의 입력은 list of texts이다.
    return tokenizer

from keras.preprocessing.sequence import pad_sequences
def encode_docs(tokenizer, max_length, docs):
    encoded = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded

Using TensorFlow backend.


In [9]:
## Load vocabulary
#vocab_filename = 'voca.txt'
#vocab = load_doc(vocab_filename)
#vocab = set(vocab.split())

## Load training data
train_docs, ytrain = load_clean_dataset(vocab, True)

## Define tokenizer
tokenizer = create_tokenizer(train_docs)

vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 25768


In [10]:
#print(tokenizer.word_index)

# {'woods': 1322,
#  'spiders': 13894,
#  'darryls': 19605,
#  'hanging': 2036,
#  'woody': 924,
#  'comically': 7910,
#  'scold': 19606,
#  'originality': 2136,
#  'rickman': 7005,
#  'bringing': 1454,
#  'liaisons': 8492,
#  'sommerset': 13895,
#  'wooden': 2608,
#  'wednesday': 12257,
#  'circuitry': 16193,
#  'crotch': 8493,
#  'elgar': 19607,
# ...

# 위와 같이 tokenizer는 거치면 word index를 가지게 된다

In [11]:
## Encode data
max_length = max([len(s.split()) for s in train_docs]) # 하나의 document에 최대 word 개수 설정
print('Maximum length: %d' % max_length)
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# encode_docs 함수를 통해 sequence words를 sequence voca index로 변환해준다

Maximum length: 1317


In [12]:
## DEFINE MODEL
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

def define_model(vocab_size, max_length):
    model = Sequential()
    embedding_dim = 100
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

model = define_model(vocab_size, max_length)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                209610    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 2,812,053
Trainable params: 2,812,053
Non-trainable params: 0
_________________________________________________________________


In [13]:
## TRAIN MODEL
model.fit(Xtrain, ytrain, epochs=10, verbose=1)
#model.save('model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bea9515a20>

## Test CNN

In [14]:
from keras.models import load_model

# load trian data
#train_docs, ytrain = load_clean_dataset(vocab, True)

# load test data
# 주의할 점은 (word, index) 정보가 있는 tokenizer는 train data와 test data 모두 똑같다
test_docs, ytest = load_clean_dataset(vocab, False)
Xtest = encode_docs(tokenizer, max_length, test_docs)

# load pre-trained model
#model = load_model('model.h5')

_, acc1 = model.evaluate(Xtrain, ytrain, verbose=0)
print('Train Accuracy: %f' % (acc1*100))

_, acc2 = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc2*100))

Train Accuracy: 100.000000
Test Accuracy: 87.500000


In [15]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    line = clean_doc(review, vocab)
    #print(line)
    padded = encode_docs(tokenizer, max_length, [line])
    #print(padded)
    #print(len(padded[0]))
    yhat = model.predict(padded, verbose=0)
    percent_pos = yhat[0,0]
    if round(percent_pos)==0: # 0.5이하라면..
        return (1-percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'

text = 'Everything will enjoy this film. I love it, recommended!'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

print('\n')
text = 'This is a bad movie. Do not watch it. It sucks.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Everything will enjoy this film. I love it, recommended!]
Sentiment: NEGATIVE (51.988%)


Review: [This is a bad movie. Do not watch it. It sucks.]
Sentiment: NEGATIVE (54.339%)


## More things to do
* better data cleaning
* truncated senquences (maybe shorter max_length?)
* truncated vocabulary (maybe smaller voca_size?)
* better cnn architecture (fileters and kernel size, depth, width)
* better optimization policy (epochs and batch size)
* pretrained word embedding
* divide long and short reviews

# 2. n-gram CNN

* text classification을 위한 가장 간단한 딥러닝 모델은 word embedding layer와 one-dimensional convolutional neural network를 사용하는 것이었다. 
* different kernel size를 사용하는 multiple parallel convolutional neural networks로 확장할 수 있다. 
* 이러한 모델은 multichannel convolutional neural network와 같고 different n-gram sizes (groups of words)를 읽을 수 있다고 볼 수 있다
* This allows the document to be processed at different resolutions or different n-grams (groups of words) at a time, whilst the model learns how to best integrate these interpretations.
* Yoon Kim 2014 paper (여기서는 embedding은 안 건드리고 different kernel size만 사용(multichannel))

In [16]:
# # save dataset
# def save_dataset(dataset, filename):
#     dump(dataset, open(filename, 'wb'))
#     print('Saved: %s' % filename)

# # load dataset
# def load_dataset(filename):
#     return load(open(filename, 'rb'))

## Train n-gram CNN

In [25]:
from keras.models import Model
from keras.layers import Input
from keras.layers.merge import concatenate
from keras.layers import Dropout

def define_n_gram_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)    
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)    
    
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
    model.summary()
    #plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

model = define_n_gram_model(max_length, vocab_size)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 1317)         0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 1317)         0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 1317)         0                                            
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 1317, 100)    2576800     input_16[0][0]                   
__________________________________________________________________________________________________
embedding_

In [26]:
model.fit([Xtrain, Xtrain, Xtrain], ytrain, epochs=7, batch_size=16)
#model.save('model.h5')

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1beaf11ec88>

## Test n-gram CNN

In [28]:
_, acc1 = model.evaluate([Xtrain, Xtrain, Xtrain], ytrain, verbose=0)
print('Train Accuracy: %f' % (acc1*100))

_, acc2 = model.evaluate([Xtest, Xtest, Xtest], ytest, verbose=0)
print('Test Accuracy: %f' % (acc2*100))

Train Accuracy: 100.000000
Test Accuracy: 88.500000


## More things to do...
* different n-grams (change the kernel size)
* more or fewer channels
* shared embedding (다수의 채널에 똑같은 word embedding 사용)
* 다른 pre-train word embedding 사용