In [25]:
import xml.etree.ElementTree as ET
import random
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mutual_info_score
import pickle
import os

def XML2lists(path):
    reviews = []
    negReviews = []
    posReviews = []

    neg_tree = ET.parse(path +"/negative.parsed")
    neg_root = neg_tree.getroot()
    for rev in neg_root.iter('review'):
        reviews.append(rev.text)
        negReviews.append(rev.text)

    pos_tree = ET.parse(path +"/negative.parsed")
    pos_root = pos_tree.getroot()
    for rev in pos_root.iter('review'):
        reviews.append(rev.text)
        posReviews.append(rev.text)

    return reviews,negReviews,posReviews


In [87]:
import random
import nltk

src = 'books'
path = '/home/a/amirfeder/Downloads/Sentiment/data/' + src

reviews,neg_reviews,pos_reviews = XML2lists(path)
reviews_labels = [0]*len(neg_reviews) + [1]*len(pos_reviews)

reviews_no_jj = []
for review in reviews:
    cur_review = []
    tokens = nltk.word_tokenize(review)
    pos_tags = nltk.pos_tag(tokens)
    for i in range(len(tokens)):
        if pos_tags[i][1][:2] != 'JJ':
            cur_review.append(tokens[i])
    reviews_no_jj.append(' '.join(cur_review))
            
print(reviews_no_jj[0])
print(reviews[0])



In [88]:
reviews_tuples = [(reviews[i], reviews_no_jj[i], reviews_labels[i]) for i in range(len(reviews_labels))]

random.shuffle(reviews_tuples)
X = [item[0] for item in reviews_tuples]
X_no_jj = [item[1] for item in reviews_tuples]
y = [item[2] for item in reviews_tuples]

print(X[0], X_no_jj[0], y[0])

	Poor Content
	The book is manipulative and unrealistic. I felt the author was blatantly trying to persuade the reader to believe that infieldey, betrayal and lack of morals is acceptable if you have a self absorbed best friend.  Definitely one of the shallowest books I have ever read.  I bought the book as it was rated as a "best seller" , and I was stuck in an airport for hours on end. The entire plot is hideous as the "burned best friend" makes up reasons why it's accept able to betray her best friend. If I was not completely bored, I would have given up on the book after the first 20 pages.
	 Poor Content The book is and . I felt the author was blatantly trying to persuade the reader to believe that infieldey , betrayal and lack of morals is if you have a self absorbed friend . Definitely one of the books I have ever read . I bought the book as it was rated as a `` seller '' , and I was stuck in an airport for hours on end . The plot is as the `` friend '' makes up reasons why it '

In [89]:
import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(X[:1500])
reviews_test_clean = preprocess_reviews(X[1500:])
reviews_no_jj_train_clean = preprocess_reviews(X_no_jj[:1500])
reviews_no_jj_test_clean = preprocess_reviews(X_no_jj[1500:])

In [90]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)

cv.fit(reviews_train_clean)
X_train = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

# NO JJ
cv.fit(reviews_no_jj_train_clean)
X_no_jj_train = cv.transform(reviews_no_jj_train_clean)
X_no_jj_test = cv.transform(reviews_no_jj_test_clean)

y_train = y[:1500]
y_test = y[1500:]

In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("LR Accuracy for C=%s: %s" 
           % (c, 1-accuracy_score(y_test, lr.predict(X_test))))
rf = RandomForestClassifier(n_estimators=1000, max_depth=2,random_state=0)
rf.fit(X_train, y_train)
print ("RF Accuracy: %s" 
       % (1-accuracy_score(y_test, rf.predict(X_test))))
    
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_no_jj_train, y_train)
    print ("LR No JJ Accuracy for C=%s: %s" 
           % (c, 1-accuracy_score(y_test, lr.predict(X_no_jj_test))))
rf = RandomForestClassifier(n_estimators=1000, max_depth=2,random_state=0)
rf.fit(X_no_jj_train, y_train)
print ("RF No JJ Accuracy: %s" 
       % (1-accuracy_score(y_test, rf.predict(X_no_jj_test))))





LR Accuracy for C=0.01: 0.838
LR Accuracy for C=0.05: 0.86
LR Accuracy for C=0.25: 0.864
LR Accuracy for C=0.5: 0.864
LR Accuracy for C=1: 0.864
RF Accuracy: 0.6799999999999999
LR No JJ Accuracy for C=0.01: 0.832
LR No JJ Accuracy for C=0.05: 0.85
LR No JJ Accuracy for C=0.25: 0.864
LR No JJ Accuracy for C=0.5: 0.864
LR No JJ Accuracy for C=1: 0.864




RF No JJ Accuracy: 0.6699999999999999


# Keras

In [139]:
import pandas as pd
import numpy as np

df = pd.DataFrame()
df = pd.read_csv('/home/a/amirfeder/Downloads/Sentiment/data/movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [140]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [141]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(50000,)
(50000,)


In [142]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews) 

# pad sequences
max_length = max([len(s.split()) for s in total_reviews])

# define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens =  tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

In [143]:
print(vocab_size)

125602


In [144]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

print('Build model...')

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Build model...
Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 2678, 100)         12560200  
_________________________________________________________________
gru_3 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 12,573,001
Trainable params: 12,573,001
Non-trainable params: 0
_________________________________________________________________
None


In [145]:
print('Train...')
model.fit(X_train_pad, y_train, batch_size=128, epochs=5, validation_data=(X_test_pad, y_test), verbose=2)

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
 - 396s - loss: 0.6934 - acc: 0.5008 - val_loss: 0.6932 - val_acc: 0.5000
Epoch 2/5
 - 369s - loss: 0.6934 - acc: 0.4980 - val_loss: 0.6931 - val_acc: 0.5000
Epoch 3/5
 - 367s - loss: 0.6933 - acc: 0.5020 - val_loss: 0.6934 - val_acc: 0.5000
Epoch 4/5
 - 366s - loss: 0.6934 - acc: 0.4952 - val_loss: 0.6933 - val_acc: 0.5000
Epoch 5/5
 - 366s - loss: 0.6932 - acc: 0.4990 - val_loss: 0.6932 - val_acc: 0.5000


<keras.callbacks.History at 0x7f3a08626a90>

In [146]:
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.6931521001243591
Test accuracy: 0.5
Accuracy: 50.00%
