In [1]:
import pandas as pd
import numpy as np
import glob, os, string, re, spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

  return f(*args, **kwds)


In [2]:
train_pos_files = glob.glob("datasets/train/pos/*.txt")
train_neg_files = glob.glob("datasets/train/neg/*.txt")
train_pos_ls = []
for i in train_pos_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    train_pos_ls.append(str)
    
train_neg_ls = []
for i in train_neg_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    train_neg_ls.append(str)
    


In [3]:
labels = ['reveiw', 'label']
df_train_pos = pd.DataFrame()
df_train_pos['review'] = train_pos_ls
df_train_pos['label'] = 1
df_train_neg = pd.DataFrame()
df_train_neg['review'] = train_neg_ls
df_train_neg['label'] = -1
df_train = pd.concat([df_train_pos , df_train_neg])


In [4]:
test_pos_files = glob.glob("datasets/test/pos/*.txt")
test_neg_files = glob.glob("datasets/test/neg/*.txt")
test_pos_ls = []
for i in test_pos_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    test_pos_ls.append(str)
    
test_neg_ls = []
for i in test_neg_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    test_neg_ls.append(str)
    

In [5]:
labels = ['reveiw', 'label']
df_test_pos = pd.DataFrame()
df_test_pos['review'] = test_pos_ls
df_test_pos['label'] = 1
df_test_neg = pd.DataFrame()
df_test_neg['review'] = test_neg_ls
df_test_neg['label'] = -1
df_test = pd.concat([df_test_pos , df_test_neg])
df_test.head()

Unnamed: 0,review,label
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [6]:
lemma = WordNetLemmatizer()
stops = set(stopwords.words('english'))

# nltk stopwords removal performs better than spacy 
# nlp = spacy.load('en_core_web_sm')
# spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
            
def text_prep(text):
    no_punct = [char for char in text if char not in string.punctuation]
    text = "".join(no_punct)
    text = [lemma.lemmatize(text, pos='v') for text in text.lower().split() if text not in stops] 
    text = " ".join(text)
    return (text)


In [7]:
# preprocess training data
df_train['prep_review'] = df_train['review'].apply(lambda x:text_prep(x))
df_train[['prep_review', 'label']].head()

Unnamed: 0,prep_review,label
0,movie get respect sure lot memorable quote lis...,1
1,bizarre horror movie fill famous face steal cr...,1
2,solid unremarkable film matthau einstein wonde...,1
3,strange feel sit alone theater occupy parent r...,1
4,probably already know 5 additional episodes ne...,1


In [8]:
# preprocess testing data
df_test['prep_review'] = df_test['review'].apply(lambda x:text_prep(x))
df_test[['prep_review', 'label']].head()

Unnamed: 0,prep_review,label
0,base actual story john boorman show struggle a...,1
1,gem film four production anticipate quality in...,1
2,really like show drama romance comedy roll one...,1
3,best 3d experience disney themeparks certainly...,1
4,korean movies ive see three really stick first...,1


In [9]:
# Vectorizing training data 
tfidf = TfidfVectorizer()
# tfidf = TfidfVectorizer(ngram_range = (1,3)) did not improve accuracy
x_train = tfidf.fit_transform(df_train['prep_review'])
y_train = df_train['label']

In [10]:
# Vectorizing testing data 
x_test = tfidf.transform(df_test['prep_review'])
y_test = df_test['label']

In [11]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(solver = 'lbfgs', n_jobs = -1)
LR.fit(x_train, y_train)
LR_clf = LR.predict(x_test)
print (LR.score(x_train, y_train))
print (accuracy_score(y_test, LR_clf))


0.93528
0.88336


In [12]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(x_train, y_train)
DT_clf = DT.predict(x_test)
print (DT.score(x_train, y_train))
print (accuracy_score(y_test, DT_clf))

1.0
0.7096


In [13]:
# slow and low accuracy
# from sklearn.svm import SVC
# SVM = SVC(gamma = 'scale')
# SVM.fit(x_train, y_train)
# SVM_clf = SVM.predict(x_test)
# print (SVM.score(x_train, y_train))
# print (SVM.score(x_test, y_test))

In [14]:
from sklearn.svm import LinearSVC
LSVM = LinearSVC()
LSVM.fit(x_train, y_train)
LSVM_clf = LSVM.predict(x_test)
print (LSVM.score(x_train, y_train))
print (accuracy_score(y_test, LSVM_clf))

0.99128
0.87264


In [15]:
from sklearn.ensemble import AdaBoostClassifier
ADA = AdaBoostClassifier(n_estimators=100)
ADA.fit(x_train, y_train)
ADA_clf = ADA.predict(x_test)
print (ADA.score(x_train, y_train))
print (accuracy_score(y_test, ADA_clf))

0.84076
0.83188


In [17]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(n_estimators=100, random_state = 42, n_jobs = -1)
RFC.fit(x_train, y_train)
RFC_clf = RFC.predict(x_test) 
print (RFC.score(x_train, y_train))
print (accuracy_score(y_test, RFC_clf))

1.0
0.84712


In [18]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(x_train, y_train)
MNB_clf = MNB.predict(x_test)
print (MNB.score(x_train, y_train))
print (accuracy_score(y_test, MNB_clf))

0.9172
0.83308


In [19]:
print(classification_report(y_test, LR_clf))

              precision    recall  f1-score   support

          -1       0.89      0.88      0.88     12500
           1       0.88      0.89      0.88     12500

   micro avg       0.88      0.88      0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [20]:
from keras.preprocessing import sequence

tfidf_NN = TfidfVectorizer(max_features = 1000)
x_train_NN = tfidf_NN.fit_transform(df_train['prep_review'])
y_train_NN = df_train['label']
x_test_NN = tfidf_NN.transform(df_test['prep_review'])
y_test_NN = df_test['label']
x_train_NN.shape

Using TensorFlow backend.


(25000, 1000)

In [21]:
from keras.models import Sequential
from keras.layers import LSTM, Convolution1D, Flatten, Dropout, Dense

model = Sequential()
model.add(Dense(256, input_shape=(1000,) , activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(160, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(120, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(80, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               256256    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               51400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 160)               32160     
_________________________________________________________________
dropout_3 (Dropout)          (None, 160)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 120)               19320     
__________

In [22]:
model.fit(x_train_NN, y_train_NN, batch_size=128, epochs=10, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a27337e48>

In [23]:
loss, accuracy = model.evaluate(x_train_NN, y_train_NN)
print (loss, accuracy)

-7.010489030947685 0.45948


In [24]:
predictions = model.predict(x_test_NN)
# round predictions
rounded = [round(x[0]) for x in predictions]
predictions = rounded
# print (predictions)
score = accuracy_score(y_test_NN ,predictions)
print(score)


0.40984
