In [286]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from nltk.corpus import stopwords,words,brown
from nltk import word_tokenize, sent_tokenize, pos_tag
from string import punctuation
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, LSTM, Dense, Embedding, Dropout, Conv1D, MaxPooling1D, Activation, Bidirectional
from keras.models import Model
from sklearn.metrics import confusion_matrix
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
import nltk
from keras import layers, regularizers
from keras.models import Sequential
from keras import models
from keras.preprocessing.text import Tokenizer
from sklearn import metrics

### Reading data

In [287]:
data = pd.read_csv('women.csv', encoding = 'utf8')

In [288]:
texts = data['Review Text'].astype(str)
ratings = data['Recommended IND'].astype(int)

In [289]:
texts0 = []
ratings0 = []
texts1 = []
ratings1 = []
for text, rating in zip(texts, ratings):
    if rating == 0:
        texts0.append(text)
        ratings0.append(rating)
    else:
        texts1.append(text)
        ratings1.append(rating)

In [290]:
texts0.extend(texts1[0:len(texts0)])
ratings0.extend(ratings1[0:len(ratings0)])

In [291]:
X = texts0
Y = ratings0

In [292]:
from keras.utils import to_categorical
Y = to_categorical(Y)

### Lemmatizing and removing stop words 

In [293]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
         return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None # for easy if-statement



def filter_stop_words(train_sentences, stop_words):
    for i, sentence in enumerate(train_sentences):
        tokens = word_tokenize(sentence)
        lemmas = []
        lemmatizer = WordNetLemmatizer()
        tagged = nltk.pos_tag(tokens)
        for word, tag in tagged:
            if word not in stop_words:
                wntag = get_wordnet_pos(tag)
                if wntag is None:# not supply tag in case of None
                    lemma = lemmatizer.lemmatize(word) 
                else:
                    lemma = lemmatizer.lemmatize(word, pos=wntag)
                lemmas.append(lemma)
        train_sentences[i] = ' '.join(lemmas)
    return train_sentences

stop_words = set(stopwords.words("english"))
X = filter_stop_words(X, stop_words)

In [294]:
max_len = 50
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = pad_sequences(sequences,maxlen=max_len, padding = 'post')
train_data = np.array(sequences_matrix)

In [295]:
X_train,X_test,Y_train,Y_test = train_test_split(train_data,Y,test_size=0.15)

### Creating matrix of pre-trained word embeddings from GloVe

In [296]:
embeddings_index = {}
f = open(r"C:\Users\Abhishek\Downloads\glove.6B\glove.6B.100d.txt", encoding = 'utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [331]:
embedding_matrix = np.zeros((len(tok.word_index) + 1, 100))
for word, i in tok.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [332]:
from keras.layers import Embedding

embedding_layer = Embedding(len(tok.word_index) + 1,
                            100,
                            weights=[embedding_matrix],
                            input_length=50,
                            trainable=True)

In [333]:
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32, dropout = 0.2, kernel_regularizer=regularizers.l2(0.01))))
model.add(Dense(Y.shape[1], activation='softmax'))
from keras.optimizers import SGD
opt = SGD(lr=0.001)
model.compile(loss = "categorical_crossentropy", optimizer = 'adam', metrics = ['categorical_accuracy'])
model.fit(X_train,Y_train,batch_size=128,epochs=25,validation_split = 0.2)

Train on 5673 samples, validate on 1419 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a274954f60>

In [334]:
Y_pred = model.predict(X_test)
matrix = metrics.confusion_matrix(Y_test.argmax(axis=1), Y_pred.argmax(axis=1))
matrix

array([[573,  76],
       [100, 503]], dtype=int64)

In [335]:
f1_score = metrics.f1_score(Y_test.argmax(axis=1), Y_pred.argmax(axis=1), average = 'weighted')
accuracy = metrics.accuracy_score(Y_test.argmax(axis=1), Y_pred.argmax(axis=1))
precision = metrics.precision_score(Y_test.argmax(axis=1), Y_pred.argmax(axis=1), average = 'weighted')
recall = metrics.recall_score(Y_test.argmax(axis=1), Y_pred.argmax(axis=1), average = 'weighted')

In [336]:
print('The accuracy is ' + str(("%.2f" % accuracy)) + ', the precision is ' + str(("%.2f" % precision)) + ', the recall is ' + str(("%.2f" % precision)) + ' and the f1 score is ' + str(("%.2f" % f1_score)) + '.')

The accuracy is 0.86, the precision is 0.86, the recall is 0.86 and the f1 score is 0.86.
