## Importing Necessities

In [20]:
import re
import string
import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

import pickle

from keras.preprocessing.sequence import pad_sequences

from keras.models import load_model

## Loading Data

In [7]:
# Ask for the base name of the CSV to label; it is looked up under ../data/.
print('Enter the name of file: ')
# Trim stray whitespace from the typed name so it cannot end up inside the path.
file_name = input().strip()
# Accept "new" as well as "new.csv": the extension is appended below (and again
# when the labeled file is written), so a typed extension would be duplicated.
if file_name.lower().endswith('.csv'):
    file_name = file_name[:-len('.csv')]
location = '../data/' + file_name + '.csv'
# The input files are read with the cp949 codec (a Korean Windows code page).
data = pd.read_csv(location, encoding='cp949')
data.head(5)

new


Unnamed: 0,sentence,sentiment
0,but not bad,2
1,"initially, they fit great,",2
2,but the shirts shrunk about 2 sizes after wash...,2
3,i've worn a 16 1/2 x 34 dress shirt for over 2...,2
4,but this shirt did not fit me,2


## Text Preprocessing

### Cleaning the Corpus

In [8]:
def clean_text(text):
    '''
    Normalize one raw sentence into lowercase, punctuation-free text.

    Steps, applied in this order:
      1. coerce to ``str`` and lowercase (so ``None`` becomes ``"none"`` and
         a float NaN becomes ``"nan"``; drop missing rows before calling),
      2. remove text in square brackets,
      3. remove links (``http://``, ``https://`` and ``www.`` prefixes),
      4. remove HTML tags,
      5. remove newline characters (without inserting a space),
      6. remove punctuation,
      7. remove words containing digits,
      8. remove single letters that have whitespace on both sides,
      9. collapse every whitespace run into one space.

    Step 8 is deliberately left as-is: a single letter at the very start or
    end of the text, or one directly following another removed letter, is
    kept (e.g. ``"a b c d"`` becomes ``"a c d"``).
    NOTE(review): the pickled tokenizer and saved model used later in this
    notebook presumably saw text cleaned this exact way; confirm before
    changing that rule.

    Parameters
    ----------
    text : object
        The raw sentence; any value is accepted and converted with ``str``.

    Returns
    -------
    str
        The cleaned sentence. Leading/trailing whitespace is reduced to a
        single space, not stripped.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # The case-insensitive flag was dropped here: \s+ has no letters to match.
    text = re.sub(r'\s+', ' ', text)
    return text

In [9]:
# Delete rows with missing values *before* cleaning: clean_text() converts its
# input with str(), so a missing sentence would otherwise turn into the literal
# text "nan" and could no longer be detected by dropna().
# .copy() makes the filtered frame independent so the column assignment below
# writes to it directly.
data = data.dropna(axis=0).copy()

data['sentence'] = data['sentence'].apply(clean_text)

data.head(5)

Unnamed: 0,sentence,sentiment
0,but not bad,2
1,initially they fit great,2
2,but the shirts shrunk about sizes after washin...,2
3,ive worn x dress shirt for over years,2
4,but this shirt did not fit me,2


### Lemmatization

In [10]:
def lemmatize_all(sentence):
    """Yield the tokens of ``sentence`` one by one, lemmatized by part of speech.

    The sentence is split with ``word_tokenize`` and tagged with ``pos_tag``.
    Each tag is matched by prefix against the table below and the token is
    lemmatized with the corresponding WordNet part of speech. Tokens whose tag
    matches no prefix are yielded unchanged.

    Parameters
    ----------
    sentence : str
        The text to tokenize and lemmatize.

    Yields
    ------
    str
        One (possibly lemmatized) token per input token, in order.
    """
    lemmatizer = WordNetLemmatizer()
    # (tag prefix, WordNet pos) pairs, tried in this order.
    prefix_to_pos = (
        ("NN", 'n'),
        ('VB', 'v'),
        ('JJ', 'a'),
        ('R', 'r'),
    )
    for token, tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = next(
            (pos for prefix, pos in prefix_to_pos if tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

In [11]:
# Lemmatize every sentence and glue the resulting tokens back together with
# single spaces, replacing the 'sentence' column.
sentences_as_text = data['sentence'].map(str)
data['sentence'] = sentences_as_text.map(
    lambda sentence_text: " ".join(lemmatize_all(sentence_text))
)
data.head(5)

Unnamed: 0,sentence,sentiment
0,but not bad,2
1,initially they fit great,2
2,but the shirt shrink about size after wash and...,2
3,ive worn x dress shirt for over year,2
4,but this shirt do not fit me,2


### Tokenizing Data

In [12]:
# Split each sentence on whitespace; the token lists are kept in 'x_temp'.
data['x_temp'] = data['sentence'].map(lambda sentence_text: str(sentence_text).split())

### Removing Stopwords

In [14]:
def remove_stopword(x, stop_words=None):
    """Return the tokens of ``x`` with stopwords removed, always keeping 'not'.

    Parameters
    ----------
    x : iterable of str
        The tokens of one sentence.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stopword list. Pass a
        precomputed collection to avoid reloading the list on every call.

    Returns
    -------
    list of str
        The tokens that are either 'not' or not in ``stop_words``, in their
        original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the word list used to be re-read for
    # every single token and searched linearly.
    stop_word_set = set(stop_words)
    return [y for y in x if y == 'not' or y not in stop_word_set]

In [15]:
# Filter the stopwords out of every token list, then drop any row that still
# has a missing value.
stopword_free_tokens = data['x_temp'].apply(remove_stopword)
data['x_temp'] = stopword_free_tokens
data = data.dropna(axis=0)

### Integer Encoding

In [17]:
# Restore the tokenizer object saved in tokenizer.pickle.
# NOTE: pickle.load can execute arbitrary code, so only load a file you trust.
with open('tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [18]:
# Map every token list to a sequence of integers with the unpickled
# tokenizer's texts_to_sequences.
tokenized_rows = data['x_temp']
predict_data = tokenizer.texts_to_sequences(tokenized_rows)

### Padding

In [21]:
# Length every sequence is padded/truncated to.
# NOTE(review): presumably the input length the saved model expects; confirm.
MAX_SEQUENCE_LENGTH = 22
predict_data = pad_sequences(predict_data, maxlen=MAX_SEQUENCE_LENGTH)

## Loading Model

In [22]:
# Saved model file, resolved relative to the notebook's working directory.
MODEL_PATH = 'review_sentiment_analysis_model.h5'
loaded_model = load_model(MODEL_PATH)

## Prediction

In [32]:
# Run the loaded model over the padded integer sequences.
prediction = loaded_model.predict(predict_data)

In [53]:
# For each row of model output, take the column index holding the largest
# value, then wrap those indices in a one-column DataFrame.
best_class_per_row = np.argmax(a=prediction, axis=1)
predicted_labels = pd.DataFrame(data=best_class_per_row)

In [54]:
# Attach the predictions by position, not by index label. predicted_labels
# carries a fresh 0..n-1 index, while data keeps its original labels after the
# earlier dropna() calls; a label-aligned assignment would shift predictions
# onto the wrong rows and leave NaN wherever a label is missing.
data['sentiment'] = np.asarray(predicted_labels).ravel()
data

Unnamed: 0,sentence,sentiment,x_temp
0,but not bad,2,"[not, bad]"
1,initially they fit great,1,"[initially, fit, great]"
2,but the shirt shrink about size after wash and...,0,"[shirt, shrink, size, wash, dry, long, fit]"
3,ive worn x dress shirt for over year,0,"[ive, worn, x, dress, shirt, year]"
4,but this shirt do not fit me,2,"[shirt, not, fit]"
...,...,...,...
877,but this shirt after three wash be still like ...,0,"[shirt, three, wash, still, like, steel, wool,..."
878,the sleeve and neck fit correctly,0,"[sleeve, neck, fit, correctly]"
879,but this be suppose to be tall shirt and the s...,0,"[suppose, tall, shirt, shirt, not, long, enough]"
880,fit as good as can guess from an online purcha...,1,"[fit, good, guess, online, purchase, little, s..."


In [55]:
# Write the processed sentences and their predicted sentiment to
# sentiment_labeled_<file_name>.csv in the working directory, without the index.
labeled_file_name = ''.join(('sentiment_labeled_', file_name, '.csv'))
labeled_rows = data[['sentence', 'sentiment']]
labeled_rows.to_csv(labeled_file_name, index=False)