"""Akash_Maurya_multi_hot_sentiment .ipynb
Automatically generated by Colaboratory.
# Text classification
## Sentiment analysis
It is a natural language processing problem where text is understood and the underlying intent is predicted. Here, you need to predict the sentiment of movie reviews as either positive or negative in Python using the Keras deep learning library.
## Data description
The dataset is the Large Movie Review Dataset often referred to as the IMDB dataset.
The [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/) (often referred to as the IMDB dataset) contains 25,000 highly polar movie reviews (good or bad) for training and the same amount again for testing. The problem is to determine whether a given moving review has a positive or negative sentiment. Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers).
## Loading dataset
First, we will load complete dataset and analyze some properties of it.
"""
import numpy as np
import h5py  # required for saving models in HDF5 format via model.save()
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
# imdb.load_data calls np.load internally, but still assumes the default
# arguments of an older numpy version, so np.load is temporarily patched
# to pass allow_pickle=True.
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# keep a reference to the original np.load
np_load_old = np.load
# modify the default parameters of np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
# call load_data with allow_pickle implicitly set to True
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)
# restore np.load for future normal usage
np.load = np_load_old
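# Quick sanity check: with num_words=10000, rarer words are replaced by the
# out-of-vocabulary index (2 by default), so no review should contain an index
# of 10000 or above.
print("Max word index in training data:", max(max(seq) for seq in X_train))  # expect 9999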
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
print("Size of X")
print(X.shape)
print("Size of X_train")
print(X_train.shape)
print("Size of y")
print(y.shape)
print("Size of y_train")
print(y_train.shape)
"""## **Let's see some of reviews.**"""
print("## **Let's see some of reviews.**")
word_to_id = keras.datasets.imdb.get_word_index()
id_to_word = {value:key for key,value in word_to_id.items()}
for i in range(15,17):
print("******************** REVIEW_EXAMPLE ************************")
print(' '.join(id_to_word.get(id - 3, '?')for id in X_train[i] ))
"""## Summarize the data
1) Find out the number of classes in label (*y* array)? <br>
2) Find out number of unique words in dataset *X*? <br>
3) Calculate the list of review length , report mean and standard deviation. <br>
"""
def summarize_data():
    """
    Output:
        classes: list, list of unique classes in y
        no_of_words: int, number of unique words in dataset X
        list_of_review_lengths: list, list of lengths of each review
        mean_review_length: float, mean of list_of_review_lengths
        std_review_length: float, standard deviation of list_of_review_lengths
    """
    import statistics
    classes = np.unique(y)
    no_of_words = len(np.unique(np.concatenate(X)))
    list_of_review_lengths = [len(review) for review in X]
    mean_review_length = statistics.mean(list_of_review_lengths)
    std_review_length = statistics.stdev(list_of_review_lengths)
    return classes, no_of_words, list_of_review_lengths, mean_review_length, std_review_length
# call the function summarize_data
classes, no_of_words, list_of_review_lengths, mean_review_length, std_review_length = summarize_data()
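# Report the summary statistics. For the standard IMDB download the mean review
# length is around 234 words, though exact values depend on the dataset version.
print("Classes:", classes)
print("Number of unique words:", no_of_words)
print("Mean review length: %.2f" % mean_review_length)
print("Std of review length: %.2f" % std_review_length)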
"""## One hot encode the output data"""
def one_hot(y):
"""
Inputs:
y: numpy array with class labels
Outputs:
y_oh: numpy array with corresponding one-hot encodings
"""
oh = []
for i in range(0, len(y)):
if y[i] == 0:
oh.append([1, 0])
else:
oh.append([0, 1])
y_oh = np.array(oh)
return y_oh
#call the function one_hot
y_train = one_hot(y_train)
y_test = one_hot(y_test)
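# For reference: Keras ships an equivalent helper. Assuming a Keras 2.x install,
# keras.utils.to_categorical would produce the same [1, 0] / [0, 1] rows for
# binary labels:
# y_train = keras.utils.to_categorical(y_train, num_classes=2)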
"""### Multi-hot encode the input data
All sequences are of different length and our vocabulory size is 10K. <br>
**To Do**
1) Intialize vector of dimension 10,000 with value 0. <br>
2) For those tokens in a sequence which are present in Vocabulary make that position as 1 and keep all other positions filled with 0. <br>
For example, lets take Vocabulary = ['I': 0, ':1, 'eat: 2:' mango: 3, 'fruit':4, 'happy':5, 'you':6] <br>
We have two sequnces and
Multi-hot encoding of both sequences will be of dimension: 7 (vocab size).<br>
1) *Mango is my favourite fruit* becomes *Mango ? ? ? fruit* after removing words which are not in my vocabulary. Hence multi hot encoding will have two 1's corresponding to mango and fruit i.e, [0, 0, 0, 1, 1, 0, 0] <br>
Similarly, <br>
2) *I love to eat mango* = *I ? ? eat mango* = [1, 1, 0, 1, 0, 0, 0]
"""
def multi_hot_encode(sequences, dimension):
    """
    Input:
        sequences: list of sequences in X_train or X_test
        dimension: int, size of the vocabulary
    Output:
        results: numpy matrix of shape (len(sequences), dimension)
    """
    results = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        # set every position that appears in the sequence to 1
        results[i, seq] = 1
    return results
# call the function multi_hot_encode
x_train = multi_hot_encode(X_train, 10000)
x_test = multi_hot_encode(X_test, 10000)
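# Sanity check on the toy example from the docstring above (hypothetical data,
# not part of the IMDB set): vocabulary size 7, sequences already converted to
# word indices.
toy_sequences = [[3, 4], [0, 1, 2, 3]]  # "mango fruit" and "I love eat mango"
print(multi_hot_encode(toy_sequences, 7))
# expected:
# [[0. 0. 0. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0. 0. 0.]]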
"""## Split the data into train and validation"""
from sklearn.model_selection import train_test_split
x_strat, x_dev, y_strat, y_dev = train_test_split(x_train, y_train,test_size=0.40,random_state=0, stratify=y_train)
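# quick check of the split sizes (a 60/40 train/validation split of the
# 25,000 training reviews)
print("Train:", x_strat.shape, "Validation:", x_dev.shape)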
"""## Build Model
Build a multi layered feed forward network in keras.
### Create the model
"""
def create_model():
    """
    Output:
        model: A compiled keras model
    """
    model = Sequential()
    # each of the 10,000 input positions (holding a 0 or a 1) is mapped to a
    # 32-dimensional embedding, then flattened into a single feature vector
    model.add(Embedding(10000, 32, input_length=10000))
    model.add(Flatten())
    model.add(Dense(32, activation='tanh'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
    return model
model = create_model()
print(model.summary())
"""### Fit the Model"""
def fit(model):
"""
Action:
Fit the model created above using training data as x_strat and y_strat
and validation_data as x_dev and y_dev, verbose=2 and store it in 'history' variable.
evaluate the model using x_test, y_test, verbose=0 and store it in 'scores' list
Output:
scores: list of length 2
history_dict: output of history.history where history is output of model.fit()
"""
history = model.fit(x_strat, y_strat, validation_data=(x_dev, y_dev), epochs=15, batch_size=128, verbose=1)
scores = model.evaluate(x_test, y_test, verbose=0)
history_dict = history.history
return scores,history_dict
scores,history_dict = fit(model)
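# Optional: visualize the training curves stored in history_dict. The accuracy
# key name depends on the Keras version ('acc' in older releases, 'accuracy'
# in newer ones), so pick whichever is present.
import matplotlib.pyplot as plt
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
plt.plot(history_dict[acc_key], label='train accuracy')
plt.plot(history_dict['val_' + acc_key], label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()
print("Test loss: %.4f, test accuracy: %.4f" % (scores[0], scores[1]))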
model.save("Your_Model.h5")
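# The saved model can later be reloaded and used for prediction; a minimal
# sketch (requires the h5py package for the HDF5 format):
# from keras.models import load_model
# restored_model = load_model("Your_Model.h5")
# predictions = restored_model.predict(x_test[:5])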