In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv
/kaggle/input/amazonyelptwittersentiment/amazon_yelp_twitter2.csv
/kaggle/input/amazonyelptwittersentiment/amazon_yelp_twitter.csv


## Importing the dataset

In [2]:
df = pd.read_csv('/kaggle/input/amazonyelptwittersentiment/amazon_yelp_twitter.csv',header=None)
df = df.dropna()
df.columns = ['sentiment', 'text']
df.head()

Unnamed: 0,sentiment,text
0,1,nearly perfect wheat-free bread mix
1,0,be aware: speakers not as advertised on amazon
2,0,results comedic at best
3,1,going to enjoy the sunshine while its here
4,1,i feel better now.


## Feature Engineering

In [4]:
y = df['sentiment']

## Splitting the dataset in train and test split

In [5]:
from sklearn.model_selection import train_test_split
df_train,df_test,y_train,y_test=train_test_split(df['text'],y,train_size=0.2,test_size=0.05,random_state=42)
print('DF Train Shape: ',df_train.shape)
print('DF Test Shape: ',df_test.shape)
print('Y Train Shape: ',y_train.shape)
print('Y Test Shape: ',y_test.shape)


DF Train Shape:  (897722,)
DF Test Shape:  (224431,)
Y Train Shape:  (897722,)
Y Test Shape:  (224431,)


In [6]:
y_train.value_counts()

1    460097
0    437625
Name: sentiment, dtype: int64

## Building deep learn model

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
max_words=10000
tokenizer=Tokenizer(max_words)
tokenizer.fit_on_texts(df_train)
sequence_train=tokenizer.texts_to_sequences(df_train)
sequence_test=tokenizer.texts_to_sequences(df_test)

In [8]:
word2vec=tokenizer.word_index
V=len(word2vec)
print('dataset has %s number of independent tokens' %V)

dataset has 191384 number of independent tokens


In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
data_train=pad_sequences(sequence_train)
data_train.shape

(897722, 39)

In [10]:
T=data_train.shape[1]
data_test=pad_sequences(sequence_test,maxlen=T)
data_test.shape

(224431, 39)

In [12]:
from tensorflow.keras.layers import Input,Conv1D,MaxPooling1D,Dense,GlobalMaxPooling1D,Embedding,Bidirectional,LSTM
from tensorflow.keras.models import Model, Sequential, model_from_json

In [14]:
D=20
i=Input((T,))
x=Embedding(V+1,D)(i)
x=Conv1D(32,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(64,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(128,3,activation='relu')(x)
x=GlobalMaxPooling1D()(x)
x=Dense(1,activation='sigmoid')(x)
model=Model(i,x)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 39)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 39, 20)            3827700   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 37, 32)            1952      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 12, 32)            0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 10, 64)            6208      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 3, 64)             0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1, 128)            24704 

## Training the model (CNN)

In [21]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
cnn_senti=model.fit(data_train,y_train,validation_data=(data_test,y_test),epochs=10,batch_size=100)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Training the model (BiLSTM)

In [15]:
model = Sequential()
D=20
model.add(Embedding(V+1, D, input_length=T))
lstm_out = 64
model.add(Bidirectional(LSTM(lstm_out)))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 39, 20)            3827700   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               43520     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 3,872,521
Trainable params: 3,872,521
Non-trainable params: 0
_________________________________________________________________


In [17]:
lstm_senti=model.fit(data_train,y_train,validation_data=(data_test,y_test),epochs=10,batch_size=100)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:

model.save("bilstm")
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [19]:
!zip -r output.zip /kaggle/working/

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/model.json (deflated 75%)
  adding: kaggle/working/model.h5 (deflated 9%)
  adding: kaggle/working/__notebook_source__.ipynb (deflated 44%)
  adding: kaggle/working/bilstm/ (stored 0%)
  adding: kaggle/working/bilstm/variables/ (stored 0%)
  adding: kaggle/working/bilstm/variables/variables.index (deflated 67%)
  adding: kaggle/working/bilstm/variables/variables.data-00001-of-00002 (deflated 6%)
  adding: kaggle/working/bilstm/variables/variables.data-00000-of-00002 (deflated 66%)
  adding: kaggle/working/bilstm/assets/ (stored 0%)
  adding: kaggle/working/bilstm/saved_model.pb (deflated 91%)


In [28]:
lstm_senti=model.fit(data_train,y_train,validation_data=(data_test,y_test),epochs=10,batch_size=100)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
y_pred=model.predict(data_test)
y_pred

In [None]:
y_pred=np.argmax(y_pred,axis=1)
y_pred

## Scoring 

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
import seaborn as sns

In [None]:
cm=confusion_matrix(y_test,y_pred)
ax=sns.heatmap(cm,annot=True,cmap='Blues',fmt=' ')
ax.set_title('Confusion Matrix')
ax.set_xlabel('y_test')
ax.set_ylabel('y_pred')


In [None]:
print(classification_report(y_test,y_pred))