<a href="https://colab.research.google.com/github/archana3001/DeepLearning_Beginner/blob/master/implementation_of_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**text preprocessing and word embedding**

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import one_hot
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from sklearn import preprocessing

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df=pd.read_csv('/content/drive/MyDrive/AILAB/Constraint_English_Train - Constraint_English_Train.csv')
print(df.shape)
df.head()

(6420, 3)


Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [3]:
#text preprocessing and cleaning
corpus=[]
for i in range(0, len(df)):
  tweet=re.sub('[^a-zA-Z]', ' ',df['tweet'][i])
  tweet=tweet.lower()
  tweet=tweet.split()
  tweet=[word for word in tweet if not word in stopwords.words('english')]
  tweet=' '.join(tweet)
  corpus.append(tweet)

#one hot representation
voc_size=10000
one_hot_rep=[one_hot(words, voc_size) for words in corpus]

#embedding
embedded_doc=pad_sequences(one_hot_rep, padding='pre', maxlen=50) #we are making each sentences of word 50 if len(sentence)<50 we add 0 as prefix words
dim=100  #here dimension is no of features we want for embedding
model=Sequential()
model.add(Embedding(voc_size, dim, input_length=50)) #vocab_size, dimension and sentence length are parameters
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # compile the model
#print(model.summary())  # summarize the model

# let's see how our embedded doc is fitted to embedding model 
X=embedded_doc
label_encoder = preprocessing.LabelEncoder()
df['label']= label_encoder.fit_transform(df['label'])
y=df['label']

In [4]:
#Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.33, random_state=0)

In [5]:
#Scaling test and train data
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [6]:
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_train.shape[1]

50

 
##**Each RNN cell takes one data input and one hidden state which is passed from a one-time step to the next.**

**Simple RNN**

In [40]:
from keras.layers import SimpleRNN 
RNNmodel=Sequential()
RNNmodel.add(SimpleRNN(units=10, activation='tanh', input_shape = (X_train.shape[1], X_train.shape[2])))
RNNmodel.add(Dense(units=1, kernel_initializer='he_normal', activation='sigmoid'))
RNNmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = RNNmodel.fit(X_train, y_train, batch_size=10, epochs=21)

Epoch 1/21
Epoch 2/21
Epoch 3/21
Epoch 4/21
Epoch 5/21
Epoch 6/21
Epoch 7/21
Epoch 8/21
Epoch 9/21
Epoch 10/21
Epoch 11/21
Epoch 12/21
Epoch 13/21
Epoch 14/21
Epoch 15/21
Epoch 16/21
Epoch 17/21
Epoch 18/21
Epoch 19/21
Epoch 20/21
Epoch 21/21


In [43]:
#predicting 
y_pred_model = RNNmodel.predict(X_test)

In [44]:
y_pred_model =y_pred_model.flatten()
y_pred_model =(y_pred_model >0.5)

In [45]:
y_pred_model

array([False,  True, False, ...,  True,  True, False])

In [46]:
#confusion matrix and accuracy score for best_model we get after hyperparameter tuning in keras
from sklearn.metrics import confusion_matrix, accuracy_score
cm_sc=confusion_matrix(y_test, y_pred_model)
#calculating accuracy
score=accuracy_score(y_pred_model, y_test)

print("confusion matrix : \n", cm_sc)
print("score : ",score)

confusion matrix : 
 [[707 311]
 [287 814]]
score :  0.7177914110429447


##**LSTM**

In [33]:
#The LSTM transforms the vector sequence into a single vector of size lstm_out, containing information about the entire sequence.
from keras.layers import LSTM
lstm_out=200
lstm_model = Sequential()
lstm_model.add(LSTM(lstm_out))
lstm_model.add(Dense(1,activation='sigmoid'))
lstm_model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
lstm_model_history = lstm_model.fit(X_train, y_train,  batch_size=10, epochs=21)

Epoch 1/21
Epoch 2/21
Epoch 3/21
Epoch 4/21
Epoch 5/21
Epoch 6/21
Epoch 7/21
Epoch 8/21
Epoch 9/21
Epoch 10/21
Epoch 11/21
Epoch 12/21
Epoch 13/21
Epoch 14/21
Epoch 15/21
Epoch 16/21
Epoch 17/21
Epoch 18/21
Epoch 19/21
Epoch 20/21
Epoch 21/21


In [41]:
#predicting 
y_pred_model_lstm = lstm_model.predict(X_test)
y_pred_model_lstm =y_pred_model_lstm.flatten()
y_pred_model_lstm =(y_pred_model_lstm >0.5)
y_pred_model_lstm

array([False,  True, False, ...,  True,  True, False])

In [42]:
#confusion matrix and accuracy score for best_model we get after hyperparameter tuning in keras
from sklearn.metrics import confusion_matrix, accuracy_score
cm_lstm=confusion_matrix(y_test, y_pred_model)
#calculating accuracy
score_lstm=accuracy_score(y_pred_model_lstm, y_test)

print("confusion matrix : \n", cm_lstm)
print("score : ",score_lstm)

confusion matrix : 
 [[645 373]
 [220 881]]
score :  0.7229825389334592
