<a href="https://colab.research.google.com/github/ambitama-poddar/Machine-Learning-Practicals/blob/main/RNN(long).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Absolute location of the notebook folder on the mounted Drive. Anchoring it at
# the '/content/drive' mount point keeps it valid if the working directory changes.
Path = '/content/drive/My Drive/Colab Notebooks'

In [None]:
# Column names are passed explicitly, so every row of the file is read as data.
data = pd.read_csv(
  f'{Path}/Sentiment140_dataset.csv',
  sep=',',
  encoding='latin-1',
  names=['polarity', 'id', 'date', 'query', 'user', 'text'],
)
data.head(20)

In [None]:
# Report (rows, columns) of the loaded frame.
print(f"Dataset shape: {data.shape}")

In [None]:
# Distinct label values present in the polarity column.
data['polarity'].unique()

In [None]:
# Recode the label value 4 as 1 in the polarity column.
data['polarity'] = data['polarity'].replace(4, 1)
# Preview the first rows only rather than requesting the entire frame.
data.head()

In [None]:
# Summary statistics of the frame.
data.describe()

In [None]:
# Split the label column by class so each sentiment can be counted.
positives = data.loc[data['polarity'] == 1, 'polarity']
negatives = data.loc[data['polarity'] == 0, 'polarity']

# (message template, value) pairs, printed in order.
count_report = (
  ('Total length of the data is:                    {}', data.shape[0]),
  ('Number of positive tagged sentences is:         {}', len(positives)),
  ('Number of negative tagged sentences is:         {}', len(negatives)),
)
for template, count in count_report:
  print(template.format(count))

In [None]:
# Keep only the label and the tweet text. Rebinding `data` (rather than dropping
# in place) together with errors='ignore' lets this cell be re-run without a KeyError.
data = data.drop(columns=['id', 'date', 'query', 'user'], errors='ignore')
data.info()

In [None]:
# Preview the first ten rows of the reduced frame.
data.head(10)

In [None]:
# Percentage of missing values per column (mean of the boolean null mask, times 100).
data.isnull().mean() * 100

In [None]:
# Cast the tweet column to str so the later text cleaning can call string methods on every row.
data['text'] = data['text'].astype('str')

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK's stop-word corpus and keep the English list as a set
# (membership is tested once per token during tweet cleaning).
nltk.download('stopwords')
stopword=set(stopwords.words('english'))
print(stopword)

In [None]:
import re
import string

urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
# Raw string: in a plain literal '\s' is an invalid escape sequence that newer
# Python versions warn about.
userPattern = r'@[^\s]+'
# Deletes every ASCII punctuation character; built once instead of once per tweet.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

def process_tweets(tweet):
  """Normalise one raw tweet into a space-separated string of lemmas.

  Steps: lower-case, strip URLs and @mentions, delete punctuation, tokenize,
  drop stop words and single-character tokens, then lemmatize what remains.

  The text is cleaned from its first character onward, so a leading "@user"
  is still recognised as a mention and the first word is kept intact.

  Args:
    tweet: Raw tweet text as a str.

  Returns:
    The surviving lemmas joined by single spaces ('' when nothing survives).
  """
  # Lower casing
  tweet = tweet.lower()
  # Removing all URLs
  tweet = re.sub(urlPattern, '', tweet)
  # Removing all @username mentions
  tweet = re.sub(userPattern, '', tweet)
  # Remove punctuation
  tweet = tweet.translate(_PUNCTUATION_TABLE)
  # Tokenizing words
  tokens = word_tokenize(tweet)
  wordLemm = WordNetLemmatizer()
  # Drop stop words and single-character tokens, lemmatize the rest
  finalwords = [
    wordLemm.lemmatize(w)
    for w in tokens
    if w not in stopword and len(w) > 1
  ]
  return ' '.join(finalwords)

In [None]:
# Tokenizer models and the WordNet corpus needed by word_tokenize and the lemmatizer.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps word_tokenize working on old and new versions.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
  nltk.download(resource)

In [None]:
# Clean every tweet and store the result next to the raw text.
data['processed_tweets'] = data['text'].apply(process_tweets)
print('Text Preprocessing complete.')

In [None]:
# Compare raw and cleaned text on the first ten rows.
data.head(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cleaned text and labels as arrays.
X = data['processed_tweets'].values
Y = data['polarity'].values
for arr in (X, Y):
  print(arr.shape)

# Fit TF-IDF on the cleaned tweets (sublinear_tf replaces tf with 1 + log(tf))
# and rebind X to the resulting document-term matrix.
vector = TfidfVectorizer(sublinear_tf=True)
X = vector.fit_transform(X)
print('Vector fitted.')

for arr in (X, Y):
  print(arr.shape)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import regularizers

# Vocabulary cap for the tokenizer and the fixed length every sequence is padded/truncated to.
max_words, max_len = 5000, 200

processed_text = data['processed_tweets']

# Build the word index from the cleaned tweets, then encode each tweet as integer ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(processed_text)
sequences = tokenizer.texts_to_sequences(processed_text)

# Bring every sequence to length max_len.
tweets = pad_sequences(sequences, maxlen=max_len)
print(tweets)

In [None]:
# Hold out 20% of the padded sequences for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
  tweets,
  data['polarity'].values,
  test_size=0.2,
  random_state=101,
)

In [None]:
# Shapes of the training arrays, a blank line, then shapes of the test arrays.
for split_name, split in (("X_train", X_train), ("Y_train", Y_train)):
  print(split_name, split.shape)
print()
for split_name, split in (("X_test", X_test), ("Y_test", Y_test)):
  print(split_name, split.shape)

In [None]:
from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
# Embedding -> LSTM -> two small dense layers -> single sigmoid unit for the binary label.
model = Sequential([
  layers.Embedding(max_words, 128),
  layers.LSTM(64, dropout=0.5),
  layers.Dense(16, activation='relu'),
  layers.Dense(8, activation='relu'),
  layers.Dense(1, activation='sigmoid'),
])
model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

model.summary()

In [None]:
# Stop as soon as validation loss fails to improve by min_delta (patience defaults to 0).
# restore_best_weights rolls the model back to the best epoch, so the weights
# evaluated afterwards are not those of the final, worse epoch.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001,
                               restore_best_weights=True)
# Keep the History object so the per-epoch metrics remain available.
history = model.fit(X_train, Y_train, batch_size=128, epochs=10,
                    validation_split=0.2, callbacks=[early_stopping])

In [None]:
# Score the held-out split; the result is indexed as [loss, accuracy] when printed.
accr = model.evaluate(X_test,Y_test)

In [None]:
# Report test loss and accuracy to three decimals.
test_report = 'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'
print(test_report.format(accr[0], accr[1]))