In [75]:
import csv
import pandas as pd
import random
import pickle
import numpy as np
import tensorflow as tf
import zipfile
import warnings
warnings.simplefilter("ignore")


from tensorflow.keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [76]:
# --- Model / preprocessing settings shared by the cells below ---
EMBEDDING_DIM = 100       # passed to create_model as the embedding width
MAXLEN = 16               # every token sequence is padded/truncated to this length

# Both padding and truncation act on the end of a sequence.
PADDING = 'post'
TRUNCATING = 'post'

OOV_TOKEN = "<OOV>"       # handed to the tokenizer as its out-of-vocabulary token
TRAINING_SPLIT = 0.9      # passed as train_size when splitting train/validation
MAX_EXAMPLES = 5000       # NOTE(review): not referenced by any visible cell

In [12]:
!wget https://github.com/aldofdp07/Sentiment-Analysis-on-Movie-Review/blob/main/sentiment-analysis-on-movie-reviews.zip

--2022-11-06 11:53:14--  https://github.com/aldofdp07/Sentiment-Analysis-on-Movie-Review/blob/main/Dataset/sampleSubmission.csv
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘sampleSubmission.csv’

sampleSubmission.cs     [ <=>                ] 149.03K  --.-KB/s    in 0.1s    

2022-11-06 11:53:15 (1.50 MB/s) - ‘sampleSubmission.csv’ saved [152602]

--2022-11-06 11:53:15--  https://github.com/aldofdp07/Sentiment-Analysis-on-Movie-Review/blob/main/Dataset/test.tsv
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘test.tsv’

test.tsv                [ <=>                ] 143.80K  --.-KB/s    in 0.07s   

2022-11-06 11:53:15 (1.94 MB/s) - ‘test.tsv’ saved [147250]

--20

In [19]:
!unzip /content/sentiment-analysis-on-movie-reviews.zip

Archive:  /content/sentiment-analysis-on-movie-reviews.zip
replace sampleSubmission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sampleSubmission.csv    
  inflating: test.tsv.zip            
  inflating: train.tsv.zip           


In [77]:
# Read the tab-separated train and test tables directly from their zip files.
train = pd.read_csv(
    '/content/train.tsv.zip',
    sep='\t',
)
test = pd.read_csv(
    '/content/test.tsv.zip',
    sep='\t',
)

# Preview the first rows of the training table.
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [79]:
def train_val_split(sentences, labels, training_split):
    """Split sentences and labels into training and validation subsets.

    Args:
        sentences: collection of input texts.
        labels: collection of labels aligned with `sentences`.
        training_split: forwarded to `train_test_split` as `train_size`.

    Returns:
        Tuple `(train_data, validation_data, train_labels, validation_labels)`.
    """
    # A fixed random_state keeps the split identical from run to run.
    train_data, validation_data, train_labels, validation_labels = train_test_split(
        sentences, labels, train_size=training_split, random_state=24
    )
    return train_data, validation_data, train_labels, validation_labels

In [80]:
# Split the labelled phrases and report the size of every partition.
sentences = train["Phrase"]
labels = train["Sentiment"]
train_sentences, val_sentences, train_labels, val_labels = train_val_split(
    sentences,
    labels,
    TRAINING_SPLIT,
)

# Training partition
print(f"There are {len(train_sentences)} sentences for training.\n")
print(f"There are {len(train_labels)} labels for training.\n")
# Validation partition
print(f"There are {len(val_sentences)} sentences for validation.\n")
print(f"There are {len(val_labels)} labels for validation.")

There are 140454 sentences for training.

There are 140454 labels for training.

There are 15606 sentences for validation.

There are 15606 labels for validation.


In [81]:
def fit_tokenizer(train_sentences, oov_token):
    """Create a Keras `Tokenizer` and fit it on the given sentences.

    Args:
        train_sentences: collection of texts used to build the vocabulary.
        oov_token: value passed to the tokenizer as its `oov_token`.

    Returns:
        The fitted `Tokenizer` instance.
    """
    # Raw string: the previous literal relied on the invalid escape "\]",
    # which Python reports with a warning. The characters are unchanged.
    punctuation_filter = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'

    # No num_words argument is passed; text is lower-cased before tokenizing.
    tokenizer = Tokenizer(oov_token=oov_token, filters=punctuation_filter, lower=True)
    tokenizer.fit_on_texts(train_sentences)
    return tokenizer

In [82]:
# Fit the tokenizer on the training split and inspect the resulting vocabulary.
tokenizer = fit_tokenizer(train_sentences, OOV_TOKEN)

word_index = tokenizer.word_index
vocab_size = len(word_index)

print(f"Vocabulary contains {vocab_size} words\n")
if "<OOV>" in word_index:
    print("<OOV> token included in vocabulary")
else:
    print("<OOV> token NOT included in vocabulary")

Vocabulary contains 15284 words

<OOV> token included in vocabulary


In [83]:
# Show only the first entries: printing the full word_index would flood the
# notebook output with the entire vocabulary.
print(dict(list(word_index.items())[:20]))



In [84]:
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    """Convert texts to integer sequences and bring them to a common length.

    Args:
        sentences: collection of texts to encode.
        tokenizer: object providing `texts_to_sequences`.
        padding: forwarded to `pad_sequences` as `padding`.
        truncating: forwarded to `pad_sequences` as `truncating`.
        maxlen: forwarded to `pad_sequences` as `maxlen`.

    Returns:
        The value returned by `pad_sequences`.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=maxlen,
        padding=padding,
        truncating=truncating,
    )

In [85]:
# Encode both splits with the same tokenizer and padding settings.
train_pad_trunc_seq = seq_pad_and_trunc(
    train_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN
)
val_pad_trunc_seq = seq_pad_and_trunc(
    val_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN
)

# Report the resulting array shapes.
print(f"Padded and truncated training sequences have shape: {train_pad_trunc_seq.shape}\n")
print(f"Padded and truncated validation sequences have shape: {val_pad_trunc_seq.shape}")

Padded and truncated training sequences have shape: (140454, 16)

Padded and truncated validation sequences have shape: (15606, 16)


In [86]:
# Convert both label collections to NumPy arrays.
train_labels, val_labels = np.array(train_labels), np.array(val_labels)

In [87]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the validation labels. Re-fitting on the validation
# labels could produce a different mapping if the two splits did not contain
# the same set of classes.
le = LabelEncoder()
train_labels = le.fit_transform(train_labels)
val_labels = le.transform(val_labels)

In [88]:
# GRADED FUNCTION: create_model
def create_model(vocab_size, embedding_dim, maxlen, num_classes=5):
    """Build and compile a bidirectional-LSTM text classifier.

    Args:
        vocab_size: number of entries in the tokenizer's word index.
        embedding_dim: size of each embedding vector.
        maxlen: length of every padded input sequence.
        num_classes: number of output classes (units of the softmax layer).
            Defaults to 5, the value previously hard-coded.

    Returns:
        A compiled `tf.keras.Sequential` model.
    """
    model = tf.keras.Sequential([
        # The embedding is learned from scratch (no pre-trained weights are
        # loaded). The input dimension is vocab_size + 1 so that the largest
        # word index is still a valid row of the embedding table.
        tf.keras.layers.Embedding(vocab_size + 1, embedding_dim, input_length=maxlen, trainable=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])

    # Sparse loss: labels are supplied as integer class ids, not one-hot rows.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [89]:
# Build a fresh, untrained model.
model = create_model(vocab_size, EMBEDDING_DIM, MAXLEN)

# Fit on the training split and keep the per-epoch history; the validation
# split is evaluated alongside.
history = model.fit(
    train_pad_trunc_seq,
    train_labels,
    epochs=15,
    validation_data=(val_pad_trunc_seq, val_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [90]:
sent = test["Phrase"]

# Reuse the tokenizer that was fitted on the training sentences. Re-fitting a
# new tokenizer on the test phrases would assign different integer ids to the
# words than the ones the model's embedding layer was trained with, so the
# predictions would no longer correspond to the input text.
test_pad_trunc_seq = seq_pad_and_trunc(sent, tokenizer, PADDING, TRUNCATING, MAXLEN)


In [91]:
# Run the model on the test sequences, then take the index of the largest
# value along the last axis of its output as the predicted class.
class_probabilities = model.predict(test_pad_trunc_seq)
x = np.argmax(class_probabilities, axis=-1)



In [94]:
# Store the predicted class ids in a new "Sentiment" column of the test frame.
test['Sentiment'] = x

In [95]:
# Remove the Phrase and SentenceId columns from the test frame.
test = test.drop(columns=['Phrase', 'SentenceId'])

In [96]:
# Preview the first five rows of the frame that will be written out.
test.head(n=5)

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [97]:
# index=False keeps the DataFrame's row index out of the file, so the CSV
# contains only the frame's own columns.
test.to_csv('submission_Aldofdp.csv', index=False)