# PART A

In [76]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant 
from tensorflow.keras.optimizers import Adam

### Import and analyse the data set.

In [77]:
# Vocabulary cap; passed to imdb.load_data(num_words=...) in the next cell.
vocab_size = 10000

In [78]:
# Load the IMDB train/test splits, limiting word ids via num_words=vocab_size.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size
)

### Perform relevant sequence padding on the data.

In [79]:
# Bring every review in both splits to the same length (max_len) so each
# split becomes a rectangular array.
max_len = 300
train_data, test_data = (
    pad_sequences(split, maxlen=max_len) for split in (train_data, test_data)
)

### Print shape of features and labels

In [80]:
# Report the array shapes of the padded features and of the labels.
for caption, array in (
    ("Shape of features (train_data):", train_data),
    ("Shape of labels (train_labels):", train_labels),
):
    print(caption, array.shape)

Shape of features (train_data): (25000, 300)
Shape of labels (train_labels): (25000,)


### Print value of any one feature and its label

In [81]:
# Show the first padded review (as word ids) together with its label.
first_review, first_label = train_data[0], train_labels[0]
print("Example feature (encoded review):", first_review)
print("Label for the example feature:", first_label)

Example feature (encoded review): [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    1   14
   22   16   43  530  973 1622 1385   65  458 4468   66 3941    4  173
   36  256    5   25  100   43  838  112   50  670    2    9   35  480
  284    5  150    4  172  112  167    2  336  385   39    4  172 4536
 1111   17  546   38   13  447    4  192   50   16    6  147 2025   19
   14   22    4 1920 4613  469    4   22   71   87   12   16   43  530
   38   76   15   13 1247    4   22   17  515   17   12   16  626   18
    2    5   62  386   12    8  316    8  106    5    4 2223 5244   16
  480   66 3785   33    4  130   12   16   

### Decode the feature value to get original sentence

In [82]:
# Stack the IMDB train and test splits into one feature array and one label array.
# This must use the IMDB variables loaded above: X_train / X_test / y_train /
# y_test are only created by the train/test split in Part B, so referencing
# them here fails (or picks up the wrong dataset) depending on run order.
data = np.concatenate((train_data, test_data), axis=0)
label = np.concatenate((train_labels, test_labels), axis=0)

In [83]:
# Invert the word -> id mapping so that ids can be turned back into words.
word_index = imdb.get_word_index()
reverse_word_index = {idx: word for word, idx in word_index.items()}

# Ids in the loaded reviews are offset by 3 relative to word_index; any id
# without a matching word (such as the 0 used for padding) is shown as '#'.
decoded_review = ' '.join(
    reverse_word_index.get(token_id - 3, '#') for token_id in train_data[0]
)
print("Decoded Review:", decoded_review)

Decoded Review: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i thin

### Design, train, tune and test a sequential model.

In [84]:
# Baseline sentiment classifier: embedding -> LSTM -> single sigmoid output.
model = Sequential()
# input_dim is tied to vocab_size (the same cap given to imdb.load_data) rather
# than a repeated literal, so changing the vocabulary size in one place cannot
# leave the embedding table too small for the ids in the data.
model.add(Embedding(vocab_size, 32))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Binary target (0/1 labels), hence sigmoid + binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Hold out 20% of the training data for validation while fitting.
model.fit(train_data, train_labels, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2d487570990>

### Use the designed model to print the prediction on any one sample.

In [86]:
# Pick one padded review and score it with the trained model.
sample_idx = 20
sample_text = train_data[sample_idx]

# predict() works on batches, so add a leading batch axis of size 1.
prediction = model.predict(sample_text[np.newaxis, :])

# Decode with the same id offset (3) and '#' placeholder used for the review above.
decoded_sample = ' '.join(
    reverse_word_index.get(token_id - 3, '#') for token_id in sample_text
)
print("Sample Text:", decoded_sample)
print("Prediction (0: Negative, 1: Positive):", prediction[0, 0])

Sample Text: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # shown in australia as # this incredibly bad movie is so bad that you become # and have to watch it to the end just to see if it could get any worse and it does the storyline is so predictable it seems written by a high school # class the sets are pathetic but # better than the # and the acting is wooden br br the # # seems to have been stolen from the props # of # # there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money
Prediction (0: Negative, 1: Positive): 0.0005277

# PART B

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [17]:
# Load the sarcasm headlines dataset; lines=True reads the file as one JSON record per line.
df = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)

In [18]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [19]:
# Number of rows (headlines) in the dataset.
len(df)

28619

In [87]:
# Column dtypes and non-null counts.
# NOTE(review): the saved output of this cell lists `headline_length`, a column that
# is only created further down, and its execution count is higher than the cells
# below it. It was run out of order; re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   is_sarcastic     28619 non-null  int64 
 1   headline         28619 non-null  object
 2   headline_length  28619 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 670.9+ KB


In [88]:
# Distinct values present in the label column.
df['is_sarcastic'].unique()

array([1, 0], dtype=int64)

In [89]:
# Count missing values per column.
df.isna().sum()

is_sarcastic       0
headline           0
headline_length    0
dtype: int64

### Retain relevant columns 

In [20]:
# Keep only the label and the headline text. The explicit .copy() makes `df`
# an independent frame rather than a slice of the loaded one, so adding a
# column to it afterwards does not raise pandas' SettingWithCopyWarning.
df = df[['is_sarcastic', 'headline']].copy()

### Get length of each sentence

In [21]:
# Word count of each headline (tokens separated by whitespace).
df['headline_length'] = df['headline'].map(lambda headline: len(headline.split()))

### Define parameters

In [23]:
max_words = 10000  # passed to Tokenizer(num_words=...)
max_sequence_length = 30  # passed to pad_sequences(maxlen=...) and Embedding(input_length=...)
embedding_dim = 100 # width of the embedding matrix; the GloVe file loaded below is the 100d one

### Get indices for words

In [24]:
# Fit a word -> integer-id mapping on all headlines.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['headline'])
# NOTE(review): word_index is not limited to max_words entries; the vocabulary size
# derived from it below (30885) is larger than max_words.
word_index = tokenizer.word_index

### Create features and labels

In [25]:
# Features: headlines converted to id sequences and brought to max_sequence_length.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['headline']),
    maxlen=max_sequence_length,
)
# Labels: the sarcasm flag column.
y = df['is_sarcastic']

### Get vocabulary size

In [26]:
# Number of rows for the embedding table: one per word in word_index plus one
# extra row (presumably index 0, the padding id — confirm).
vocab_size = len(word_index) + 1

In [27]:
# Display the computed vocabulary size.
vocab_size

30885

### Create a weight matrix using GloVe embeddings

In [38]:
import os
import zipfile

# NOTE(review): these are machine-specific absolute paths; adjust them (or move
# them to a configurable data directory) before running elsewhere.
zip_path = r'C:\Users\Dell\OneDrive\glove.6B.zip'
extract_folder = r'C:\Users\Dell\OneDrive\glove.6B'

# Built from extract_folder so the folder is written down only once.
# Change the file name here to use a different GloVe dimensionality.
embedding_file_path = os.path.join(extract_folder, 'glove.6B.100d.txt')

# Unpack the archive only when the vectors are not on disk yet, so re-running
# the notebook does not re-extract the whole archive every time.
if not os.path.exists(embedding_file_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

In [39]:
# Parse the GloVe text file into {word: vector}. Each line is read as a word
# followed by its coefficients, separated by whitespace.
# The path comes from embedding_file_path (defined in the extraction cell)
# instead of a second hard-coded copy, so both cells always agree on the file.
embedding_index = {}
with open(embedding_file_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Words that have no GloVe vector keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Define and compile a Bidirectional LSTM model.

In [40]:
# Sarcasm classifier: frozen GloVe embedding -> two stacked bidirectional LSTMs
# -> single sigmoid output.
model = Sequential()
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    # Load the pretrained vectors through the documented initializer argument
    # instead of the legacy `weights=[...]` keyword.
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_sequence_length,
    # Keep the pretrained vectors fixed during training.
    trainable=False,
))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Fit the model and check the validation accuracy

In [41]:
# Hold out 20% of the headlines, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The held-out split doubles as the validation set monitored during training.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2d4e099f090>

### Evaluate the model

In [44]:
# Score the model on the held-out split (the same one passed as validation_data
# during fit). The first returned value is discarded; the second is the
# accuracy metric requested at compile time.
_, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation Accuracy: 85.99%
