In [0]:
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [0]:
def load_data(path):
  '''Load headlines and their sarcasm labels from a JSON-lines file.

     Every non-blank line of the file is parsed as one JSON object; its
     'headline' value is collected as a sample and its 'is_sarcastic'
     value as the matching label.

     parameters:
     path-> path to the json file (one JSON object per line)

     returns: (data, labels), two lists of equal length in file order

     raises: OSError if the file cannot be opened,
             json.JSONDecodeError if a non-blank line is not valid JSON,
             KeyError if a record lacks 'headline' or 'is_sarcastic'
  '''
  data = []
  labels = []
  # The with-block guarantees the file is closed even if parsing fails.
  with open(path, 'r', encoding='utf-8') as fp:
    for line in fp:
      # Blank lines (e.g. a trailing newline at end of file) carry no record.
      if not line.strip():
        continue
      record = json.loads(line)
      data.append(record['headline'])
      labels.append(record['is_sarcastic'])
  return data,labels

In [36]:
# Read every headline and its sarcasm label from the dataset file,
# then report how many samples were found.
data, labels = load_data('Sarcasm_Headlines_Dataset.json')
print('Total no. of samples:', len(data))

Total no. of samples: 26709


In [0]:
#split into training and testing sets
def split_data(split_factor,data,labels):
  '''Split the data sequentially into a training part and a testing part.

     No shuffling is performed: the first int(len(data)*split_factor)
     items become the training set and the remaining items become the
     testing set. The same cut point is applied to data and labels.

     parameters:
     split_factor-> fraction of data to be kept as training data
     data-> list of news headlines
     labels-> corresponding labels

     returns: training_examples, training_labels,
              testing_examples, testing_labels
  '''
  # Index of the first testing sample (fraction truncated towards zero).
  cut = int(len(data) * split_factor)
  train_part = slice(0, cut)
  test_part = slice(cut, None)
  return data[train_part], labels[train_part], data[test_part], labels[test_part]

In [40]:
# Keep the first 90% of the samples for training and the rest for testing.
split_factor = 0.9
train_x, train_y, test_x, test_y = split_data(split_factor, data, labels)
print('no. of training examples=', len(train_x))
print('no. of testing examples=', len(test_x))

no. of training examples= 24038
no. of testing examples= 2671


In [0]:
#import tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
# --- tokenization / padding settings ---
vocab_size = 10000   # handed to Tokenizer as num_words
max_length = 150     # handed to pad_sequences as maxlen
pad = 'post'         # padding mode for pad_sequences
trunc = 'pre'        # truncating mode for pad_sequences

# Build the word -> index vocabulary from the training headlines only;
# "<unk>" is registered as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<unk>")
tokenizer.fit_on_texts(train_x)
word_to_indices = tokenizer.word_index

# Training split: headlines -> index sequences -> fixed-length rows
train_sequences = tokenizer.texts_to_sequences(train_x)
padded_train_sequences = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=pad,
    truncating=trunc,
)

# Testing split: encoded with the same fitted tokenizer and same settings
test_sequences = tokenizer.texts_to_sequences(test_x)
padded_test_sequences = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=pad,
    truncating=trunc,
)

In [57]:
# Show the shape of each padded split, training first and testing second.
for padded in (padded_train_sequences, padded_test_sequences):
  print(padded.shape)

(24038, 150)
(2671, 150)
