In [None]:
# Colab-only: mount Google Drive at /gdrive; the project folder used by the
# cells below lives under that mount point.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import os
import json
import glob
import re
import nltk
import sklearn
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda, Layer
from tensorflow.keras.models import Model, load_model, model_from_json
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn import datasets
from pathlib import Path

# Fetch the NLTK resources up front. The 'stopwords' corpus is read by
# filter_query further down.
# NOTE(review): 'punkt' is not referenced by any cell visible here - confirm it
# is still needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Work from the project folder on the mounted Drive so the relative dataset
# paths used below resolve.
os.chdir('/gdrive/My Drive/drivebuddy_text_classification')

# Raw post metadata; the next cell iterates it as a sequence of post dicts.
# The encoding is pinned to UTF-8 so decoding of the free-form caption text
# does not depend on the platform's default locale encoding.
with open('dataset/pavbhaji.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Lookup table: image file name -> caption text.
# The key is the last path segment of the post's display URL; posts whose
# caption edge list is empty are left out.
indexed_data = {
    post['display_url'].split('/')[-1]: post['edge_media_to_caption']['edges'][0]['node']['text']
    for post in data
    if post['edge_media_to_caption']['edges']
}

In [None]:
# File names (directory part stripped) of the images placed in each label folder.
file_names0_set = {path.rsplit('/', 1)[-1] for path in glob.glob('dataset_mod/images/0/*.jpg')}
file_names1_set = {path.rsplit('/', 1)[-1] for path in glob.glob('dataset_mod/images/1/*.jpg')}

In [None]:
# dataframe with columns (filename, text, label)
# - label is 1 for files found in images/1, else 0 (a name present in both
#   folders is labelled 1).
# - names are visited in sorted order so the row order does not depend on set
#   iteration order, which can differ from one interpreter run to the next.
# - images whose post has no caption are skipped: indexed_data only holds
#   captioned posts, so indexing it directly would raise KeyError for them.
# - columns= keeps the expected schema even if no row qualifies.
data_with_labels = pd.DataFrame(
    [
        {'name': name, 'text': indexed_data[name], 'label': 1 if name in file_names1_set else 0}
        for name in sorted(file_names0_set | file_names1_set)
        if name in indexed_data
    ],
    columns=['name', 'text', 'label'],
)

In [None]:
# Show the first rows to sanity-check file names, captions and labels.
data_with_labels.head()

Unnamed: 0,name,text,label
0,39392437_1855433301206753_3077607271769833472_...,Spring Rolls at @thefernsurya .\n.\n.\nChicken...,0
1,39399778_319114562170341_3160868167740293120_n...,#fremonttroll #saturdayfun #homemade #pavbhaji,0
2,39400565_2145822319073330_7701035855338536960_...,#sunday#rakshabandhan#family#gettogether#foodm...,1
3,39175550_324648548281295_6429935512378671104_n...,#pakora #vadapav #chaat #paneer #delhifood #ka...,0
4,39294744_1228134780656766_2536114897649926144_...,"Utterly delicious ""PavBhaji"". If you crave for...",1


In [None]:
# preprocessing function

def filter_query(query, stop_words=None):
    """Normalise a raw caption into a lowercase, space-separated token string.

    Steps, in order:
      1. lowercase the text;
      2. drop @mentions (the '@' and everything up to the next whitespace);
      3. collapse spellings such as "pav bhaji", "pav-bhaji" or "pav_bhaji"
         (up to three characters, other than a newline, between the two
         halves) into the single token "pavbhaji";
      4. turn every character that is not an ASCII letter, digit or whitespace
         into a space;
      5. split on whitespace and drop stop words;
      6. rewrite a stand-alone "nt" token as "not".

    Args:
        query: Caption text.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used; it is loaded once
            and cached instead of being re-read for every token.

    Returns:
        The cleaned text, tokens joined by single spaces; an empty string if
        no token survives.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = query.lower()
    text = re.sub(r'[@][^\s]+', '', text)  # remove @mentions
    text = re.sub(r'pav.{0,3}bhaji', ' pavbhaji ', text)  # unify pavbhaji spellings
    # One pass covers punctuation and any other symbol (emoji, non-ASCII
    # letters, ...). Raw string so that \s is a regex class, not a string escape.
    text = re.sub(r'[^A-Za-z0-9\s]+', ' ', text)

    # split() already collapses runs of whitespace, so no separate
    # whitespace-normalising substitution is needed.
    tokens = [word for word in text.split() if word not in stop_words]

    # Apostrophes and periods were stripped above, so contraction forms such as
    # n't / 'll cannot occur here; only a bare "nt" token is left to rewrite.
    return re.sub(r'\bnt\b', 'not', ' '.join(tokens))


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached

In [None]:
# Clean every caption, then assemble the modelling frame (name, text, label)
# with the cleaned text standing in for the raw caption.
processed_text = data_with_labels['text'].map(filter_query)
df = data_with_labels[['name']].assign(
    text=processed_text,
    label=data_with_labels['label'],
)

In [None]:
# Hold out 20% of the rows for validation. The split is seeded so reruns give
# the same partition, and stratified on the label so both parts keep the
# overall class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Number of distinct labels in the dataset.
category_counts = len(np.unique(df.label))

# Universal Sentence Encoder v4, loaded from TF Hub. The model cells below wrap
# it in a Lambda layer and declare its output as 512 values per example.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)







In [None]:
class LearningRateTracker(Callback):
    """Keras callback that prints the optimizer's learning rate after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the current learning rate of the model's optimizer.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict passed in by Keras (unused).
        """
        optimizer = self.model.optimizer
        # `learning_rate` is the canonical attribute; `lr` is a legacy alias
        # that newer Keras releases no longer provide, so it is only a fallback.
        current_lr = getattr(optimizer, 'learning_rate', None)
        if current_lr is None:
            current_lr = optimizer.lr
        print(" - lr: {}".format(K.eval(current_lr)))

In [None]:
# build a model with tf_hub universal sentence encoder followed by dense network

def UniversalEmbedding(x):
    """Embed a batch of strings with the loaded sentence-encoder module.

    Args:
        x: String tensor coming from the model input, which is declared with
            one string per example.

    Returns:
        The output of `embed` for the flattened 1-D batch of strings.
    """
    # Flatten to a 1-D batch instead of using a bare tf.squeeze: squeeze with
    # no axis would also drop the batch axis when the batch holds a single
    # example, handing `embed` a scalar instead of a batch.
    return embed(tf.reshape(tf.cast(x, tf.string), [-1]))

# One caption string per example.
input_text = Input(shape=(1,), dtype=tf.string)
# Sentence embedding via the hub module. Keras warns that the module's
# variables are not tracked by this Lambda layer.
embedding = Lambda(UniversalEmbedding, output_shape=(512,))(input_text)
# Dense head: 1024 -> 128 -> single sigmoid unit for the binary label.
dense1 = Dense(1024, activation='relu')(embedding)
dense2 = Dense(128, activation='relu')(dense1)
out = Dense(1, activation='sigmoid')(dense2)

model = Model(inputs=[input_text], outputs=out)

LEARNING_RATE = 0.001

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = Adam(learning_rate=LEARNING_RATE)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])









The following Variables were used a Lambda layer's call (lambda_11), but
are not present in its tracked objects:
  <tf.Variable 'Embeddings/sharded_0:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_1:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_2:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_3:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_4:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_5:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_6:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_7:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_8:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_9:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_10:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_11:0' shape=(26667, 320) dtype=float3

The following Variables were used a Lambda layer's call (lambda_11), but
are not present in its tracked objects:
  <tf.Variable 'Embeddings/sharded_0:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_1:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_2:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_3:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_4:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_5:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_6:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_7:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_8:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_9:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_10:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_11:0' shape=(26667, 320) dtype=float3

In [None]:
# train model

# Callbacks: halve the learning rate after LR_PATIENCE epochs without a
# val_loss improvement (floor 1e-8), stop after 50 epochs without improvement,
# and print the learning rate at the end of every epoch.
LR_PATIENCE = 10
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.5,
    patience=LR_PATIENCE,
    min_lr=1e-8,
    verbose=1,
)
es_callback = EarlyStopping(monitor='val_loss', mode='min', patience=50, verbose=1)
lr_tracker = LearningRateTracker()

# The held-out split doubles as the validation set monitored by the callbacks.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=1000,
    callbacks=[es_callback, lr_tracker, reduce_lr],
)

# achieves maximum accuracy of 74.7% on validation

Epoch 1/1000
 - lr: 0.0010000000474974513
Epoch 2/1000
 - lr: 0.0010000000474974513
Epoch 3/1000
 - lr: 0.0010000000474974513
Epoch 4/1000
 - lr: 0.0010000000474974513
Epoch 5/1000
 - lr: 0.0010000000474974513
Epoch 6/1000
 - lr: 0.0010000000474974513
Epoch 7/1000
 - lr: 0.0010000000474974513
Epoch 8/1000
 - lr: 0.0010000000474974513
Epoch 9/1000
 - lr: 0.0010000000474974513
Epoch 10/1000
 - lr: 0.0010000000474974513
Epoch 11/1000
 - lr: 0.0010000000474974513
Epoch 12/1000
 - lr: 0.0010000000474974513
Epoch 13/1000
 - lr: 0.0010000000474974513
Epoch 14/1000
 - lr: 0.0010000000474974513
Epoch 15/1000
 - lr: 0.0010000000474974513
Epoch 16/1000
 - lr: 0.0010000000474974513
Epoch 17/1000
 - lr: 0.0010000000474974513
Epoch 18/1000
 - lr: 0.0010000000474974513
Epoch 19/1000
 - lr: 0.0010000000474974513
Epoch 20/1000
 - lr: 0.0010000000474974513
Epoch 21/1000
 - lr: 0.0010000000474974513

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 22/1000
 - lr: 0.00

In [None]:
# variation with lesser units in dense network

def UniversalEmbedding(x):
    """Embed a batch of strings with the loaded sentence-encoder module.

    Args:
        x: String tensor coming from the model input, which is declared with
            one string per example.

    Returns:
        The output of `embed` for the flattened 1-D batch of strings.
    """
    # Flatten to a 1-D batch instead of using a bare tf.squeeze: squeeze with
    # no axis would also drop the batch axis when the batch holds a single
    # example, handing `embed` a scalar instead of a batch.
    return embed(tf.reshape(tf.cast(x, tf.string), [-1]))

# One caption string per example.
input_text = Input(shape=(1,), dtype=tf.string)
# Sentence embedding via the hub module. Keras warns that the module's
# variables are not tracked by this Lambda layer.
embedding = Lambda(UniversalEmbedding, output_shape=(512,))(input_text)
# Smaller dense head: 256 -> 128 -> single sigmoid unit for the binary label.
dense1 = Dense(256, activation='relu')(embedding)
dense2 = Dense(128, activation='relu')(dense1)
out = Dense(1, activation='sigmoid')(dense2)

model = Model(inputs=[input_text], outputs=out)

LEARNING_RATE = 0.001

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = Adam(learning_rate=LEARNING_RATE)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

The following Variables were used a Lambda layer's call (lambda_13), but
are not present in its tracked objects:
  <tf.Variable 'Embeddings/sharded_0:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_1:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_2:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_3:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_4:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_5:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_6:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_7:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_8:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_9:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_10:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_11:0' shape=(26667, 320) dtype=float3

The following Variables were used a Lambda layer's call (lambda_13), but
are not present in its tracked objects:
  <tf.Variable 'Embeddings/sharded_0:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_1:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_2:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_3:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_4:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_5:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_6:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_7:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_8:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_9:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_10:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_11:0' shape=(26667, 320) dtype=float3

In [None]:
# Callbacks: halve the learning rate after LR_PATIENCE epochs without a
# val_loss improvement (floor 1e-8), stop after 50 epochs without improvement,
# and print the learning rate at the end of every epoch.
LR_PATIENCE = 10
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.5,
    patience=LR_PATIENCE,
    min_lr=1e-8,
    verbose=1,
)
es_callback = EarlyStopping(monitor='val_loss', mode='min', patience=50, verbose=1)
lr_tracker = LearningRateTracker()

# The held-out split doubles as the validation set monitored by the callbacks.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=1000,
    callbacks=[es_callback, lr_tracker, reduce_lr],
)

# achieves maximum accuracy of 76.92% on validation set

Epoch 1/1000
 - lr: 0.0010000000474974513
Epoch 2/1000
 - lr: 0.0010000000474974513
Epoch 3/1000
 - lr: 0.0010000000474974513
Epoch 4/1000
 - lr: 0.0010000000474974513
Epoch 5/1000
 - lr: 0.0010000000474974513
Epoch 6/1000
 - lr: 0.0010000000474974513
Epoch 7/1000
 - lr: 0.0010000000474974513
Epoch 8/1000
 - lr: 0.0010000000474974513
Epoch 9/1000
 - lr: 0.0010000000474974513
Epoch 10/1000
 - lr: 0.0010000000474974513
Epoch 11/1000
 - lr: 0.0010000000474974513
Epoch 12/1000
 - lr: 0.0010000000474974513
Epoch 13/1000
 - lr: 0.0010000000474974513
Epoch 14/1000
 - lr: 0.0010000000474974513
Epoch 15/1000
 - lr: 0.0010000000474974513
Epoch 16/1000
 - lr: 0.0010000000474974513
Epoch 17/1000
 - lr: 0.0010000000474974513
Epoch 18/1000
 - lr: 0.0010000000474974513
Epoch 19/1000
 - lr: 0.0010000000474974513
Epoch 20/1000
 - lr: 0.0010000000474974513
Epoch 21/1000
 - lr: 0.0010000000474974513
Epoch 22/1000
 - lr: 0.0010000000474974513
Epoch 23/1000
 - lr: 0.0010000000474974513
Epoch 24/1000
 - lr:

In [None]:
# another variation
# One caption string per example. UniversalEmbedding comes from an earlier cell.
input_text = Input(shape=(1,), dtype=tf.string)
# Sentence embedding via the hub module. Keras warns that the module's
# variables are not tracked by this Lambda layer.
embedding = Lambda(UniversalEmbedding, output_shape=(512,))(input_text)
# Much smaller head (32 -> 16 -> 1) with L2 weight penalties on every layer.
dense1 = Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(embedding)
dense2 = Dense(16, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(dense1)
out = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4))(dense2)

model = Model(inputs=[input_text], outputs=out)

LEARNING_RATE = 0.001

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = Adam(learning_rate=LEARNING_RATE)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

The following Variables were used a Lambda layer's call (lambda_15), but
are not present in its tracked objects:
  <tf.Variable 'Embeddings/sharded_0:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_1:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_2:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_3:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_4:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_5:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_6:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_7:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_8:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_9:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_10:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_11:0' shape=(26667, 320) dtype=float3

The following Variables were used a Lambda layer's call (lambda_15), but
are not present in its tracked objects:
  <tf.Variable 'Embeddings/sharded_0:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_1:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_2:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_3:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_4:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_5:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_6:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_7:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_8:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_9:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_10:0' shape=(26667, 320) dtype=float32>
  <tf.Variable 'Embeddings/sharded_11:0' shape=(26667, 320) dtype=float3

In [None]:
# Callbacks: halve the learning rate after LR_PATIENCE epochs without a
# val_loss improvement (floor 1e-8), stop after 50 epochs without improvement,
# and print the learning rate at the end of every epoch.
LR_PATIENCE = 10
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.5,
    patience=LR_PATIENCE,
    min_lr=1e-8,
    verbose=1,
)
es_callback = EarlyStopping(monitor='val_loss', mode='min', patience=50, verbose=1)
lr_tracker = LearningRateTracker()

# The held-out split doubles as the validation set monitored by the callbacks.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=1000,
    callbacks=[es_callback, lr_tracker, reduce_lr],
)

# achieves accuracy of 70.33% on validation set

Epoch 1/1000
 - lr: 0.0010000000474974513
Epoch 2/1000
 - lr: 0.0010000000474974513
Epoch 3/1000
 - lr: 0.0010000000474974513
Epoch 4/1000
 - lr: 0.0010000000474974513
Epoch 5/1000
 - lr: 0.0010000000474974513
Epoch 6/1000
 - lr: 0.0010000000474974513
Epoch 7/1000
 - lr: 0.0010000000474974513
Epoch 8/1000
 - lr: 0.0010000000474974513
Epoch 9/1000
 - lr: 0.0010000000474974513
Epoch 10/1000
 - lr: 0.0010000000474974513
Epoch 11/1000
 - lr: 0.0010000000474974513
Epoch 12/1000
 - lr: 0.0010000000474974513
Epoch 13/1000
 - lr: 0.0010000000474974513
Epoch 14/1000
 - lr: 0.0010000000474974513
Epoch 15/1000
 - lr: 0.0010000000474974513
Epoch 16/1000
 - lr: 0.0010000000474974513
Epoch 17/1000
 - lr: 0.0010000000474974513
Epoch 18/1000
 - lr: 0.0010000000474974513
Epoch 19/1000
 - lr: 0.0010000000474974513
Epoch 20/1000
 - lr: 0.0010000000474974513
Epoch 21/1000
 - lr: 0.0010000000474974513
Epoch 22/1000
 - lr: 0.0010000000474974513
Epoch 23/1000
 - lr: 0.0010000000474974513
Epoch 24/1000
 - lr:

In [None]:
# Alternative encoder: the NNLM English 128-dimensional text embedding from
# TF Hub, wrapped as a trainable Keras layer so its weights are updated
# together with the dense head (see the trainable-parameter count in the
# summary).
model_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
hub_layer = hub.KerasLayer(model_url, output_shape=[128], input_shape=[],
                           dtype=tf.string, trainable=True)

# Encoder followed by a 128 -> 64 -> 1 (sigmoid) dense head.
model_nnlm = tf.keras.Sequential()
model_nnlm.add(hub_layer)
model_nnlm.add(tf.keras.layers.Dense(128, activation='relu'))
model_nnlm.add(tf.keras.layers.Dense(64, activation='relu'))
model_nnlm.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_nnlm.summary()

LEARNING_RATE = 0.001

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = Adam(learning_rate=LEARNING_RATE)

model_nnlm.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['binary_accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 124,667,521
Trainable params: 124,667,521
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Callbacks: halve the learning rate after LR_PATIENCE epochs without a
# val_loss improvement (floor 1e-8), stop after 50 epochs without improvement,
# and print the learning rate at the end of every epoch.
LR_PATIENCE = 10
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=LR_PATIENCE, min_lr=1e-8, verbose=1, mode="min")
# restore_best_weights: when early stopping triggers, the model is 50 epochs
# past its best val_loss; roll back to the best epoch so the weights saved
# below are the best ones seen rather than the last ones.
es_callback = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50,
                            restore_best_weights=True)
lr_tracker = LearningRateTracker()

# The held-out split doubles as the validation set monitored by the callbacks.
history = model_nnlm.fit(X_train,
          y_train,
          validation_data=(X_test, y_test),
          epochs=1000,
          batch_size=256,
          callbacks=[es_callback, lr_tracker, reduce_lr])
# Persist the trained weights next to the notebook's working directory.
model_nnlm.save_weights('./model_cust_nnlm.h5')

# achieves accuracy of 71.43%

Epoch 1/1000
 - lr: 0.0010000000474974513
Epoch 2/1000
 - lr: 0.0010000000474974513
Epoch 3/1000
 - lr: 0.0010000000474974513
Epoch 4/1000
 - lr: 0.0010000000474974513
Epoch 5/1000
 - lr: 0.0010000000474974513
Epoch 6/1000
 - lr: 0.0010000000474974513
Epoch 7/1000
 - lr: 0.0010000000474974513
Epoch 8/1000
 - lr: 0.0010000000474974513
Epoch 9/1000
 - lr: 0.0010000000474974513
Epoch 10/1000
 - lr: 0.0010000000474974513
Epoch 11/1000
 - lr: 0.0010000000474974513
Epoch 12/1000
 - lr: 0.0010000000474974513
Epoch 13/1000
 - lr: 0.0010000000474974513
Epoch 14/1000
 - lr: 0.0010000000474974513
Epoch 15/1000
 - lr: 0.0010000000474974513
Epoch 16/1000
 - lr: 0.0010000000474974513
Epoch 17/1000
 - lr: 0.0010000000474974513
Epoch 18/1000
 - lr: 0.0010000000474974513
Epoch 19/1000
 - lr: 0.0010000000474974513
Epoch 20/1000
 - lr: 0.0010000000474974513
Epoch 21/1000
 - lr: 0.0010000000474974513
Epoch 22/1000
 - lr: 0.0010000000474974513

Epoch 00022: ReduceLROnPlateau reducing learning rate to 0.000