<a href="https://colab.research.google.com/github/ameernayman/Sensitive_image_classification/blob/main/Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification to detect sensitive data exposure

## Importing libraries and downloading the dataset

In [9]:
import tensorflow as tf
import json
import os
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:



text_folder = "/content/Sensitive_image_classification"
if os.path.exists(text_folder) == False:
  !git clone https://github.com/ameernayman/Sensitive_image_classification.git

dataset_text = text_folder + "/dataset_text/"
sensitive_json = "data_sensitive.json"
nonsensitive_json = "data_nonsensitive.json"

In [11]:

# --- Tokenizer / vocabulary settings ---
size_vocab = 3000        # passed to Tokenizer(num_words=...) and Embedding(input_dim)
out_of_vocb = "<OOV>"    # passed to Tokenizer(oov_token=...)

# --- Sequence shaping (pad_sequences) ---
length_text = 60         # maxlen of every padded sequence
padding = 'post'         # pad at the end of the sequence
type_trunc = 'post'      # truncate at the end of the sequence

# --- Model / split settings ---
dimensions = 32          # embedding vector size
size_training = 25000    # number of shuffled samples used for training; the rest validate

## Preprocessing the dataset

In [12]:
# textData collects [sentence, label] pairs appended by load_data();
# textSentences / textLabels are filled from it after shuffling.
textData, textSentences, textLabels = [], [], []

# Lower-case stopwords stripped from every sentence by load_data().
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its",
    "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
    "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then",
    "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've",
    "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why",
    "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
    "yourselves",
]

def load_data(filename):
  """Read one JSON dataset file and append its records to ``textData``.

  The file is looked up under ``dataset_text`` and must contain a list of
  objects with a ``'data'`` (text) and an ``'is_sensitive'`` (label) entry.
  Each text has the stopwords removed before ``[text, label]`` is appended
  to the module-level ``textData`` list.

  Args:
    filename: Name of the JSON file inside ``dataset_text``.

  Returns:
    None. The result is accumulated in ``textData``.
  """
  # JSON is UTF-8 by specification; be explicit so non-ASCII text in the
  # dataset does not depend on the platform's default encoding.
  with open(dataset_text + filename, 'r', encoding='utf-8') as f:
    data_store = json.load(f)
  for value in data_store:
    # Local names deliberately differ from the module-level textSentences /
    # textLabels lists to avoid confusing shadowing.
    sentence = value['data']
    label = value['is_sensitive']
    for wrd in stopwords:
      # Only occurrences surrounded by single spaces are removed, and the
      # match is case-sensitive (the stopword list is lower-case).
      token = " " + wrd + " "
      sentence = sentence.replace(token, " ")
    textData.append([sentence, label])


load_data(sensitive_json)
load_data(nonsensitive_json)

# Seed the shuffle so the train/validation split (and therefore the saved word
# index and the trained model) is the same on every Restart & Run All.
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)
random.shuffle(textData)

# Split the shuffled [sentence, label] pairs into two parallel lists.
textSentences = [item[0] for item in textData]
textLabels = [item[1] for item in textData]

# Fail with a clear message instead of an IndexError further down when the
# dataset is too small to leave anything for validation.
if len(textSentences) <= size_training:
  raise ValueError(
      "Dataset has %d samples; it must have more than size_training=%d "
      "to leave a non-empty validation set." % (len(textSentences), size_training))

# First size_training samples train the model, the remainder validate it.
training_snt_text = textSentences[:size_training]
validation_snt_text = textSentences[size_training:]
training_lbls_text = textLabels[:size_training]
validation_lbls_text = textLabels[size_training:]

print("Size of Training Data set is: ", len(training_snt_text))
print("Training Data Sample:", training_snt_text[0])
print("Size of Validation Data set is: ", len(validation_snt_text))
print("Validation Sample:", validation_snt_text[0])

Size of Training Data set is:  25000
Training Data Sample: A convicted car thief diehard Chicago Cubs fan, Jimmy Dworski (Belushi) wins tickets World Series. Unfortunately, still couple days left serve prison warden (Héctor Elizondo) will not let leave come back. With help inmates, Jimmy stages riot can sneak prison see game. On way, finds Filofax uptight spineless advertising executive Spencer Barnes (Grodin), promises reward found.
Over next day, Jimmy takes Barnes' identity—staying Malibu beach house Spencer's boss, flirting boss's daughter, even taking meeting powerful Japanese food company magnate named Sakamoto (Mako Iwamatsu). The fake "Spencer"'s unorthodox methods, beating magnate tennis telling poor quality food products, gets attention taken aback Sakamoto. However unconventional negotiations food company insult executives, seemingly ruining Spencer's reputation. Meanwhile, lacking precious Filofax, real Spencer Barnes spiraling gutter. Losing clothes, car money, rely old hi

In [19]:

# Fit the tokenizer on the training sentences only.
tknizer = Tokenizer(oov_token=out_of_vocb, num_words=size_vocab)
tknizer.fit_on_texts(training_snt_text)

# word_index holds every word seen during fitting, not just the size_vocab
# most frequent ones (see the printed size below).
index_words = tknizer.word_index
print("Size of word index:", len(index_words))

# Persist the word index as JSON next to the notebook.
with open("word_index.json", "w") as outfile:
    outfile.write(json.dumps(index_words))
    print("Saving the word index as JSON")

# Encode the training sentences as integer sequences, then pad/truncate each
# one to exactly length_text entries.
training_squnce = tknizer.texts_to_sequences(training_snt_text)
padding_Tdataset = pad_sequences(
    training_squnce,
    maxlen=length_text,
    truncating=type_trunc,
    padding=padding,
)

# Same encoding for the validation sentences, reusing the fitted tokenizer.
validation_squnce = tknizer.texts_to_sequences(validation_snt_text)
padding_Vdataset = pad_sequences(
    validation_squnce,
    maxlen=length_text,
    truncating=type_trunc,
    padding=padding,
)

Size of word index: 116055
Saving the word index as JSON


In [16]:
# Convert to NumPy arrays, so as to get it to work with TensorFlow 2.x
import numpy as np
# The tokenizer cell stores the padded sequences as padding_Tdataset /
# padding_Vdataset; read from those names (training_padded / validation_padded
# do not exist yet on a fresh kernel).
training_padded = np.array(padding_Tdataset)
training_labels = np.array(training_lbls_text)
validation_padded = np.array(padding_Vdataset)
validation_labels = np.array(validation_lbls_text)

## Model

In [17]:
# Callback to cancel training after reaching a desired accuracy
# This is done to avoid overfitting
DESIRED_ACCURACY = 0.999
class myCallback(tf.keras.callbacks.Callback):
  """Stops training once the epoch's 'accuracy' exceeds DESIRED_ACCURACY."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the accuracy reported for the finished epoch.

    Args:
      epoch: Index of the epoch that just ended (unused).
      logs: Metric dict supplied by Keras; may be None or lack 'accuracy'.
    """
    # Avoid a mutable default argument and a None > float TypeError when the
    # 'accuracy' metric is not present in the logs.
    accuracy = (logs or {}).get('accuracy')
    if accuracy is not None and accuracy > DESIRED_ACCURACY:
      print("Reached 99.9% accuracy so cancelling training!")
      self.model.stop_training = True

callbacks = myCallback()

# Build the network one layer at a time; Sequential runs them in the order added.
model = tf.keras.Sequential()
# Embedding: maps each integer word id (< size_vocab) to a dense vector of
# `dimensions` values, for sequences of length_text ids.
model.add(tf.keras.layers.Embedding(size_vocab, dimensions, input_length=length_text))
# 1D convolution: 64 filters, window of 5 positions, ReLU activation.
model.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
# Global average pooling collapses the sequence axis, leaving one value per filter.
model.add(tf.keras.layers.GlobalAveragePooling1D())
# Fully connected hidden layer with 24 units and ReLU activation.
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Single sigmoid unit: the output is a score between 0 and 1.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Configure training: binary cross-entropy loss, the Adam optimizer (used in
# place of classical stochastic gradient descent), accuracy as reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Display the summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 32)            96000     
                                                                 
 conv1d (Conv1D)             (None, 56, 64)            10304     
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 24)                1560      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 107,889
Trainable params: 107,889
Non-trainable params: 0
__________________________________________________

## Training

In [18]:
num_epochs = 15

# Train for a fixed number of epochs; the validation split is evaluated after
# every epoch. The early-stopping callback defined earlier is not passed here.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(validation_padded, validation_labels),
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Plotting Accuracy and Loss Functions

In [None]:
import matplotlib.pyplot as plt

# Plot the accuracy and loss functions
def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart per epoch.

  Args:
    history: Object whose ``history`` dict holds the per-epoch series
      under ``string`` and ``'val_' + string``.
    string: Name of the metric to plot (also used as the y-axis label).
  """
  val_key = 'val_' + string
  # Training curve first, validation curve second, matching the legend order.
  for key in (string, val_key):
    plt.plot(history.history[key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()
  
# One figure per tracked metric: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
  plot_graphs(history, metric_name)

## Confusion Matrix

In [None]:
import seaborn
print('Confusion Matrix')
y_predicted = model.predict(validation_padded)
# Threshold the sigmoid scores at 0.5 to get a predicted class per sample.
y_predicted_labels = y_predicted > 0.5

# Flatten the (n, 1) prediction array to a 1-D vector of length n.
size = np.size(y_predicted_labels)
y_predicted_labels = y_predicted_labels.reshape(size, )

# Report on the first quarter, half, three quarters and all of the validation set.
for i in range (1, 5):
  total = i * size // 4
  # num_classes=2 keeps the matrix 2x2 even if a slice happens to contain a
  # single class; otherwise the [1, 1] lookup below would raise IndexError.
  cm = tf.math.confusion_matrix(labels=validation_labels[0:total],
                                predictions=y_predicted_labels[0:total],
                                num_classes=2)

  # Calculate accuracy: correct predictions sit on the diagonal.
  cm_np = cm.numpy()
  conf_acc = (cm_np[0, 0] + cm_np[1, 1])/ np.sum(cm_np) * 100
  print("Accuracy for", str(total), "Test Data = ", conf_acc)

  # Plot the confusion matrix
  plt.figure(figsize = (10,7))
  seaborn.heatmap(cm_np, annot=True, fmt='d')
  plt.title("Confusion Matrix for " + str(total) + " Test Data")
  plt.xlabel('Predicted')
  plt.ylabel('Expected')

## Saving the model

In [None]:
# Save and convert the model (Used for deploying in web application)
model.save('model/text_model.h5')
print("Saved the model successfully")

!apt-get -qq install virtualenv
!virtualenv -p python3 venv
!source venv/bin/activate
!pip install -q tensorflowjs
!tensorflowjs_converter --input_format=keras /content/model/text_model.h5 /content/text_model
print("Model converted to JSON successfully")

## Sample Example

In [None]:
# Sample examples
sentence = ["My credit card no is 124345346", "game of thrones season finale showing this sunday night"]
# Use the names the notebook actually defines: the fitted tokenizer is `tknizer`
# and the padding mode is `padding` (`tokenizer` / `padding_type` do not exist).
sequences = tknizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=length_text, padding=padding, truncating=type_trunc)
predictions = model.predict(padded)
print("OUPUT for text model")
for text, prediction in zip(sentence, predictions):
  # Each prediction row holds the single sigmoid score for that sentence.
  score = prediction[0]
  print(score)
  if score > 0.5:
    print("Sensitive - "+ text)
  else:
    print("Non-Sensitive - " + text )