In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization, LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
df = pd.read_csv('final.csv')

In [4]:
X = df['comment_text'].astype(str) 
y = df[df.columns[1:]].values

In [5]:
MAX_FEATURES = 20000
SEQUENCE_LENGTH = 300
BATCH_SIZE = 120
EPOCHS = 5

In [None]:
# vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
#                                output_sequence_length=SEQUENCE_LENGTH,
#                                output_mode='int')
# vectorizer.adapt(X.values)
# vectorized_text = vectorizer(X.values)


In [6]:

word_counts = {}
for sentence in X.values:
    words = sentence.lower().split()
    for word in words:
        if word not in word_counts:
            word_counts[word] = 1
        else:
            word_counts[word] += 1


In [7]:
sorted_words = sorted(word_counts, key=lambda x: word_counts[x], reverse=True)
sorted_words = sorted_words[:MAX_FEATURES]

In [8]:
word_to_index = {word: index + 1 for index, word in enumerate(sorted_words)}

In [9]:
sequences = []
for sentence in X.values:
    words = sentence.lower().split()
    sequence = []
    for word in words:
        if word in word_to_index:
            sequence.append(word_to_index[word])
    sequences.append(sequence)

In [10]:
padded_sequences = []
for sequence in sequences:
    if len(sequence) > SEQUENCE_LENGTH:
        sequence = sequence[:SEQUENCE_LENGTH]
    else:
        sequence = sequence + [0] * (SEQUENCE_LENGTH - len(sequence))
    padded_sequences.append(sequence)

In [11]:
vectorized_text = np.array(padded_sequences)

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(15000)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [13]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [14]:
model = tf.keras.Sequential([
    Embedding(MAX_FEATURES + 1, 128),
    Bidirectional(LSTM(256, activation='tanh', return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(128, activation='tanh')),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(5, activation='sigmoid')
])

In [15]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [16]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
history = model.fit(train, epochs=EPOCHS, validation_data=val, callbacks=[early_stopping])

Epoch 1/5
 23/187 [==>...........................] - ETA: 1:54:04 - loss: nan - accuracy: 0.9275

KeyboardInterrupt: 

In [None]:
model.save('romanizedd.h5')

In [34]:
batch_X, batch_y = test.as_numpy_iterator().next()


In [35]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0,

In [36]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [None]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.9285714030265808, Recall:0.9437037110328674, Accuracy:0.02222222276031971


In [None]:
pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.32.0-py3-none-any.whl (19.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles (from gradio)
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Collecting aiohttp (from gradio)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi (from gradio)
  Downloading fastapi-0.95.2-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client>

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
def vectorizer(comment):
  

# Tokenize the comment and create a vocabulary
  word_counts = {}
  words = comment.lower().split()
  for word in words:
    if word not in word_counts:
        word_counts[word] = 1
    else:
        word_counts[word] += 1

# Sort the words by frequency and keep the most frequent ones
  sorted_words = sorted(word_counts, key=lambda x: word_counts[x], reverse=True)
  sorted_words = sorted_words[:MAX_FEATURES]

# Create a word-to-index mapping
  word_to_index = {word: index + 1 for index, word in enumerate(sorted_words)}

# Convert the comment to a sequence of integers
  sequence = []
  for word in words:
      if word in word_to_index:
          sequence.append(word_to_index[word])

  # Pad the sequence to a fixed length
  if len(sequence) > SEQUENCE_LENGTH:
      sequence = sequence[:SEQUENCE_LENGTH]
  else:
      sequence = sequence + [0] * (SEQUENCE_LENGTH - len(sequence))

  # Convert to a numpy array
  vectorized_comment = np.array([sequence])
  return vectorized_comment

In [None]:
def score_comment(comment):
    # vectorized_comment = vectorizer([comment])
    vectorized_comment = vectorizer(comment)
    
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[1:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [None]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')

  super().__init__(
  super().__init__(


In [None]:
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://2b8220e0907018d7b4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


