<a href="https://colab.research.google.com/github/azure531/Demo/blob/main/lstm_train_try_acc88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import re
import tqdm
import numpy as np
from tqdm import tqdm_notebook

from nltk.corpus import stopwords

from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import pandas as pd
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

In [37]:
MAX_NB_WORDS = 100000    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2   # data for validation (not used in training)
EMBEDDING_DIM = 100      # embedding dimensions for word vectors (word2vec/GloVe)
GLOVE_DIR = "glove/glove.6B."+str(EMBEDDING_DIM)+"d.txt"

In [38]:
train = pd.read_csv('train_dataset.csv' )  # Change ';' if needed

test = pd.read_csv('test_dataset.csv')  # Change ';' if needed

In [39]:
labels = ['offensive','non_offensive']
y = train[labels].values
comments_train = train['tweet']
comments_test = test['tweet']

In [40]:
comments_train = list(comments_train)

In [41]:
def clean_text(text, remove_stopwords = True):
    output = ""
    text = str(text).replace("\n", "")
    text = re.sub(r'[^\w\s]','',text).lower()
    if remove_stopwords:
        text = text.split(" ")
        for word in text:
            if word not in stopwords.words("english"):
                output = output + " " + word
    else:
        output = text
    return str(output.strip())[1:-3].replace("  ", " ")

In [16]:
texts = []
import nltk
nltk.download('stopwords')
for line in tqdm.notebook.tqdm(comments_train, total=27942):
    texts.append(clean_text(line))
print("hello")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/27942 [00:00<?, ?it/s]

hello


In [42]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

In [43]:
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 30463


In [44]:
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 30463


In [45]:
data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (27942, 200)
Shape of label tensor: (27942, 2)


In [46]:
import numpy as np
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]

In [47]:
num_validation_samples = int(VALIDATION_SPLIT*data.shape[0])
x_train = data[: -num_validation_samples]
y_train = labels[: -num_validation_samples]
x_val = data[-num_validation_samples: ]
y_val = labels[-num_validation_samples: ]

In [48]:
import numpy as np

embeddings_index = {}
with open('glove.txt', 'r', encoding='utf-8') as f:
    print('Loading GloVe from: glove.txt...', end='')
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n Proceeding with Embedding Matrix...", end="")
f.close()

# Determine the correct dimension size
#first_word_embedding = next(iter(embeddings_index.values()))
#EMBEDDING_DIM = len(first_word_embedding)

# Example word index
#word_index = {'example': 1, 'word': 2}

'''embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and len(embedding_vector) == EMBEDDING_DIM:
        embedding_matrix[i] = embedding_vector

print(" Completed!")'''
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(" Completed!")


Loading GloVe from: glove.txt...Done.
 Proceeding with Embedding Matrix... Completed!


In [49]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding((len(word_index) + 1),
                           EMBEDDING_DIM,
                           weights=[embedding_matrix],
                           input_length = MAX_SEQUENCE_LENGTH,
                           trainable=False,
                           name='embeddings')
embedded_sequences = embedding_layer(sequence_input)




In [50]:
x = LSTM(60, return_sequences=True,name='lstm_layer')(embedded_sequences)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
preds = Dense(2, activation="sigmoid")(x)

In [51]:
model = Model(sequence_input, preds)
model.compile(loss = 'binary_crossentropy',
             optimizer='adam',
             metrics = ['accuracy'])
model.summary()

In [52]:
print(x_train.dtype)
print(y_train.dtype)

int32
object


In [53]:
import numpy as np

# Convert y_train and y_val to 1D arrays
y_train = np.argmax(y_train, axis=1)
y_val = np.argmax(y_val, axis=1)

# Now, you can apply Label Encoding as before:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_val = encoder.transform(y_val)

In [54]:
from tensorflow.keras.utils import to_categorical

# Assuming your labels are in a column called 'label'
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

In [57]:
print('Training progress:')
history = model.fit(x_train, y_train, epochs=4, batch_size=32, validation_data=(x_val, y_val))

Training progress:
Epoch 1/4
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 125ms/step - accuracy: 0.8875 - loss: 0.2643 - val_accuracy: 0.8788 - val_loss: 0.2653
Epoch 2/4
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 125ms/step - accuracy: 0.9003 - loss: 0.2326 - val_accuracy: 0.8806 - val_loss: 0.2648
Epoch 3/4
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 126ms/step - accuracy: 0.9058 - loss: 0.2166 - val_accuracy: 0.8842 - val_loss: 0.2621
Epoch 4/4
[1m699/699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 145ms/step - accuracy: 0.9149 - loss: 0.1973 - val_accuracy: 0.8844 - val_loss: 0.2717


In [58]:
model.save('my_model.keras')

In [72]:
import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('my_model.keras')

# Load the tokenizer (if you saved it separately)
# tokenizer = ...  # (load your tokenizer if needed)

# Assume tokenizer is already defined in your previous code

# Function to clean text (same as before)
'''def clean_text(text, remove_stopwords=True):
    # ... (your existing clean_text function) ...'''

# Function to predict toxicity (modified to use the loaded model)
def predict_toxicity(text, model, tokenizer):
    cleaned_text = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, padding='post', maxlen=MAX_SEQUENCE_LENGTH)  # Assuming MAX_SEQUENCE_LENGTH is defined
    prediction = model.predict(padded_sequence)[0]
    return prediction

# Get user input
user_input = input("Enter the text you want to analyze: ")

# Predict toxicity labels
prediction = predict_toxicity(user_input, model, tokenizer)

# Print the results with labels
labels = ['offensive','non_offensive']
for i, label in enumerate(labels):
    print(f"{label}: {prediction[i]:.4f}")

Enter the text you want to analyze: black people dieeeeeeee!1111
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step
offensive: 0.4275
non_offensive: 0.5769


In [73]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ... (your existing code for training and validation) ...

# Get predictions on the validation set
y_pred = model.predict(x_val)

# Convert predictions to binary (0 or 1) based on a threshold (e.g., 0.5)
y_pred_binary = (y_pred > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred_binary)
precision = precision_score(y_val, y_pred_binary, average='weighted')  # or 'micro', 'macro', None
recall = recall_score(y_val, y_pred_binary, average='weighted')  # or 'micro', 'macro', None
f1 = f1_score(y_val, y_pred_binary, average='weighted')  # or 'micro', 'macro', None

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step
Accuracy: 0.8819
Precision: 0.8892
Recall: 0.8831
F1-score: 0.8838
