In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf
import spacy
import re
import string

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import optimizers, layers
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, Flatten

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
nlp = spacy.load('en_core_web_sm')

In [4]:
# Get the GloVe into our directory
path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [5]:
train = pd.read_csv('Datasets/twitter_training.csv', index_col=0, header=None, names=['entity', 'label', 'text'])
test = pd.read_csv('Datasets/twitter_validation.csv', index_col=0, header=None, names=['entity', 'label', 'text'])

In [6]:
def lowercase(data):
    return data['text'].str.lower()

def change_punctuation(data):
    return data['text'].str.replace('`', "'")

def remove_numbers(data):
    return data['text'].replace('[^a-zA-z.,!?/:;\"\'\\s]', '', regex=True)

def remove_special_characters(data):
    return data['text'].replace('[^a-zA-Z0-9 ]', '', regex=True)

def custom(data):
    return data['text'].replace('im', 'i am')

def lemmatize(data):
    lemmatized_array = []
    
    for text in data['text']:
        lemmatized_text = []
        doc = nlp(text)
        for token in doc:
            lemmatized_text.append(token.lemma_)
        lemmatized_array.append(' '.join(lemmatized_text))
    return lemmatized_array

def stop_words(data):
    stop_words_array = []
    for text in data['text']:
        doc = nlp(text)
        filtered_tokens = [token.text for token in doc if not token.is_stop]
        stop_words_array.append(' '.join(filtered_tokens))
    return stop_words_array

def delete_links(data):
    return data['text'].replace(r'http\S+', '', regex=True)

def preprocessing(data):
    df = data.copy()
    df['text'] = lowercase(df)
    df['text'] = custom(df)
    df['text'] = change_punctuation(df)
    df['text'] = lemmatize(df)
    df['text'] = remove_numbers(df)
    df['text'] = delete_links(df)
    df['text'] = remove_special_characters(df)
    return df

In [7]:
# As seen in dataset, the first entry itself contains many multiple words
train.drop_duplicates(subset=['text'], inplace=True)
train.reset_index(inplace=True)
train['text'] = train['text'].astype('str')
test['text'] = test['text'].astype('str')

In [8]:
len(train['text'])
len(test['text'])

1000

In [9]:
train = preprocessing(train)
test = preprocessing(test)

In [9]:
le = LabelEncoder()
train['label'] = le.fit_transform(train['label'])
test['label'] = le.transform(test['label'])

X = train['text']
y = train['label']

In [10]:
max_words = 10000
maxlen = 200
emb_dim = 50
training_samples = int(len(X) * 0.8)

text_dataset = tf.data.Dataset.from_tensor_slices(X)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
max_features = 20000
embedding_dim = 128

vectorize_layer = tf.keras.layers.TextVectorization(
        max_tokens=max_words, # Max number of word in the internal dictionnary. We keep the most frequent
        output_mode='int',
        output_sequence_length=maxlen  # Size max of text
        )

vectorize_layer.adapt(text_dataset.batch(64)) 

In [12]:
voc = vectorize_layer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [13]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 8305 words (1695 misses)


In [14]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    trainable=False,
)
embedding_layer.build((1,))
embedding_layer.set_weights([embedding_matrix])

In [19]:
model = keras.Sequential([
    layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    embedding_layer,
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax'),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

model = Sequential()
model.add(Flatten(input_shape=(height, width, channels)))  # Replace height, width, and channels with the correct values
model.add(Dense(num_classes, activation='softmax'))  # Replace num_classes with the correct value

NameError: name 'height' is not defined

In [16]:
cl = [tf.keras.callbacks.EarlyStopping(
                  monitor='val_accuracy',
                  restore_best_weights=True,
                  patience=10)] 

history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=50, batch_size=64, callbacks = cl)

Epoch 1/50


InvalidArgumentError: Graph execution error:

Detected at node sequential_1/flatten_1/Reshape defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 639, in run_forever

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 1985, in _run_once

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\asyncio\events.py", line 88, in _run

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\ipkernel.py", line 359, in execute_request

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\ipkernel.py", line 446, in do_execute

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\Chandanmal Bardia\AppData\Local\Temp\ipykernel_1616\2141241090.py", line 6, in <module>

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 118, in error_handler

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 323, in fit

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 117, in one_step_on_iterator

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 105, in one_step_on_data

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 56, in train_step

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 118, in error_handler

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\layers\layer.py", line 816, in __call__

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 118, in error_handler

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\ops\operation.py", line 42, in __call__

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 157, in error_handler

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\models\sequential.py", line 203, in call

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\models\functional.py", line 188, in call

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\ops\function.py", line 153, in _run_through_graph

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\models\functional.py", line 572, in call

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 118, in error_handler

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\layers\layer.py", line 816, in __call__

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 118, in error_handler

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\ops\operation.py", line 42, in __call__

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\utils\traceback_utils.py", line 157, in error_handler

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\layers\reshaping\flatten.py", line 54, in call

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\ops\numpy.py", line 4499, in reshape

  File "C:\Users\Chandanmal Bardia\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 1378, in reshape

Only one input size may be -1, not both 0 and 1
	 [[{{node sequential_1/flatten_1/Reshape}}]] [Op:__inference_one_step_on_iterator_48098]

In [None]:
history_frame = pd.DataFrame(history.history)
history_frame.loc[:, ['loss', 'val_loss']].plot()
history_frame.loc[:, ['accuracy', 'val_accuracy']].plot()

In [None]:
predictions = model.predict(test['text'])

In [None]:
predicted_labels = []

for predictions_array in predictions:
    predicted_labels.append(np.argmax(predictions_array))

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(predicted_labels, test['label'])

In [None]:
print(predicted_labels)

In [None]:
print(test['label'])