## Install Packages

In [1]:
!pip install datasets gensim

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

## Import Libraries and Load a Pre-trained Word2Vec Model

In [2]:
from datasets import load_dataset
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
import spacy
import numpy as np
import string
from keras.layers import *
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from keras.models import Sequential
import tensorflow as tf
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import gensim.downloader as api

# Fetch the pre-trained "word2vec-google-news-300" vectors through gensim's downloader API.
# NOTE(review): presumably a large one-time download that gensim caches locally — confirm before re-running.
word2vec_model = api.load("word2vec-google-news-300")



## Load Dataset from the Hugging Face Hub

In [3]:
# Hub repository id of the tweet sentiment dataset used throughout the notebook.
dataset_name = "SetFit/tweet_sentiment_extraction"
# Load the dataset from the Hugging Face Hub; the result holds a train and a test split.
dataset = load_dataset(dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/94.0 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/503k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27481 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3534 [00:00<?, ? examples/s]

## Dataset Structure



In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['textID', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['textID', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})

## We will use only the following columns:
*  text: The content of the text.
*  label: The classification label.

In [5]:
# Raw tweet texts and integer sentiment labels of the training split.
x = dataset['train']['text']
y = dataset['train']['label']
# Same two columns of the dataset's own test split.
X_test = dataset['test']['text']
y_test = dataset['test']['label']

## Text Data (first four training tweets)

In [6]:
x[:4]

[' I`d have responded, if I were going',
 ' Sooo SAD I will miss you here in San Diego!!!',
 'my boss is bullying me...',
 ' what interview! leave me alone']

## Labels (first four training examples)

In [7]:
y[:4]

[1, 0, 0, 0]

## Download the NLTK Resources Needed for Preprocessing

In [8]:
# Stop-word corpus read by textPreprocessing when filtering tokens.
nltk.download('stopwords')
# WordNet corpus; the notebook lemmatizes with WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Preprocessing

In [9]:
# Tokens removed in addition to NLTK's English stop words.
_EXTRA_STOP_TOKENS = ('«', '»', '©', '@', '#', 'http', 'www', '/')
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Filled on the first call to _stop_words().
_STOP_WORDS = None


def _stop_words():
    """Return the stop-word set, building it once instead of on every call."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(nltk.corpus.stopwords.words("english")) | frozenset(_EXTRA_STOP_TOKENS)
    return _STOP_WORDS


def textPreprocessing(text):
    """Clean one raw tweet and return it as a space-joined string of lemmas.

    Steps, in order: lowercase, turn line breaks into spaces, strip
    @mentions and URLs, delete ASCII punctuation, split on whitespace,
    drop stop words, blank out tokens that contain a digit, lemmatize.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).

    Note:
        Relies on the notebook-level ``lemmatizer`` and on the NLTK
        'stopwords' corpus having been downloaded.
    """
    # Normalization
    text = text.lower()
    # Line breaks become spaces (not ''), so that the words on either side of
    # a break are not glued together and a URL that ends a line does not
    # swallow the first word of the next line.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'@\w+', '', text)      # @mentions
    text = re.sub(r'http\S+', '', text)   # URLs

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Tokenization + stop-word removal (set lookup instead of list scan).
    # Punctuation is stripped before this filter, so the filter sees
    # apostrophe-free tokens.
    stop_words = _stop_words()
    tokens = [word for word in text.split() if word not in stop_words]

    # Blank out every run of word characters containing a digit; this covers
    # both standalone numbers and words with embedded digits.
    tokens = [re.sub(r'\w*\d\w*', '', word) for word in tokens]

    # Drop tokens emptied by the previous step, then lemmatize the rest.
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word != '']

    return ' '.join(lemmas)

## Apply Preprocessing

In [10]:
# Clean every training tweet; the raw list `x` is left untouched.
Preprocessed_Text = [textPreprocessing(tweet) for tweet in x]
# Read from the raw dataset rather than from X_test itself, so re-running this
# cell never preprocesses already-cleaned text a second time.
X_test = [textPreprocessing(tweet) for tweet in dataset['test']['text']]

In [11]:
len(Preprocessed_Text)

27481

## Data Splitting


*   Train
*   Test (the dataset's own test split, loaded above)
*   Validation (held out from the training split in the next cell)


In [12]:
# Hold out 13% of the preprocessed training tweets as a validation set; stratify=y
# preserves the label proportions and random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(Preprocessed_Text,y, stratify=y, test_size=0.13,random_state=1)

In [13]:
# Sizes of the train, validation and test sets, one per line.
print(len(X_train), len(X_val), len(X_test), sep='\n')

23908
3573
3534


## Convert the Text to Integer Sequences and Pad Them to a Common Length

In [14]:
# Build the word -> integer index from the training texts only, so the
# validation and test sets do not influence the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode each text as a list of word indices.
# NOTE(review): no oov_token is configured, so words unseen during fitting are
# presumably dropped from the test/validation sequences — confirm.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
val_sequences = tokenizer.texts_to_sequences(X_val)

# The training sequences set the padded length (no maxlen is given); the test
# and validation sequences are then forced to that same length.
trainx = pad_sequences(train_sequences)
testx = pad_sequences(test_sequences,maxlen = trainx.shape[1])
valx = pad_sequences(val_sequences,maxlen = trainx.shape[1])

In [15]:
trainx.shape

(23908, 23)

In [16]:
# Number of rows for the embedding matrix: one per indexed word plus one extra.
# NOTE(review): the +1 presumably accounts for index 0, which the Tokenizer does
# not assign to any word and which is used as the padding value — confirm.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

22379

In [17]:
# Padded sequence length (number of columns of trainx); passed to the model builders as input length.
mx = trainx.shape[1]

In [18]:
embedding_dim = 300
word_index = tokenizer.word_index
# Row i holds the vector of the word whose tokenizer index is i; every row starts as zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    # Words missing from the Word2Vec vocabulary simply keep their all-zero row,
    # so a membership test replaces the former try/except KeyError.
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

## One-Hot Encode the Training, Test, and Validation Labels

In [19]:
# Turn the integer class ids into one-hot rows with 3 columns, as required by
# the categorical cross-entropy loss used for the neural models.
# NOTE(review): the cell overwrites its own inputs, so running it a second time
# presumably re-encodes already one-hot arrays — re-run from the split cell instead.
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)
y_val = to_categorical(y_val, num_classes=3)

## Set up EarlyStopping to stop training when validation loss doesn't improve

In [20]:
# Stop training once val_loss has not improved for 3 consecutive epochs and
# restore the weights of the best epoch.
# NOTE(review): this single callback instance is passed to every model's fit() call in the notebook.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

## In the following sections, I will employ various techniques, including:


*   1D Convolutional Neural Networks (CNN)

*   Simple Recurrent Neural Networks (RNN)
*   Gated Recurrent Units (GRU)


*   Long Short-Term Memory Networks (LSTM)
*   Random Forest Classifier



## 1D Convolutional Neural Networks (CNN)



In [21]:
def cnn11(text, vocab_size, mx):
    """Build a 1D-CNN sentiment classifier on top of frozen Word2Vec embeddings.

    Args:
        text: Unused; kept so existing calls keep working.
        vocab_size: Number of rows of the embedding layer.
        mx: Length of the padded input sequences.

    Returns:
        An uncompiled Keras ``Model`` that maps ``(mx,)`` index sequences to a
        3-way softmax.

    Note:
        Reads the notebook-level ``embedding_dim`` and ``embedding_matrix``.
    """
    inputs = Input((mx,))
    # Embedding weights come from embedding_matrix and are not trained.
    features = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(inputs)
    features = Dropout(0.5)(features)
    # Two stacked convolutions with a window of 7 tokens each.
    features = Conv1D(128, 7, activation='relu')(features)
    features = Conv1D(128, 7, activation='relu')(features)
    # Keep the strongest activation of every filter across all positions.
    features = GlobalMaxPooling1D()(features)
    features = Dense(512, activation='relu')(features)
    features = Dropout(0.5)(features)
    outputs = Dense(3, activation='softmax')(features)
    model = Model(inputs, outputs)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [22]:
# Build the CNN; the first argument is not used by cnn11 itself.
modl=cnn11(trainx,vocab_size,mx)
# One-hot targets with a softmax output, hence categorical cross-entropy.
modl.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for at most 20 epochs, validating on (valx, y_val) with the early_stopping callback.
modl.fit(trainx, y_train, validation_data=(valx, y_val), batch_size=64, epochs=20 , callbacks=[early_stopping])
# Score the trained model on the test split.
loss, accuracy = modl.evaluate(testx, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 23)]              0         
                                                                 
 embedding (Embedding)       (None, 23, 300)           6713700   
                                                                 
 dropout (Dropout)           (None, 23, 300)           0         
                                                                 
 conv1d (Conv1D)             (None, 17, 128)           268928    
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 128)           114816    
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                             

## Simple Recurrent Neural Networks (RNN)

In [23]:
def simplernn(text, vocab_size, mx):
    """Build a SimpleRNN sentiment classifier on top of frozen Word2Vec embeddings.

    Args:
        text: Unused; kept so existing calls keep working.
        vocab_size: Number of rows of the embedding layer.
        mx: Length of the padded input sequences.

    Returns:
        An uncompiled Keras ``Model`` that maps ``(mx,)`` index sequences to a
        3-way softmax.

    Note:
        Reads the notebook-level ``embedding_dim`` and ``embedding_matrix``.
    """
    inputs = Input((mx,))
    # Embedding weights come from embedding_matrix and are not trained.
    features = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(inputs)
    features = Dropout(0.2)(features)
    # return_sequences=True keeps one output per time step for the pooling layer.
    features = SimpleRNN(32, activation='relu', return_sequences=True)(features)
    features = GlobalMaxPooling1D()(features)
    outputs = Dense(3, activation='softmax')(features)
    model = Model(inputs, outputs)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [24]:
# Build the SimpleRNN model; the first argument is not used by simplernn itself.
modl=simplernn(trainx,vocab_size,mx)
# One-hot targets with a softmax output, hence categorical cross-entropy.
modl.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for at most 20 epochs, validating on (valx, y_val) with the early_stopping callback.
modl.fit(trainx, y_train, validation_data=(valx, y_val), batch_size=64, epochs=20 , callbacks=[early_stopping])
# Score the trained model on the test split.
loss, accuracy = modl.evaluate(testx, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 23)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 23, 300)           6713700   
                                                                 
 dropout_2 (Dropout)         (None, 23, 300)           0         
                                                                 
 simple_rnn (SimpleRNN)      (None, 23, 32)            10656     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 32)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_2 (Dense)             (None, 3)                 99        
                                                           

## Gated Recurrent Units (GRU)

In [25]:
def GRU_model(text, vocab_size, mx):
    """Build a GRU sentiment classifier on top of frozen Word2Vec embeddings.

    Args:
        text: Unused; kept so existing calls keep working.
        vocab_size: Number of rows of the embedding layer.
        mx: Length of the padded input sequences.

    Returns:
        An uncompiled Keras ``Model`` that maps ``(mx,)`` index sequences to a
        3-way softmax.

    Note:
        Reads the notebook-level ``embedding_dim`` and ``embedding_matrix``.
    """
    inputs = Input((mx,))
    # Embedding weights come from embedding_matrix and are not trained.
    features = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(inputs)
    features = Dropout(0.2)(features)
    # return_sequences=True keeps one output per time step for the pooling layer.
    # NOTE(review): activation='relu' presumably rules out the fused cuDNN
    # kernel — confirm if training speed matters.
    features = GRU(64, activation='relu', return_sequences=True)(features)
    features = GlobalMaxPooling1D()(features)
    outputs = Dense(3, activation='softmax')(features)
    model = Model(inputs, outputs)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [26]:
# Build the GRU model; the first argument is not used by GRU_model itself.
modl=GRU_model(trainx,vocab_size,mx)
# One-hot targets with a softmax output, hence categorical cross-entropy.
modl.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for at most 20 epochs, validating on (valx, y_val) with the early_stopping callback.
modl.fit(trainx, y_train, validation_data=(valx, y_val), batch_size=64, epochs=20 , callbacks=[early_stopping])

# Report loss/accuracy on all three sets so the train-vs-held-out gap is visible.
loss, accuracy = modl.evaluate(trainx, y_train)
print("Train Loss:", loss)
print("Train Accuracy:", accuracy)

loss, accuracy = modl.evaluate(testx, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

loss, accuracy = modl.evaluate(valx, y_val)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)




Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 23)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 23, 300)           6713700   
                                                                 
 dropout_3 (Dropout)         (None, 23, 300)           0         
                                                                 
 gru (GRU)                   (None, 23, 64)            70272     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 64)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_3 (Dense)             (None, 3)                 195       
                                                           

## Long Short-Term Memory Networks (LSTM)

In [32]:
def lstm_model(text, vocab_size, mx):
    """Build an LSTM sentiment classifier on top of frozen Word2Vec embeddings.

    Args:
        text: Unused; kept so existing calls keep working.
        vocab_size: Number of rows of the embedding layer.
        mx: Length of the padded input sequences.

    Returns:
        An uncompiled Keras ``Model`` that maps ``(mx,)`` index sequences to a
        3-way softmax.

    Note:
        Reads the notebook-level ``embedding_dim`` and ``embedding_matrix``.
    """
    inputs = Input((mx,))
    # Embedding weights come from embedding_matrix and are not trained.
    features = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(inputs)
    features = Dropout(0.2)(features)
    # return_sequences=True keeps one output per time step for the pooling layer.
    # NOTE(review): activation='relu' presumably rules out the fused cuDNN
    # kernel — confirm if training speed matters.
    features = LSTM(64, activation='relu', return_sequences=True)(features)
    features = GlobalMaxPooling1D()(features)
    outputs = Dense(3, activation='softmax')(features)
    model = Model(inputs, outputs)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [33]:
# Build the LSTM model; the first argument is not used by lstm_model itself.
modl=lstm_model(trainx,vocab_size,mx)
# One-hot targets with a softmax output, hence categorical cross-entropy.
modl.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for at most 20 epochs, validating on (valx, y_val) with the early_stopping callback.
modl.fit(trainx, y_train, validation_data=(valx, y_val), batch_size=64, epochs=20 , callbacks=[early_stopping])

# Report loss/accuracy on all three sets so the train-vs-held-out gap is visible.
loss, accuracy = modl.evaluate(trainx, y_train)
print("Train Loss:", loss)
print("Train Accuracy:", accuracy)

loss, accuracy = modl.evaluate(testx, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

loss, accuracy = modl.evaluate(valx, y_val)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)




Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 23)]              0         
                                                                 
 embedding_5 (Embedding)     (None, 23, 300)           6713700   
                                                                 
 dropout_6 (Dropout)         (None, 23, 300)           0         
                                                                 
 lstm_2 (LSTM)               (None, 23, 64)            93440     
                                                                 
 global_max_pooling1d_5 (Gl  (None, 64)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_6 (Dense)             (None, 3)                 195       
                                                           

## Random Forest Classifier on TF-IDF Features (TF-IDF converts the text into numerical vectors)

In [36]:
# Separate 80/20 stratified split of the preprocessed training tweets, using the raw integer labels `y`.
# NOTE(review): this hold-out is carved out of the training split, whereas the neural
# models above are scored on testx/y_test from the dataset's test split — the
# accuracies are therefore not measured on the same examples.
X_train1, X_test1, y_train1, y_test1 = train_test_split(Preprocessed_Text,y, stratify=y, test_size=0.2,random_state=0)
# TF-IDF vectorizer capped at 1000 features (max_features=1000).
vectorizer = TfidfVectorizer(max_features=1000)
# Fit the vocabulary/IDF weights on the training part only, then reuse them for the hold-out.
X_train_tfidf = vectorizer.fit_transform(X_train1).toarray()
X_test_tfidf = vectorizer.transform(X_test1).toarray()

In [37]:
# 100-tree forest; the fixed random_state makes training repeatable.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the TF-IDF features of the training part
rf_classifier.fit(X_train_tfidf, y_train1)

# Predict labels for the held-out part
y_pred = rf_classifier.predict(X_test_tfidf)

# Overall accuracy plus the per-class classification report on the held-out part
accuracy = accuracy_score(y_test1, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')
print(classification_report(y_test1, y_pred))

Test Accuracy: 0.6900
              precision    recall  f1-score   support

           0       0.69      0.60      0.65      1556
           1       0.65      0.70      0.68      2224
           2       0.74      0.75      0.74      1717

    accuracy                           0.69      5497
   macro avg       0.69      0.69      0.69      5497
weighted avg       0.69      0.69      0.69      5497

