# Naive Bayes Classifier using sklearn

In [None]:
!pip install transformers scikit-learn tensorflow datasets keras



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from datasets import load_dataset

# Fetch the Twitter sentiment corpus from the Hugging Face Hub
dataset = load_dataset("carblacac/twitter-sentiment-analysis")

# Hold out 20% of the 'train' split for evaluation; the fixed random_state
# keeps the partition repeatable between runs
tweets = dataset['train']['text']
sentiments = dataset['train']['feeling']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    tweets,
    sentiments,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary (capped at 5000 features) on the training texts
# only, then project the held-out texts with that same fitted vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Fit the multinomial Naive Bayes model (fit() returns the estimator itself)
nb_classifier = MultinomialNB().fit(X_train, train_labels)

# Predict labels for the held-out texts
nb_predictions = nb_classifier.predict(X_test)

# Score the predictions against the held-out labels and report
nb_accuracy = accuracy_score(test_labels, nb_predictions)
nb_report = classification_report(test_labels, nb_predictions)
print("Naive Bayes Classifier Accuracy:", nb_accuracy)
print("Classification Report:\n", nb_report)


Naive Bayes Classifier Accuracy: 0.7637303108592383
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.78      0.77     11954
           1       0.77      0.75      0.76     12044

    accuracy                           0.76     23998
   macro avg       0.76      0.76      0.76     23998
weighted avg       0.76      0.76      0.76     23998



# Support Vector Machine (SVM) using sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from datasets import load_dataset

# Fetch the Twitter sentiment corpus from the Hugging Face Hub
dataset = load_dataset("carblacac/twitter-sentiment-analysis")

# Hold out 20% of the 'train' split for evaluation; the fixed random_state
# keeps the partition repeatable between runs
tweets = dataset['train']['text']
sentiments = dataset['train']['feeling']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    tweets,
    sentiments,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary (capped at 5000 features) on the training texts
# only, then project the held-out texts with that same fitted vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Fit a linear-kernel support vector classifier (fit() returns the estimator).
# NOTE(review): scikit-learn documents SVC fit time as growing at least
# quadratically with the number of samples, so this is likely the slowest step
# of the notebook; LinearSVC may be worth evaluating, but it is a different
# solver and would not reproduce these exact numbers.
svm_classifier = SVC(kernel='linear').fit(X_train, train_labels)

# Predict labels for the held-out texts
svm_predictions = svm_classifier.predict(X_test)

# Score the predictions against the held-out labels and report
svm_accuracy = accuracy_score(test_labels, svm_predictions)
svm_report = classification_report(test_labels, svm_predictions)
print("SVM Classifier Accuracy:", svm_accuracy)
print("Classification Report:\n", svm_report)


SVM Classifier Accuracy: 0.7811900991749312
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.77      0.78     11954
           1       0.78      0.79      0.78     12044

    accuracy                           0.78     23998
   macro avg       0.78      0.78      0.78     23998
weighted avg       0.78      0.78      0.78     23998



# Bi-LSTM model using TensorFlow Keras

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Tunable settings for this cell, gathered in one place.
SEED = 42
VOCAB_SIZE = 10000    # tokenizer word cap; also used as the Embedding input_dim
EMBEDDING_DIM = 16
max_length = 200      # every sequence is padded/truncated to this many tokens

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across runs of this cell.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reproducibility is required.
tf.keras.utils.set_random_seed(SEED)

# Load Twitter Sentiment Analysis dataset from Hugging Face
dataset = load_dataset("carblacac/twitter-sentiment-analysis")

# Hold out 20% of the 'train' split for the final evaluation
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset['train']['text'], dataset['train']['feeling'], test_size=0.2, random_state=SEED
)

# Fit the tokenizer on the training texts only, then encode both partitions
# with it; words outside the kept vocabulary map to the "<OOV>" token
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad/truncate to a fixed length; over-long sequences lose their tail
train_padded = pad_sequences(train_sequences, maxlen=max_length, truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, truncating='post')

# Build Bi-LSTM model: embedding -> bidirectional LSTM -> dense -> sigmoid output.
# The Embedding layer no longer receives `input_length`: Keras 3 deprecates that
# argument, and the sequence length is inferred from the data on the first call.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Convert labels to NumPy arrays before handing them to Keras
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# Train the model, keeping 20% of the training data aside for validation
model.fit(train_padded, train_labels, epochs=5, validation_split=0.2)

# Evaluate the model on the held-out test partition
test_loss, test_acc = model.evaluate(test_padded, test_labels)
print("Bi-LSTM Model Accuracy:", test_acc)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Bi-LSTM Model Accuracy: 0.7831069231033325
