In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error, r2_score
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

In [5]:


# Load datasets
tweets_df = pd.read_csv("Tweets.csv")
news_df = pd.read_csv("News.csv", encoding="ISO-8859-1")
news_df.columns = ['Sentiment', 'Text']
amazon_review_df = pd.read_csv("Amazone review.csv")
amazon_review_df.columns = ['Sentiment','Title', 'Text']
amazon_review_df = amazon_review_df.drop(['Title'], axis=1)
tweets_df = tweets_df.drop(['textID','selected_text'], axis=1)
tweets_df = tweets_df.rename(columns={'text': 'Text','sentiment':'Sentiment'})

amazon_review_df = amazon_review_df.dropna()
positive_reviews = amazon_review_df[amazon_review_df['Sentiment'] == 2].sample(5000, random_state=42)
negative_reviews = amazon_review_df[amazon_review_df['Sentiment'] == 1].sample(5000, random_state=42)
# Combine positive and negative reviews
amazon_review_df = pd.concat([positive_reviews, negative_reviews])
# Shuffle the rows
amazon_review_df = amazon_review_df.sample(frac=1, random_state=42)
# amazon_review_df['Sentiment'] = amazon_review_df['Sentiment'].replace({2 :'positive', 1 : 'negative'})
news_df['Sentiment'] = news_df['Sentiment'].replace({'positive': 2, 'negative': 1, 'neutral': 0})
tweets_df['Sentiment'] = tweets_df['Sentiment'].replace({'positive': 2, 'negative': 1,'neutral': 0})



In [6]:
# Data Pre-processing remove stopwords symbols , cleaning the data.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = str(text).lower()
    text = re.sub('{html}|<.*?>|http\S+|\$\w+[,]|\$\w+|[,]\$\w+|[0-9]+', '', text)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    stemmed = [stemmer.stem(token) for token in tokens]
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized)

news_df['Processed_Text'] = news_df['Text'].apply(preprocess)
amazon_review_df['Processed_Text'] = amazon_review_df['Text'].apply(preprocess)
tweets_df['Processed_Text'] = tweets_df['Text'].apply(preprocess)


In [130]:
### Model used Naive Bayes, SVM, Random Forest On Tweets Data 

X = tweets_df['Processed_Text'] 
y = tweets_df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    # etc for deep learning models
}


for name, model in tqdm(models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f'Accuracy: {accuracy:.3f}')
    print("Mean Absolute Error:", mae)
    print("R^2 Score:", r2)
    print("Classification Report:\n", classification_report(y_test, y_pred))
# Models




  0%|          | 0/3 [00:00<?, ?it/s]

Model: Naive Bayes
Accuracy: 0.617
Mean Absolute Error: 0.5590568372025325
R^2 Score: -0.2914065745909489
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.79      0.64      5599
           1       0.77      0.39      0.52      3914
           2       0.71      0.60      0.65      4228

    accuracy                           0.62     13741
   macro avg       0.67      0.59      0.60     13741
weighted avg       0.66      0.62      0.61     13741



 67%|██████▋   | 2/3 [00:25<00:12, 12.80s/it]

Model: SVM
Accuracy: 0.679
Mean Absolute Error: 0.45964631395094974
R^2 Score: -0.043940299549467765
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.82      0.69      5599
           1       0.78      0.50      0.61      3914
           2       0.78      0.67      0.72      4228

    accuracy                           0.68     13741
   macro avg       0.72      0.66      0.67     13741
weighted avg       0.71      0.68      0.68     13741



100%|██████████| 3/3 [00:38<00:00, 12.81s/it]

Model: Random Forest
Accuracy: 0.691
Mean Absolute Error: 0.4365038934575358
R^2 Score: 0.018803443246704354
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.74      0.69      5599
           1       0.74      0.56      0.64      3914
           2       0.73      0.75      0.74      4228

    accuracy                           0.69     13741
   macro avg       0.70      0.68      0.69     13741
weighted avg       0.70      0.69      0.69     13741






In [87]:


X = amazon_review_df['Processed_Text'] 
y = amazon_review_df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    # etc for deep learning models
}


for name, model in tqdm(models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f'Accuracy: {accuracy:.3f}')
    print("Mean Absolute Error:", mae)
    print("R^2 Score:", r2)
    print("Classification Report:\n", classification_report(y_test, y_pred))
# Models






  0%|          | 0/3 [00:00<?, ?it/s]

Model: Naive Bayes
Accuracy: 0.811
Mean Absolute Error: 0.1885
R^2 Score: 0.24587255246136586
Classification Report:
               precision    recall  f1-score   support

           1       0.79      0.85      0.82       987
           2       0.84      0.78      0.81      1013

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.81      0.81      0.81      2000



 67%|██████▋   | 2/3 [00:16<00:08,  8.36s/it]

Model: SVM
Accuracy: 0.835
Mean Absolute Error: 0.165
R^2 Score: 0.3398884411465537
Classification Report:
               precision    recall  f1-score   support

           1       0.83      0.84      0.83       987
           2       0.84      0.83      0.84      1013

    accuracy                           0.83      2000
   macro avg       0.83      0.84      0.83      2000
weighted avg       0.84      0.83      0.84      2000



100%|██████████| 3/3 [00:21<00:00,  7.28s/it]

Model: Random Forest
Accuracy: 0.812
Mean Absolute Error: 0.1875
R^2 Score: 0.2498732285756292
Classification Report:
               precision    recall  f1-score   support

           1       0.81      0.81      0.81       987
           2       0.82      0.81      0.81      1013

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.81      0.81      0.81      2000






In [88]:


X = news_df['Processed_Text'] 
y = news_df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    # etc for deep learning models
}


for name, model in tqdm(models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f'Accuracy: {accuracy:.3f}')
    print("Mean Absolute Error:", mae)
    print("R^2 Score:", r2)
    print("Classification Report:\n", classification_report(y_test, y_pred))
# Models




  0%|          | 0/3 [00:00<?, ?it/s]

Model: Naive Bayes
Accuracy: 0.674
Mean Absolute Error: 0.543859649122807
R^2 Score: -0.22753849152212036
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.96      0.80       567
           1       1.00      0.09      0.16       115
           2       0.63      0.34      0.44       287

    accuracy                           0.67       969
   macro avg       0.77      0.46      0.47       969
weighted avg       0.70      0.67      0.62       969



 67%|██████▋   | 2/3 [00:01<00:00,  1.14it/s]

Model: SVM
Accuracy: 0.716
Mean Absolute Error: 0.47678018575851394
R^2 Score: -0.08137215902264772
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.96      0.81       567
           1       0.74      0.37      0.49       115
           2       0.78      0.37      0.50       287

    accuracy                           0.72       969
   macro avg       0.74      0.57      0.60       969
weighted avg       0.73      0.72      0.68       969



100%|██████████| 3/3 [00:03<00:00,  1.08s/it]

Model: Random Forest
Accuracy: 0.731
Mean Absolute Error: 0.45304437564499483
R^2 Score: -0.028338356965316835
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.94      0.81       567
           1       0.78      0.39      0.52       115
           2       0.76      0.45      0.57       287

    accuracy                           0.73       969
   macro avg       0.75      0.59      0.63       969
weighted avg       0.74      0.73      0.71       969






In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# For LSTM:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# For BERT:


# Tokenization and Padding
MAX_LEN = 150
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(news_df['Processed_Text'])

sequences = tokenizer.texts_to_sequences(news_df['Processed_Text'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')

# LSTM model
model_lstm = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 64, input_length=MAX_LEN),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

X = padded_sequences
y = news_df['Sentiment'].values

model_lstm.fit(X, y, epochs=5, validation_split=0.2, batch_size=32)

from sklearn.metrics import mean_absolute_error, r2_score, classification_report

# Predictions
with tf.device('/CPU:0'):
    y_pred_lstm = model_lstm.predict(X_test).flatten()
y_pred_class = [1 if i > 0.5 else 0 for i in y_pred_lstm]

# Metrics
accuracy = sum(y_pred_class == y_test) / len(y_test)
mae = mean_absolute_error(y_test, y_pred_class)
r2 = r2_score(y_test, y_pred_class)
class_report = classification_report(y_test, y_pred_class)

print(f'Accuracy: {accuracy}')
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")
print("Classification Report:")
print(class_report)



In [108]:


from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Change number of labels to 3

input_ids = []
attention_masks = []

for sent in news_df['Processed_Text']:
    encoded_dict = tokenizer.encode_plus(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True,
                                         return_attention_mask=True, return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = [t.numpy()[0] for t in input_ids]
attention_masks = [t.numpy()[0] for t in attention_masks]

X_ids = tf.convert_to_tensor(input_ids)
X_masks = tf.convert_to_tensor(attention_masks)
y = tf.convert_to_tensor(news_df['Sentiment'].values)
X_ids_np = X_ids.numpy()
X_masks_np = X_masks.numpy()
y_np = y.numpy() 

X_ids_train, X_ids_test, X_masks_train, X_masks_test, y_train, y_test = train_test_split(X_ids_np, X_masks_np, y_np, test_size=0.2, random_state=42)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # Change to CategoricalCrossentropy
metric = tf.keras.metrics.CategoricalAccuracy(name='accuracy')  # Change metric for multiclass

model_bert.compile(optimizer=optimizer, loss=loss, metrics=[metric])
y_one_hot_train = tf.keras.utils.to_categorical(y_train, num_classes=3)
model_bert.fit([X_ids_train, X_masks_train], y_one_hot_train, epochs=2, validation_split=0.2, batch_size=8)

# Predictions
y_pred_bert = model_bert.predict([X_ids_test, X_masks_test])[0]
y_pred_bert_class = np.argmax(y_pred_bert, axis=1)

# Metrics
accuracy = sum(y_pred_bert_class == y_test) / len(y_test)
mae = mean_absolute_error(y_test, y_pred_bert_class)
r2 = r2_score(y_test, y_pred_bert_class)
class_report = classification_report(y_test, y_pred_bert_class)

print(f'Accuracy: {accuracy}')
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")
print("Classification Report:")
print(class_report)



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2
Epoch 2/2
Accuracy: 0.803921568627451
Mean Absolute Error: 0.31475748194014447
R^2 Score: 0.3079735585201955
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       567
           1       0.69      0.63      0.66       115
           2       0.74      0.76      0.75       287

    accuracy                           0.80       969
   macro avg       0.76      0.75      0.76       969
weighted avg       0.80      0.80      0.80       969



In [None]:
save_directory = "C:/Users/adity/OneDrive/Jigar bhai/Jigar Final Project/Code" 
model_bert.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [111]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


# Tokenization and Padding
MAX_LEN = 150
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(tweets_df['Processed_Text'])

sequences = tokenizer.texts_to_sequences(tweets_df['Processed_Text'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')

# LSTM model
model_lstm = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 64, input_length=MAX_LEN),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

X = padded_sequences
y = tweets_df['Sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_lstm.fit(X, y, epochs=5, validation_split=0.2, batch_size=32)

from sklearn.metrics import mean_absolute_error, r2_score, classification_report

# Predictions
with tf.device('/CPU:0'):
    y_pred_lstm = model_lstm.predict(X_test).flatten()
y_pred_class = [1 if i > 0.5 else 0 for i in y_pred_lstm]

# Metrics
accuracy = sum(y_pred_class == y_test) / len(y_test)
mae = mean_absolute_error(y_test, y_pred_class)
r2 = r2_score(y_test, y_pred_class)
class_report = classification_report(y_test, y_pred_class)

print(f'Accuracy: {accuracy}')
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")
print("Classification Report:")
print(class_report)



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 150, 64)           640000    
                                                                 
 lstm_9 (LSTM)               (None, 150, 64)           33024     
                                                                 
 lstm_10 (LSTM)              (None, 32)                12416     
                                                                 
 dense_6 (Dense)             (None, 1)                 33        
                                                                 
Total params: 685,473
Trainable params: 685,473
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.28415499363289065
Mean Absolute Error: 0.7158450063671093
R^2 Score: -0.012906771085273716
Classification Report:
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Change number of labels to 3

input_ids = []
attention_masks = []

for sent in tweets_df['Processed_Text']:
    encoded_dict = tokenizer.encode_plus(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True,
                                         return_attention_mask=True, return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = [t.numpy()[0] for t in input_ids]
attention_masks = [t.numpy()[0] for t in attention_masks]

X_ids = tf.convert_to_tensor(input_ids)
X_masks = tf.convert_to_tensor(attention_masks)
y = tf.convert_to_tensor(tweets_df['Sentiment'].values)
X_ids_np = X_ids.numpy()
X_masks_np = X_masks.numpy()
y_np = y.numpy() 

X_ids_train, X_ids_test, X_masks_train, X_masks_test, y_train, y_test = train_test_split(X_ids_np, X_masks_np, y_np, test_size=0.2, random_state=42)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # Change to CategoricalCrossentropy
metric = tf.keras.metrics.CategoricalAccuracy(name='accuracy')  # Change metric for multiclass

model_bert.compile(optimizer=optimizer, loss=loss, metrics=[metric])
y_one_hot_train = tf.keras.utils.to_categorical(y_train, num_classes=3)
model_bert.fit([X_ids_train, X_masks_train], y_one_hot_train, epochs=1, validation_split=0.2, batch_size=8)

# Predictions
y_pred_bert = model_bert.predict([X_ids_test, X_masks_test])[0]
y_pred_bert_class = np.argmax(y_pred_bert, axis=1)

# Metrics
accuracy = sum(y_pred_bert_class == y_test) / len(y_test)
mae = mean_absolute_error(y_test, y_pred_bert_class)
r2 = r2_score(y_test, y_pred_bert_class)
class_report = classification_report(y_test, y_pred_bert_class)

print(f'Accuracy: {accuracy}')
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")
print("Classification Report:")
print(class_report)



In [127]:
tweet_df_balanced = tweets_df
positive_reviews_tweet = tweet_df_balanced[tweet_df_balanced['Sentiment'] == 2].sample(500, random_state=42)
positive_reviews_tweet = tweet_df_balanced[tweet_df_balanced['Sentiment'] == 1].sample(500, random_state=42)
neutral_reviews_tweet = tweet_df_balanced[tweet_df_balanced['Sentiment'] == 0].sample(500, random_state=42)
# Combine positive and negative reviews
tweet_df_balanced = pd.concat([positive_reviews_tweet, positive_reviews_tweet,neutral_reviews_tweet])
# Shuffle the rows
tweet_df_balanced = tweet_df_balanced.sample(frac=1, random_state=42)
tweet_df_balanced.shape

(1500, 3)

In [135]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Change number of labels to 3

input_ids = []
attention_masks = []

for sent in tweet_df_balanced['Processed_Text']:
    encoded_dict = tokenizer.encode_plus(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True,
                                         return_attention_mask=True, return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = [t.numpy()[0] for t in input_ids]
attention_masks = [t.numpy()[0] for t in attention_masks]

X_ids = tf.convert_to_tensor(input_ids)
X_masks = tf.convert_to_tensor(attention_masks)
y = tf.convert_to_tensor(tweet_df_balanced['Sentiment'].values)
X_ids_np = X_ids.numpy()
X_masks_np = X_masks.numpy()
y_np = y.numpy() 

X_ids_train, X_ids_test, X_masks_train, X_masks_test, y_train, y_test = train_test_split(X_ids_np, X_masks_np, y_np, test_size=0.2, random_state=42)
X_ids_train = tf.cast(X_ids_train, tf.int32)
X_masks_train = tf.cast(X_masks_train, tf.int32)
y_train = tf.cast(y_train, tf.int64)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # Change to CategoricalCrossentropy
metric = tf.keras.metrics.CategoricalAccuracy(name='accuracy')  # Change metric for multiclass

model_bert.compile(optimizer=optimizer, loss=loss, metrics=[metric])
y_one_hot_train = tf.keras.utils.to_categorical(y_train, num_classes=3)
model_bert.fit([X_ids_train, X_masks_train], y_one_hot_train, epochs=2, validation_split=0.2, batch_size=8)

# Predictions
y_pred_bert = model_bert.predict([X_ids_test, X_masks_test])[0]
y_pred_bert_class = np.argmax(y_pred_bert, axis=1)

# Metrics
accuracy = sum(y_pred_bert_class == y_test) / len(y_test)
mae = mean_absolute_error(y_test, y_pred_bert_class)
r2 = r2_score(y_test, y_pred_bert_class)
class_report = classification_report(y_test, y_pred_bert_class)

print(f'Accuracy: {accuracy}')
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")
print("Classification Report:")
print(class_report)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2
Epoch 2/2
Accuracy: 0.81
Mean Absolute Error: 0.19
R^2 Score: 0.10639632107023411
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.64      0.67        92
           1       0.85      0.88      0.87       208

    accuracy                           0.81       300
   macro avg       0.78      0.76      0.77       300
weighted avg       0.81      0.81      0.81       300



In [109]:
# First, ensure you have the necessary libraries installed
# !pip install tensorflow

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Build the LSTM model
max_words = 5000
max_length = 100

tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
tokenizer.fit_on_texts(X_train_balanced)
sequences_train = tokenizer.texts_to_sequences(X_train_balanced)
sequences_test = tokenizer.texts_to_sequences(X_test_balanced)

X_train_seq = pad_sequences(sequences_train, maxlen=max_length)
X_test_seq = pad_sequences(sequences_test, maxlen=max_length)

# Convert labels to integers
label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(y_train_balanced)
y_test_int = label_encoder.transform(y_test_balanced)

# Convert integer labels to binary class matrix
y_train_cat = to_categorical(y_train_int)
y_test_cat = to_categorical(y_test_int)

X_train_seq.shape, y_train_cat.shape

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_length))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_seq, y_train_cat, validation_data=(X_test_seq, y_test_cat), epochs=5, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_seq, y_test_cat)
print(f"Accuracy: {accuracy * 100:.2f}%")

y_pred = model.predict(X_test_seq)
y_pred_classes = y_pred.argmax(axis=1)

# Classification report
print("LSTM Classification Report:")
print(classification_report(y_test_int, y_pred_classes, target_names=label_encoder.classes_))

# Mean Absolute Error and R2 Score
mae = mean_absolute_error(y_test_int, y_pred_classes)
r2 = r2_score(y_test_int, y_pred_classes)

print(f"Mean Absolute Error: {mae}")
print(f"R2 Score: {r2}")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 45.00%
LSTM Classification Report:
              precision    recall  f1-score   support

    negative       0.38      0.30      0.34        66
     neutral       0.43      0.61      0.50        67
    positive       0.56      0.43      0.49        67

    accuracy                           0.45       200
   macro avg       0.46      0.45      0.44       200
weighted avg       0.46      0.45      0.44       200

Mean Absolute Error: 0.695
R2 Score: -0.4812586939358623


In [18]:
print(train_dataset)
print(test_dataset)

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 53), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 53), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 53), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>
<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 59), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 59), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 59), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


## All Model of News Dataset!!