In [None]:
# Import all libraries
import os
import shutil
import tarfile
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from nltk.corpus import stopwords


In [None]:
# load the dataset
# The workbook location is kept in one named variable so it is easy to spot and change.
dataset_path = "/content/new_articles_with_sentiment_Vader[1].xlsx"
df = pd.read_excel(dataset_path)
df.head()

In [None]:
# Peek at the full text of the article stored under index label 0.
df.loc[0, "ctext"]

In [None]:
def rate(score):
    """Translate a sentiment label into its numeric rating.

    Returns 1 for "Positive", 0.5 for "Neutral" and 0 for anything else.
    The comparison is exact, so differently cased labels and missing
    values also end up as 0.
    """
    if score == "Positive":
        return 1
    return 0.5 if score == "Neutral" else 0

# Build the modelling frame: the article body becomes "Text", the numeric
# rating derived from the sentiment column becomes "Rating", and every other
# column is dropped.
df = df.assign(
    Text=df["ctext"],
    Rating=df["ctext_sentiment"].apply(rate),
)[['Text', 'Rating']]

df.head()

In [None]:
# text cleaning

# Load the spaCy pipeline once; text_cleaning() reuses it for every document.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

# Filled on first use. It must not be built at definition time because the NLTK
# stop-word corpus is only downloaded in a later cell.
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, building it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def text_cleaning(text):
    """Turn one raw article into a list of lemmatized, stop-word-free tokens.

    Steps: strip HTML markup, replace punctuation with spaces, collapse
    whitespace, lemmatize with the module-level spaCy pipeline ``nlp`` and drop
    English stop words.

    Parameters
    ----------
    text : str or None
        Raw article text, possibly containing HTML. ``None`` and NaN (what an
        empty spreadsheet cell typically becomes) are treated as empty text.

    Returns
    -------
    list of str
        Lemmas in document order; empty when there is nothing left to tokenize.
    """
    # Missing cells arrive as None / float NaN; NaN is the only value != itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # separator=' ' keeps words from adjacent tags apart.
    text = BeautifulSoup(str(text), 'html.parser').get_text(separator=' ')

    # Replace punctuation with a space instead of deleting it, so that
    # "rapists.Whatever" does not collapse into the single token "rapistsWhatever".
    text = re.sub(r'[^\w\s]', ' ', text)

    # Collapse whitespace runs so the tokenizer does not emit whitespace tokens.
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return []

    stop_words = _english_stop_words()
    # The stop-word list is lowercase, so compare case-insensitively; otherwise
    # capitalised words such as "The" slip through.
    return [
        token.lemma_
        for token in nlp(text)
        if token.lemma_.strip() and token.lemma_.lower() not in stop_words
    ]

In [None]:
import nltk

# The stop-word corpus has to exist locally before text_cleaning() is called.
nltk.download('stopwords')

# Replace every raw article with the value text_cleaning() returns for it.
cleaned_text = df['Text'].apply(text_cleaning)
df['Text'] = cleaned_text

In [None]:
# Split dataset into training and testing

X = df['Text']      # model inputs
Y = df['Rating']    # numeric sentiment labels

# 80 % of the rows are used for training; the remaining 20 % are held out.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

# Xtrain / Ytrain = training texts and their labels
# Xtest  / Ytest  = held-out texts and their labels



In [None]:
# EDA

# Human-readable names for the numeric ratings produced by rate().
sentiment_names = {0: 'Negative', 0.5: 'Neutral', 1: 'Positive'}

sentiment_counts = Y.value_counts()

# value_counts() orders its result by frequency, so each bar's label is looked
# up from the result's own index. A fixed label list could attach a name to the
# wrong count, or have the wrong length when a class is absent.
sentiment_labels = [sentiment_names.get(rating, str(rating))
                    for rating in sentiment_counts.index]

# Colouring by the label strings gives one discrete colour per class, which is
# what color_discrete_sequence applies to.
fig = px.bar(x=sentiment_labels,
             y=sentiment_counts.values,
             color=sentiment_labels,
             color_discrete_sequence=px.colors.qualitative.Dark24,
             title='Sentiments Counts')

fig.update_layout(xaxis_title='Sentiment',
                  yaxis_title='Counts',
                  legend_title_text='Sentiment',
                  template='plotly_dark')

# Show the bar chart
fig.show()

In [None]:
# splitting test into testing and validation
# Half of the held-out rows become the validation set and half the test set,
# stratified so both keep the same label proportions. A fixed random_state makes
# the partition (and therefore every reported metric) reproducible across runs.
x_val, x_test, y_val, y_test = train_test_split(
    Xtest,
    Ytest,
    test_size=0.5,
    stratify=Ytest,
    random_state=1,
)


In [None]:
# TOKENIZATION
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

max_len = 128


def _encode_texts(token_lists):
    """Encode a collection of token lists with the BERT tokenizer.

    Each token list is first joined back into one space-separated string.
    The batch is padded, truncated to ``max_len`` and returned as TensorFlow
    tensors (use 'pt' for return_tensors if you're using PyTorch).
    """
    sentences = [' '.join(tokens) for tokens in token_lists]
    return tokenizer.batch_encode_plus(
        sentences,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='tf',
    )


# Tokenize and encode the sentences
# BERT Model understands text in encoded format only
encoded_train = _encode_texts(Xtrain)
encoded_val = _encode_texts(x_val)
encoded_test = _encode_texts(x_test)












In [None]:
# Print out the encoded values

# k is a POSITION within the training split. The encoded tensors are indexed by
# position, so the pandas objects must be read with .iloc as well: after the
# shuffled split, Xtrain[k] would look up index *label* k, which may belong to a
# different row or be missing from the training split altogether.
k = 0
print('Training Comments -->>', Xtrain.iloc[k])
print('\nInput Ids -->>\n', encoded_train['input_ids'][k])
print('\nDecoded Ids -->>\n', tokenizer.decode(encoded_train['input_ids'][k]))
print('\nAttention Mask -->>\n', encoded_train['attention_mask'][k])
print('\nLabels -->>', Ytrain.iloc[k])


Training Comments -->> ['BJP', 'MP', 'Yogi', 'Adityanath', 'liken', 'Uttar', 'Pradesh', 'Chief', 'Minister', 'Akhilesh', 'Yadav', 'Aurangzeb', 'Kans', 'say', 'parent', 'desist', 'name', 'son', 'AkhileshAkhilesh', 'Aurangzeb', 'Kans', 'due', 'deed', 'parent', 'desist', 'name', 'son', 'Akhilesh', 'say', 'election', 'meeting', 'BhadohiAkhilesh', 'say', 'would', 'develop', 'state', 'next', 'five', 'year', 'give', 'chance', 'past', 'five', 'year', 'askedAurangzeb', 'controversial', 'mughal', 'ruler', 'mythological', 'character', 'Kans', 'consider', 'tyrant', 'ruler', 'Vrishni', 'kingdom', 'capital', 'mathuraup', 'GOVT', 'patronised', 'TERRORISTS', 'ADITYANATHThe', 'firebrand', 'BJP', 'MP', 'Gorakhpur', 'say', 'state', 'government', 'patronise', 'terrorist', 'antisocial', 'rapistswhatever', 'scheme', 'government', 'run', 'particular', 'community', 'allegedliste', 'work', 'BJP', 'would', 'vote', 'power', 'Adityanath', 'say', 'promote', 'traditional', 'industry', 'send', 'bangle', 'Akhilesh', 

In [None]:
# LOADING MODEL
# Initialize the model: BERT for sequence classification with a two-way output.
# NOTE(review): rate() produces three distinct ratings (0, 0.5, 1) but the model
# is created with num_labels=2, and the sparse loss below expects integer class
# ids -- confirm whether "Neutral" is meant to be a class of its own.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Compile the model with an appropriate optimizer, loss function, and metrics

# Adam optimizer with a small base learning rate for fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# Cross-entropy between the class-id labels and the model's raw logits
# (from_logits=True, so the loss applies the softmax itself).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Fraction of samples whose highest-scoring class matches the label.
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)


In [None]:
# TRAINING
# Inputs are passed by name so every tensor reaches the matching BERT argument.
# A positional list is interpreted in the model's call-signature order
# (input_ids, attention_mask, token_type_ids), so listing token_type_ids before
# attention_mask would silently swap the two.
history = model.fit(
    {
        'input_ids': encoded_train['input_ids'],
        'attention_mask': encoded_train['attention_mask'],
        'token_type_ids': encoded_train['token_type_ids'],
    },
    Ytrain,
    validation_data=(
        {
            'input_ids': encoded_val['input_ids'],
            'attention_mask': encoded_val['attention_mask'],
            'token_type_ids': encoded_val['token_type_ids'],
        },
        y_val,
    ),
    batch_size=32,
    epochs=3,
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the model on the test data
# Inputs are passed by name: a positional list is read in the order
# (input_ids, attention_mask, token_type_ids), so a list that puts
# token_type_ids second would feed the segment ids in as the attention mask.
test_loss, test_accuracy = model.evaluate(
    {
        'input_ids': encoded_test['input_ids'],
        'attention_mask': encoded_test['attention_mask'],
        'token_type_ids': encoded_test['token_type_ids'],
    },
    y_test,
)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')


In [None]:
# Raw string: the backslashes are literal Windows path separators. Without the
# r-prefix, '\S' is an invalid escape sequence that newer Python versions warn
# about; the resulting string value is unchanged.
# NOTE(review): this is an absolute path on a local E: drive while the dataset
# is read from '/content/...'; confirm which environment the notebook targets.
path = r'E:\SIH_git\SIH'

# Save tokenizer
tokenizer.save_pretrained(path + '/Tokenizer')

# Save model
model.save_pretrained(path + '/Model')


In [None]:
# Testing purpose, not compulsory
# Reload both artifacts from the directories they were saved to.
tokenizer_dir = path + '/Tokenizer'
bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

model_dir = path + '/Model'
bert_model = TFBertForSequenceClassification.from_pretrained(model_dir)


In [None]:
# Inputs are passed by name: a positional list is read in the order
# (input_ids, attention_mask, token_type_ids), so a list that puts
# token_type_ids second would feed the segment ids in as the attention mask.
pred = bert_model.predict({
    'input_ids': encoded_test['input_ids'],
    'attention_mask': encoded_test['attention_mask'],
    'token_type_ids': encoded_test['token_type_ids'],
})

# pred is of type TFSequenceClassifierOutput
logits = pred.logits

# Index of the largest logit in each row is the predicted class id;
# convert the result to a NumPy array.
pred_labels = tf.argmax(logits, axis=1).numpy()

# NOTE(review): argmax only yields integer class ids, so the 0.5 ('Neutral')
# key can match values in y_test but never a prediction -- confirm the intended
# number of classes.
label = {
    1: 'positive',
    0: 'Negative',
    0.5: 'Neutral',
}

# Map the predicted and the true labels to their corresponding strings
pred_labels = [label[i] for i in pred_labels]
Actual = [label[i] for i in y_test]

print('Predicted Label :', pred_labels[:10])
print('Actual Label :', Actual[:10])


In [None]:
# Per-class precision, recall and F1 of the predicted vs. actual label strings.
report = classification_report(Actual, pred_labels)
print("Classification Report: \n", report)
