In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
import pandas as pd

# Read the raw review export into a DataFrame (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer="review_data_new.csv")

In [None]:
# Preview the first five raw rows.
df.head(n=5)

In [None]:
# Keep only the rating and the review text. The explicit .copy() makes
# df_cleaned an independent frame, so the column assignments and renames in
# later cells do not operate on a view of df (SettingWithCopyWarning).
df_cleaned = df[["star_rating", "review_text"]].copy()

In [None]:
# Preview the first five rows of the two-column frame.
df_cleaned.iloc[:5]

In [None]:
# Function to map stars to sentiment
def map_sentiment(stars_received):
    """Convert a star rating into a sentiment class id.

    Returns 0 for ratings up to and including 3, 1 for ratings above 3 and up
    to and including 4, and 2 for everything else. Any value for which both
    comparisons are false (ratings above 4, but also NaN) falls through to 2.
    """
    if stars_received <= 3:
        return 0
    if stars_received <= 4:
        return 1
    return 2
# Map each star rating to one of the three sentiment classes.
# .assign() returns a new frame instead of writing a column into a frame that
# was sliced out of df, which avoids pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned.assign(
    sentiment=df_cleaned['star_rating'].map(map_sentiment)
)

# The counts below are per sentiment class (0/1/2), not per star rating.
sentiment_distribution = df_cleaned['sentiment'].value_counts()
print("Number of rows per sentiment class:")
print(sentiment_distribution)

# Plotting the sentiment distribution.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
fig, ax = plt.subplots()
sentiment_distribution.plot.bar(ax=ax, title="Sentiment distribution in df")
ax.set_xlabel("Sentiment")
ax.set_ylabel("No. of rows in df")
plt.show()

In [None]:
# Show a bounded preview (including the new sentiment column) rather than
# rendering the whole frame.
df_cleaned.head(10)

In [None]:
# Rename the text column to 'sentence'. Rebinding the result (instead of
# inplace=True) keeps the cell safe to re-run and avoids mutating a frame
# that was derived from a slice of df.
df_cleaned = df_cleaned.rename(columns={'review_text': 'sentence'})

In [None]:
sentiment_counts = df_cleaned['sentiment'].value_counts()

# value_counts() orders rows by descending count, so the class names must be
# looked up from the index of each row; a fixed Negative/Neutral/Positive list
# would attach the wrong name to a bar whenever the counts are not already in
# class order.
sentiment_names = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
ordered_counts = sentiment_counts.sort_index()
sentiment_table = pd.DataFrame({
    'Sentiment': [sentiment_names[class_id] for class_id in ordered_counts.index],
    'Counts': ordered_counts.values,
})

# Colouring by the (string) class name makes plotly use the discrete palette;
# a numeric colour column would be drawn with a continuous colour scale.
fig = px.bar(sentiment_table,
             x='Sentiment',
             y='Counts',
             color='Sentiment',
             color_discrete_sequence=px.colors.qualitative.Dark24,
             title='Sentiments Counts')

fig.update_layout(title='Sentiments Counts',
                  xaxis_title='Sentiment',
                  yaxis_title='Counts',
                  template='plotly_dark')

# Show the bar chart
fig.show()
# Also write a standalone HTML copy and open it in the browser.
pyo.plot(fig, filename = 'Sentiments Counts.html', auto_open = True)

In [None]:
# Discard rows whose review text is missing.
has_sentence = df_cleaned['sentence'].notna()
df_cleaned = df_cleaned[has_sentence]

In [None]:
def text_cleaning(text):
    """Strip markup and punctuation from a single review string.

    The text is parsed with BeautifulSoup's ``html.parser`` and reduced to its
    plain text, square-bracketed spans are removed, and finally every
    character other than ASCII letters, digits, whitespace, commas and
    apostrophes is deleted.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    without_brackets = re.sub(r'\[[^]]*\]', '', plain_text)
    return re.sub(r"[^a-zA-Z0-9\s,']", '', without_brackets)


In [None]:
# Check the Python type of the first remaining review. .iloc[0] is positional;
# a label lookup with [0] raises KeyError when the row labelled 0 has been
# dropped for having no text.
type(df_cleaned['sentence'].iloc[0])

In [None]:
# Clean every review (the whole dataset, not just a training split) and keep
# the result next to the raw text. .assign() returns a new frame, so nothing
# is written into a frame that was produced by filtering another one.
df_cleaned = df_cleaned.assign(
    Cleaned_sentence=df_cleaned['sentence'].apply(text_cleaning)
)

In [None]:
# Display the cleaned review text column.
df_cleaned['Cleaned_sentence']

In [None]:
# Function to generate word cloud
def generate_wordcloud(text, Title):
    """Draw a word cloud for a collection of strings.

    Parameters
    ----------
    text : iterable of str
        Joined with single spaces into one corpus before the cloud is built.
    Title : str
        Title shown above the rendered image.
    """
    corpus = " ".join(text)
    cloud = WordCloud(
        width=800,
        height=400,
        stopwords=set(STOPWORDS),
        background_color='black',
    )
    cloud_image = cloud.generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.title(Title)
    plt.show()

In [None]:
# Word cloud of the reviews labelled 2.
is_positive = df_cleaned['sentiment'] == 2
positive = df_cleaned.loc[is_positive, 'Cleaned_sentence'].tolist()
generate_wordcloud(positive, 'Positive Review')

In [None]:
# Word cloud of the reviews labelled 1. The cloud must be built from
# `neutral`; passing `positive` here would redraw the positive cloud under a
# "Neutral" title.
neutral = df_cleaned[df_cleaned['sentiment']==1]['Cleaned_sentence'].tolist()
generate_wordcloud(neutral,'Neutral Review')

In [None]:
# Word cloud of the reviews labelled 0. The cloud must be built from
# `negative`; passing `positive` here would redraw the positive cloud under a
# "Negative" title.
negative = df_cleaned[df_cleaned['sentiment']==0]['Cleaned_sentence'].tolist()
generate_wordcloud(negative,'Negative Review')

In [None]:
# 70% train / 30% hold-out. Stratifying on the label keeps the class
# proportions the same in both parts, consistent with the stratified
# validation/test split that follows.
x_train, x_test_pre, y_train, y_test_pre = train_test_split(
    df_cleaned['Cleaned_sentence'],
    df_cleaned['sentiment'],
    test_size=0.3,
    random_state=42,
    stratify=df_cleaned['sentiment'],
)

In [None]:
# Split the hold-out part evenly into validation and test sets.
# random_state is fixed (same seed as the first split) so the membership of
# both sets, and therefore the reported metrics, are reproducible.
x_val, x_test, y_val, y_test = train_test_split(x_test_pre,
                                                    y_test_pre,
                                                    test_size=0.5,
                                                    random_state=42,
                                                    stratify = y_test_pre)

In [None]:
# Load the pretrained BERT tokenizer used to encode the reviews.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)


In [None]:
# Longest cleaned review, measured with len() on each string
# (i.e. in characters, not in tokenizer tokens).
sentence_lengths = df_cleaned['Cleaned_sentence'].map(len)
max_length = sentence_lengths.max()
print(f"The maximum length of strings in the column is: {max_length}")


In [None]:
# Maximum number of tokens kept per review; longer reviews are truncated.
max_len = 120


def encode_texts(texts):
    """Tokenize and encode a pandas Series of strings for BERT.

    Each batch is padded to its longest sequence and truncated to ``max_len``
    tokens. The result is returned as TensorFlow tensors
    (``input_ids``, ``token_type_ids``, ``attention_mask``).

    Calling the tokenizer directly is the supported entry point; it replaces
    the deprecated ``batch_encode_plus`` and takes the same arguments.
    """
    return tokenizer(texts.tolist(),
                     padding=True,
                     truncation=True,
                     max_length=max_len,
                     return_tensors='tf')


# Tokenize and encode the sentences of each split with identical settings.
X_train_encoded = encode_texts(x_train)
X_val_encoded = encode_texts(x_val)
X_test_encoded = encode_texts(x_test)


In [None]:
# Display the training sentences as a NumPy array.
x_train.to_numpy()

In [None]:
# NumPy views of the training texts and their labels, in matching order.
Reviews, Target = x_train.to_numpy(), y_train.to_numpy()

In [None]:
# Inspect one training example: raw text, token ids, the ids decoded back to
# text, the attention mask and the label.
k = 0
sample_ids = X_train_encoded['input_ids'][k]
sample_mask = X_train_encoded['attention_mask'][k]

print('Training Comments -->>', Reviews[k])
print('\nInput Ids -->>\n', sample_ids)
print('\nDecoded Ids -->>\n', tokenizer.decode(sample_ids))
print('\nAttention Mask -->>\n', sample_mask)
print('\nLabels -->>', Target[k])


In [None]:
# Initialize the model: pretrained BERT with a 3-way classification head,
# one output per sentiment class.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)


In [None]:
# Compile the model. Labels are integer class ids and the loss is configured
# with from_logits=True, so the sparse categorical loss/metric pair is used.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)


In [None]:
# Step 5: Train the model
# Inputs are passed by name. When a list is given instead, Hugging Face TF
# models assign its items positionally in the order
# (input_ids, attention_mask, token_type_ids), so a list ordered
# [input_ids, token_type_ids, attention_mask] would feed the segment ids in as
# the attention mask and vice versa.
history = model.fit(
    {
        'input_ids': X_train_encoded['input_ids'],
        'attention_mask': X_train_encoded['attention_mask'],
        'token_type_ids': X_train_encoded['token_type_ids'],
    },
    Target,
    validation_data=(
        {
            'input_ids': X_val_encoded['input_ids'],
            'attention_mask': X_val_encoded['attention_mask'],
            'token_type_ids': X_val_encoded['token_type_ids'],
        },
        y_val,
    ),
    batch_size=32,
    epochs=3
)


In [None]:
# Evaluate the model on the test data.
# Inputs are passed by name: a list would be assigned positionally as
# (input_ids, attention_mask, token_type_ids), which swaps the attention mask
# and the segment ids if the list is ordered differently.
test_loss, test_accuracy = model.evaluate(
    {
        'input_ids': X_test_encoded['input_ids'],
        'attention_mask': X_test_encoded['attention_mask'],
        'token_type_ids': X_test_encoded['token_type_ids'],
    },
    y_test
)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')


In [None]:
# Base directory for the saved artifacts. It must not be the empty string:
# '' + '/Tokenizer' is '/Tokenizer', i.e. a directory at the filesystem root.
# '.' saves next to the notebook; change it to any writable directory.
path = '.'
# Save tokenizer
tokenizer.save_pretrained(path +'/Tokenizer')

# Save model
model.save_pretrained(path +'/Model')


In [None]:
# Reload the saved tokenizer and fine-tuned model from the directories under `path`.
bert_tokenizer = BertTokenizer.from_pretrained(f'{path}/Tokenizer')
bert_model = TFBertForSequenceClassification.from_pretrained(f'{path}/Model')


In [None]:
# Inputs are passed by name: a list would be assigned positionally as
# (input_ids, attention_mask, token_type_ids), which swaps the attention mask
# and the segment ids if the list is ordered differently.
pred = bert_model.predict({
    'input_ids': X_test_encoded['input_ids'],
    'attention_mask': X_test_encoded['attention_mask'],
    'token_type_ids': X_test_encoded['token_type_ids'],
})

# pred is of type TFSequenceClassifierOutput
logits = pred.logits

# Use argmax along the class axis to get the predicted class id per review,
# then convert the result to a NumPy array.
pred_labels = tf.argmax(logits, axis=1).numpy()

# Class id -> name, matching the 0/1/2 ids produced by map_sentiment. All
# three classes must be present: the model has three outputs, so a mapping
# with only two keys raises KeyError as soon as class 2 is predicted or
# appears in y_test.
label = {
    0: 'Negative',
    1: 'Neutral',
    2: 'Positive'
}

# Map the predicted and the true labels to their names
pred_labels = [label[i] for i in pred_labels]
Actual = [label[i] for i in y_test]

print('Predicted Label :', pred_labels[:10])
print('Actual Label :', Actual[:10])


In [None]:
# Per-class precision, recall and F1 on the test set.
report = classification_report(y_true=Actual, y_pred=pred_labels)
print("Classification Report: \n", report)
