In [None]:
# ...................................Finetune Dataset with BERT and get embeddings of tokens to find similar terms in domain.............
import tensorflow as tf

import torch

# Choose the device every tensor and the model will be moved to.
if not torch.cuda.is_available():
    # No CUDA device is visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # At least one CUDA device is visible: use it and report what was found.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

import pandas as pd

# Load the review dataset into a pandas dataframe.
dft = pd.read_csv("dataset.csv")

# Report the number of rows read (before any cleaning).
# (The original printed `df.shape[0]`, but no `df` exists -> NameError.)
print('Number of training sentences: {:,}\n'.format(dft.shape[0]))

# Collapse the 1-5 star ratings into three classes:
#     1, 2 -> 0        3 -> 1        4, 5 -> 2
# The pairs are applied in this order on purpose: a class label is only ever
# written after the star rating with the same numeric value has already been
# remapped, so a freshly written label is never remapped a second time.
for star_rating, class_label in ((1, 0), (2, 0), (3, 1), (4, 2), (5, 2)):
    dft.loc[dft['Rating'] == star_rating, 'Rating'] = class_label

# Drop rows that contain an empty-string cell or a missing value, and only
# THEN renumber the index.  Resetting the index before dropna() (as the
# original did) leaves gaps in it, which breaks later code that lines rows up
# with 0..n-1 positions.
dft = dft[~dft.eq('').any(axis=1)]
dft = dft.dropna()
dft.reset_index(drop=True, inplace=True)

# Raw review texts and their class labels, as parallel numpy arrays.
sentences = dft.Review.values
labels = dft.Rating.values

from transformers import BertTokenizer
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

import torch
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:

    # Call the tokenizer directly (as get_bert_embeddings does) and spell out
    # padding/truncation explicitly: `pad_to_max_length` is deprecated, and
    # without `truncation=True` a review longer than `max_length` is only
    # truncated through a deprecated implicit fallback.
    encoded_dict = tokenizer(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,          # Fixed length of every encoded row.
                        padding = 'max_length',    # Pad short sentences up to max_length.
                        truncation = True,         # Cut long sentences down to max_length.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors (one row per sentence).
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Force an integer dtype: if the Rating column was read as float, float labels
# would not be treated as class indices by the classification loss.
labels = torch.tensor(labels, dtype=torch.long)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

from torch.utils.data import TensorDataset, random_split

# Bundle ids, masks and labels so that one index yields one complete example.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the examples are used for training; whatever remains is validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Random, non-overlapping split of the examples.
# NOTE(review): this runs before the random seeds are set further down the
# script, so the split is not reproducible between runs.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

# Validation batches are read front to back; order does not matter there.
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)

# A second, unshuffled view over the training examples.
new_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size)

from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3, # Three output classes: the labels 0, 1 and 2 derived from Rating.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Hidden states are needed later to read token embeddings.
)

# Move the model to the device selected at the top of the script.  The
# original hard-coded "cuda", which crashes on a machine without a GPU even
# though the script explicitly falls back to the CPU.
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

from transformers import get_linear_schedule_with_warmup

epochs = 10

# One scheduler step is taken per training batch, so the schedule spans
# (number of batches) * (number of epochs) steps.
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

import numpy as np
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds  -- 2-D array of per-class scores, one row per example.
    labels -- array of true class indices; it is flattened before comparing.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    hits = (predicted_classes == true_classes).sum()
    return hits / len(true_classes)


import random
import numpy as np
import torch

seed_val = 42 # this is updated with each iteration of test samples

# Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict of metrics per epoch that was run.
training_stats = []

early_stopping_patience = 3  # Number of epochs to wait before stopping
best_validation_loss = float('inf')
early_stopping_counter = 0
token_emb = []

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # ---------------------------- training pass ----------------------------
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        # Each batch is (input ids, attention mask, labels); see TensorDataset.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model return the loss with the logits.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels,
                        return_dict=True
                        )

        loss = outputs.loss

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    # --------------------------- validation pass ---------------------------
    print("")
    print("Running Validation...")

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed while evaluating.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels, return_dict=True)

        loss = outputs.loss
        logits = outputs.logits

        total_eval_loss += loss.item()

        # flat_accuracy works on numpy arrays, so move both to the CPU first.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Mean of the per-batch accuracies / losses.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))

    # Record this epoch BEFORE the early-stopping check.  The original
    # appended after the check, so the epoch that triggered the stop was
    # silently missing from `training_stats`.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy
        }
    )

    # Early stopping: give up after `early_stopping_patience` consecutive
    # epochs without a new best validation loss.
    if avg_val_loss < best_validation_loss:
        best_validation_loss = avg_val_loss
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered! No improvement in validation loss for {} epochs.".format(
                early_stopping_patience))
            break

print("")
print("Training complete!")

# ............................................... Function to Obtain BERT Token Embeddings for terms......................................

def get_bert_embeddings(dfrows):
    """Encode every text with the module-level model and collect token data.

    Each text in *dfrows* is tokenized on its own, moved to the model's
    device and run through the model without gradient tracking.

    Returns a zip iterator that yields one
    ``(token_ids, token_strings, last_hidden_state)`` triple per text, where
    ``last_hidden_state`` is the final entry of the model's ``hidden_states``.
    """
    ids_per_text = []
    names_per_text = []
    states_per_text = []

    for text in dfrows:
        encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        # The inputs must live on the same device as the model.
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            result = model(**encoded)

        # Only one text was encoded, so row 0 holds all of its token ids.
        token_ids = encoded["input_ids"].tolist()[0]

        ids_per_text.append(token_ids)
        names_per_text.append(tokenizer.convert_ids_to_tokens(token_ids))
        states_per_text.append(result.hidden_states[-1])

    return zip(ids_per_text, names_per_text, states_per_text)


# ............................................... Preprocessing to eliminate noise.......................................
# Cache for the English stop-word set; filled on the first call below.
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words, fetching the NLTK data on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        import nltk
        nltk.download('punkt')
        nltk.download('popular')
        from nltk.corpus import stopwords
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def pre_preprocess(text):
    """Strip punctuation and English stop words from *text*.

    text -- the string to clean.

    Returns the remaining NLTK word tokens joined by single spaces.

    The NLTK downloads and the stop-word set are prepared once and reused.
    The original repeated both downloads and rebuilt the set on every call,
    and this function is called once per review.
    """
    import string
    from nltk.tokenize import word_tokenize

    # Load the stop words first: that call also makes sure the tokenizer
    # data has been downloaded before word_tokenize() needs it.
    stop_words = _english_stop_words()

    # string.punctuation already contains ',', so no separate comma removal
    # is required after this translate().
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))

    tokens = [word for word in word_tokenize(without_punctuation)
              if word not in stop_words]

    return " ".join(tokens)

#.........................................Extract token embeddings of samples..............................................................
# Work on the cleaned review frame under a second name.
dfttt = dft
import torch

# Every review as a plain Python string.
review_list2 = dfttt['Review'].astype(str).tolist()

# One (token ids, token strings, embeddings) triple per review.
zip2 = get_bert_embeddings(review_list2)
zipped_list1 = list(zip2)

# Transpose the list of triples into three parallel sequences.
unzipped_list11, unzipped_list22, unzipped_list33 = zip(*zipped_list1)

idl1 = [*unzipped_list11]   # token ids per review
nl1 = [*unzipped_list22]    # token strings per review
embl1 = [*unzipped_list33]  # embeddings per review

# Empty result frame; it is filled row by row further down.
column_names = ['No', 'reviews_given','senti_words','all_words','senti_word_synonyms','sep_words','Rating']
senti_df1 = pd.DataFrame(columns=column_names)

#......................... Initial pre-processing to remove special tokens like cls and punctuations. Then create reviews with preprocessed text as reviews_given.................
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Special BERT tokens and punctuation tokens that carry no content.
words_to_remove = ['[PAD]', '[SEP]','.','[CLS]','\\','/','(',')',',','\'']

# Cleaned word list of every review, in review order.
appended_list = []

for k, list1 in enumerate(nl1):
    # Drop the special/punctuation tokens, then let pre_preprocess() strip the
    # remaining punctuation and the stop words.
    filtered_word_list = [word for word in list1 if word not in words_to_remove]
    token_string = ", ".join(filtered_word_list)
    preprocessed_reviews = pre_preprocess(token_string)

    appended_list.append(preprocessed_reviews.split(" "))

    # Row k of the result frame: running number and the cleaned review text.
    # (The original had an if/else on `senti_df1.empty` whose two branches
    # were identical, so a single pair of assignments is equivalent.)
    senti_df1.at[k, 'No'] = k
    senti_df1.at[k, 'reviews_given'] = preprocessed_reviews

#...........................Preprocessing to extract adjectives and adverbs.Then create body_new column..................................................
import spacy
# spaCy pipeline used for part-of-speech tagging.
nlp = spacy.load('en_core_web_trf')
def preprocess(text):
    """Return the lower-cased adjectives and adverbs of *text*, space-joined.

    The text is run through the module-level spaCy pipeline ``nlp``; only
    tokens tagged "ADJ" or "ADV" are kept, in their original order.
    """
    wanted_tags = ("ADJ", "ADV")
    kept_words = []
    for token in nlp(text):
        if token.pos_ in wanted_tags:
            kept_words.append(token.text.lower())
    return ' '.join(kept_words)

# Adjectives/adverbs of every cleaned review.
senti_df1['body_new'] = senti_df1.reviews_given.astype(str).apply(preprocess)

# Copy the ratings BY POSITION.  Row k of senti_df1 was built from the k-th
# review of dfttt, so a label-aligned Series assignment would mis-assign (or
# NaN-fill) ratings whenever dfttt's index is not exactly 0..n-1.
senti_df1['Rating'] = dfttt['Rating'].to_numpy()

new_list = appended_list
senti_df = senti_df1
nl = nl1
embl = embl1

# Every distinct word that survived cleaning, across all reviews.
flattened_list = [item for sublist in new_list for item in sublist]
unique_word_set = set(flattened_list)

# Sort the vocabulary so that the column order -- and the word string handed
# to the POS tagger below -- is the same on every run; plain set iteration
# order can change between interpreter runs.
unique_word_list = sorted(unique_word_set)

# Add one (initially empty) column per vocabulary word.
senti_df = pd.concat([senti_df, pd.DataFrame(columns=unique_word_list)])
words_string = ' '.join(unique_word_list)


# Vocabulary words that the POS tagger labels as adjective or adverb.
xxx = preprocess(words_string)
xxx_lict = xxx.split(' ')

# Same members as set(unique_word_list) & set(xxx_lict), in vocabulary order.
tagged_words = set(xxx_lict)
common_values = [word for word in unique_word_list if word in tagged_words]

#.....................................Separate out the sentiment POS terms and the POS terms that appeared in the original reviews and match such extracted terms. This is needed because preprocessing can alter tokens (e.g. stemming)......
import math
import numpy as np
import re
for i, review_tokens in enumerate(nl):
    senti_terms = {}    # sentiment word -> its embeddings in review i
    all_terms = {}      # every vocabulary word -> its embeddings in review i
    senti_counts = {}   # sentiment word -> number of occurrences in review i

    # Adjective/adverb string of review i (constant for the whole review).
    body_text = senti_df['body_new'][i]

    # Positions of every distinct token, in order of first appearance.  The
    # original rescanned the whole token list once per token occurrence and
    # recomputed identical results for repeated tokens.
    positions_by_token = {}
    for position, token in enumerate(review_tokens):
        positions_by_token.setdefault(token, []).append(position)

    for token, positions in positions_by_token.items():
        # Only tokens that became a vocabulary column are of interest.
        if token not in senti_df.columns:
            continue

        # embl[i][0] is indexed by token position here, as in the original.
        values_at_indexes = [embl[i][0][position] for position in positions]

        # (The original branched on `senti_df.empty` with identical branches.)
        senti_df.at[i, token] = values_at_indexes
        all_terms[token] = values_at_indexes

        # A token counts as a sentiment word when it is longer than one
        # character and occurs as a whole word in the adjective/adverb string.
        if len(token) > 1 and re.search(
                rf'\b{re.escape(token)}\b', body_text, re.IGNORECASE):
            senti_terms[token] = values_at_indexes
            senti_counts[token] = len(values_at_indexes)

    senti_df.at[i, 'senti_words'] = senti_terms
    senti_df.at[i, 'all_words'] = all_terms
    senti_df.at[i, 'senti_word_synonyms'] = senti_counts

# Keep only the descriptive columns (no per-word columns) in new_df.
selected_columns = ['reviews_given','senti_words','all_words','body_new','senti_word_synonyms','sep_words','Rating']
new_df = senti_df[selected_columns]


# ...........................................Separate out the non-sentiment POS terms to find similar terms....................................

import warnings

# Ignore all warnings
# (this installs an 'ignore' filter for every warning category, so no
# warning is shown from this point on).
warnings.filterwarnings('ignore')

# Also turn off pandas' chained-assignment check.
# NOTE(review): with both switches off, an assignment that silently lands on
# a temporary copy of a DataFrame will not be reported.
pd.options.mode.chained_assignment = None
def compare_dictionaries(dict1, dict2):
    """Return the entries whose key appears in exactly one of the two dicts.

    dict1, dict2 -- the dictionaries to compare (neither is modified).

    Returns a new dict holding every ``key: value`` pair whose key is present
    in only one input; keys common to both are left out, whatever their
    values.  Entries from *dict1* come first, then those from *dict2*.

    The original only looked for differing keys when the two dicts had
    different sizes, so equal-sized dicts with different keys wrongly
    produced an empty result.
    """
    different_dict = {key: value for key, value in dict1.items()
                      if key not in dict2}
    different_dict.update(
        (key, value) for key, value in dict2.items() if key not in dict1
    )
    return different_dict

# For every review, keep the words that are in `all_words` but not in
# `senti_words` (i.e. the non-sentiment words) together with their embeddings.
# The whole column is assigned in one step: the original wrote cell by cell
# through chained indexing (new_df['sep_words'][i] = ...), which writes to a
# temporary object and leaves new_df unchanged under pandas Copy-on-Write.
new_df['sep_words'] = [
    compare_dictionaries(senti_terms, all_terms)
    for senti_terms, all_terms in zip(new_df['senti_words'], new_df['all_words'])
]

# Descriptive columns; everything else in senti_df is a per-word column.
columns_to_exclude = ['reviews_given', 'senti_words', 'all_words', 'body_new', 'senti_word_synonyms','No','sep_words','Rating']



#....................................... Create a new DataFrame without the excluded columns.................................

newest_df = senti_df[[col for col in senti_df.columns if col not in columns_to_exclude]]

# Per-word embedding lists, with 0 wherever a review does not contain the word.
df_filled = newest_df.fillna(0)

# Same layout, but each embedding list is replaced by its length, i.e. by how
# many times the word occurs in the review.
final_df = df_filled.astype(object)
for index, row in final_df.iterrows():
    for column_name, value in row.items():
        if value != 0:
            final_df.at[index, column_name] = len(value)

# Occurrence counts of the sentiment (adjective/adverb) words only.  Take an
# explicit copy so that adding a column below modifies a frame of its own
# rather than a selection of final_df.
sentiment_data = final_df[common_values].copy()

# Attach each review's non-sentiment words and their embeddings.
sentiment_data['sep_words'] = new_df['sep_words']

#....................................................# Find similar non-sentiment words from other terms with different cosine similarity thresholds....................

def similar_t(cvd, dffilled, threshold=0.8):
    """Fill the zero cells of *cvd* with embedding-similarity counts.

    cvd       -- frame with one count column per sentiment word plus a
                 'sep_words' column holding, per row, a dict that maps a word
                 to a list of embedding tensors.
    dffilled  -- frame with the same word columns whose cells are either 0 or
                 a list of embedding tensors.
    threshold -- a pair of embeddings is counted when its cosine similarity
                 is strictly greater than this value (default 0.8).

    For every cell of *cvd* that equals 0, each embedding stored anywhere in
    that word's column of *dffilled* is compared with each embedding in the
    row's 'sep_words'; the number of pairs above *threshold* is written into
    the cell.  Cells that are not 0 are left untouched.

    Returns the filled-in frame; the inputs are not modified, because both
    are first converted with ``astype(object)``.

    Fixes over the original: the result used to be written back for EVERY
    cell, using the column and count of the most recent zero cell.  That
    overwrote unrelated non-zero cells with stale counts and raised an
    UnboundLocalError when the first cell visited was non-zero.  Rows are now
    addressed by their index label, which equals the old positional counter
    for a 0..n-1 index.
    """
    cvd = cvd.astype(object)
    dffilled = dffilled.astype(object)

    for index, row in cvd.iterrows():
        for column_name, cell in row.items():
            if cell == 0:
                # Embedding lists of this word from every row that has it.
                word_column = dffilled[column_name]
                non_zero_values_list = word_column[word_column != 0].tolist()

                # All embeddings of this row's non-sentiment words.
                sep_vectors = [
                    vector
                    for vectors in cvd['sep_words'][index].values()
                    for vector in vectors
                ]

                num = 0
                for syn_set in non_zero_values_list:
                    for sub_synset in syn_set:
                        for candidate in sep_vectors:
                            similarity = torch.nn.functional.cosine_similarity(
                                sub_synset, candidate, dim=0)
                            if similarity > threshold:
                                num += 1

                cvd.at[index, column_name] = num
    return cvd


# `a` is the sentiment-count frame itself (not a copy); renumber its rows in
# place before handing it to similar_t().
a = sentiment_data
a.reset_index(drop=True, inplace=True)

# Per-word embedding lists used as the comparison pool.
a1 = df_filled

# Replace the zero counts by similarity counts and write the result out.
adf = similar_t(a, a1)
adf.to_csv('dense_matrix.csv') # Save the dense matrix to csv file.