In [None]:
import numpy as np
import pandas as pd 
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
pd.set_option("display.max_colwidth", 200)
!pip install openai

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
from shutil import copyfile
# Copy the attention implementation out of the attached dataset into the working
# directory before importing it (presumably so `attention` resolves as a local
# module — confirm the import path on the target environment).
copyfile(src = "/kaggle/input/attention-py/attention.py", dst = "/kaggle/working/attention.py")
from attention import AttentionLayer


In [None]:
import pandas as pd
# Load the pre-cleaned review dataset; later cells read its 'cleaned_text' and
# 'cleaned_summary' columns.
data=pd.read_csv("/kaggle/input/cleaned-amazon-reviews/your_file.csv")
# Bare expression: the frame is rendered as this cell's output.
data

In [None]:
# Word-count limits. Rows exceeding either limit are dropped by the filtering
# cell, and the same values are reused as the padded sequence lengths.
max_text_len=150
max_summary_len=12

In [None]:
# Treat empty strings as missing values and drop every row that has one.
data.replace('', np.nan, inplace=True)
data.dropna(axis=0, inplace=True)
cleaned_text = np.array(data['cleaned_text'])
cleaned_summary = np.array(data['cleaned_summary'])

# Keep only the (text, summary) pairs whose word counts fit both length limits.
kept_pairs = [
    (text, summary)
    for text, summary in zip(cleaned_text, cleaned_summary)
    if len(summary.split()) <= max_summary_len and len(text.split()) <= max_text_len
]
short_text = [text for text, _ in kept_pairs]
short_summary = [summary for _, summary in kept_pairs]

# Wrap each summary in the start/end markers the decoder looks for.
df = pd.DataFrame({'text': short_text, 'summary': short_summary})
df['summary'] = df['summary'].apply(lambda x: 'sostok ' + x + ' eostok')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the (text, summary) pairs for validation; random_state=0 keeps
# the shuffled split repeatable.
x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']),np.array(df['summary']),test_size=0.1,random_state=0, shuffle=True) 

In [None]:
# First-pass tokenizer over the training texts; its word counts feed the
# rare-word analysis in the next cell.
x_tokenizer = Tokenizer() 
x_tokenizer.fit_on_texts(list(x_tr))

In [None]:
# Words seen fewer than `thresh` times in the training texts count as rare.
thresh = 2

word_frequencies = list(x_tokenizer.word_counts.values())
rare_frequencies = [count for count in word_frequencies if count < thresh]

# tot_cnt / cnt: vocabulary size and number of rare words.
# tot_freq / freq: total occurrences and occurrences belonging to rare words.
tot_cnt = len(word_frequencies)
tot_freq = sum(word_frequencies)
cnt = len(rare_frequencies)
freq = sum(rare_frequencies)

print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)
print("Total Coverage of rare words:", (freq / tot_freq) * 100)

In [None]:
# Re-fit the source tokenizer, capped at the number of non-rare words.
# `tot_cnt` and `cnt` come from the rare-word statistics cell for x_tokenizer,
# so that cell must have run immediately before this one.
x_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
x_tokenizer.fit_on_texts(list(x_tr))

# Convert the raw texts into integer sequences.
x_tr_seq    =   x_tokenizer.texts_to_sequences(x_tr) 
x_val_seq   =   x_tokenizer.texts_to_sequences(x_val)

# Zero-pad at the end of each sequence, up to max_text_len.
# NOTE: x_tr / x_val are rebound here from raw strings to padded integer arrays,
# so re-running this cell without re-running the split would pass those arrays
# to fit_on_texts instead of text.
x_tr    =   pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
x_val   =   pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

# Vocabulary size handed to the encoder Embedding layer (+1 for the padding id).
# NOTE(review): the checkpoint loaded later presumably expects exactly this
# size — do not change the formula without retraining.
x_voc   =  x_tokenizer.num_words + 1

In [None]:
# Display the source vocabulary size used for the encoder embedding.
x_voc

In [None]:
# First-pass tokenizer over the training summaries; its word counts feed the
# rare-word analysis in the next cell.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_tr))

In [None]:
# Words seen fewer than `thresh` times in the training summaries count as rare.
thresh = 3

word_frequencies = list(y_tokenizer.word_counts.values())
rare_frequencies = [count for count in word_frequencies if count < thresh]

# tot_cnt / cnt: vocabulary size and number of rare words.
# tot_freq / freq: total occurrences and occurrences belonging to rare words.
tot_cnt = len(word_frequencies)
tot_freq = sum(word_frequencies)
cnt = len(rare_frequencies)
freq = sum(rare_frequencies)

print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)
print("Total Coverage of rare words:", (freq / tot_freq) * 100)

In [None]:
# Re-fit the summary tokenizer, capped at the number of non-rare words.
# `tot_cnt` and `cnt` come from the rare-word statistics cell for y_tokenizer,
# so that cell must have run immediately before this one.
y_tokenizer = Tokenizer(num_words=tot_cnt-cnt) 
y_tokenizer.fit_on_texts(list(y_tr))

# Convert the raw summaries into integer sequences.
y_tr_seq    =   y_tokenizer.texts_to_sequences(y_tr) 
y_val_seq   =   y_tokenizer.texts_to_sequences(y_val) 

# Zero-pad at the end of each sequence, up to max_summary_len.
# NOTE: y_tr / y_val are rebound here from raw strings to padded integer arrays,
# so this cell is not safe to re-run on its own.
y_tr    =   pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
y_val   =   pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

# Vocabulary size handed to the decoder Embedding and output Dense layers.
# NOTE(review): the checkpoint loaded later presumably expects exactly this
# size — do not change the formula without retraining.
y_voc  =   y_tokenizer.num_words +1

In [None]:
# Sanity check: every summary was prefixed with one 'sostok', so its count is
# expected to equal the number of training summaries.
y_tokenizer.word_counts['sostok'],len(y_tr)

In [None]:
# Rows with exactly two non-padding ids hold nothing between the start and end
# markers; collect their positions and drop them from both training arrays.
tokens_per_summary = np.count_nonzero(y_tr, axis=1)
ind = np.flatnonzero(tokens_per_summary == 2).tolist()

y_tr = np.delete(y_tr, ind, axis=0)
x_tr = np.delete(x_tr, ind, axis=0)

In [None]:
# Rows with exactly two non-padding ids hold nothing between the start and end
# markers; collect their positions and drop them from both validation arrays.
tokens_per_summary = np.count_nonzero(y_val, axis=1)
ind = np.flatnonzero(tokens_per_summary == 2).tolist()

y_val = np.delete(y_val, ind, axis=0)
x_val = np.delete(x_val, ind, axis=0)

In [None]:
# Build the training graph: a three-layer LSTM encoder, a single-layer LSTM
# decoder seeded with the encoder's final states, an attention layer over the
# encoder outputs, and a per-timestep softmax over the summary vocabulary.
# NOTE: the layer objects created here (dec_emb_layer, decoder_lstm, attn_layer,
# decoder_dense) are reused by the inference models built further down, and the
# weights loaded later must match this architecture, so keep the layer order
# and sizes unchanged.
from keras import backend as K 
# Reset Keras' global state before building the graph.
K.clear_session()

# Width of every LSTM layer and of both embedding layers, respectively.
latent_dim = 300
embedding_dim=100

# Encoder input: one padded source sequence of max_text_len token ids.
encoder_inputs = Input(shape=(max_text_len,))

# Trainable embedding for source tokens.
enc_emb =  Embedding(x_voc, embedding_dim,trainable=True)(encoder_inputs)

# Encoder LSTM 1 (its returned states are not used further in this cell).
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.1)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# Encoder LSTM 2 (its returned states are not used further in this cell).
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.2,recurrent_dropout=0.1)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Encoder LSTM 3: its per-step outputs feed the attention layer and its final
# states (state_h, state_c) initialise the decoder.
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.2,recurrent_dropout=0.1)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Decoder input: a variable-length sequence of summary token ids.
decoder_inputs = Input(shape=(None,))

# Trainable embedding for summary tokens (layer kept in a variable for reuse).
dec_emb_layer = Embedding(y_voc, embedding_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM started from the encoder's final states. The two extra return
# values are the LSTM's returned states (the names say fwd/back, but this is a
# single unidirectional LSTM); neither is used in the training graph.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.2,recurrent_dropout=0.1)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

# Attention layer from the copied attention.py, called with the encoder and
# decoder output sequences. Only attn_out is used below.
# NOTE(review): exact semantics of AttentionLayer are defined outside this file.
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate the decoder LSTM output with the attention output on the last axis.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

# Per-timestep softmax over the summary vocabulary.
decoder_dense =  TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# Training model: (source ids, decoder input ids) -> per-step token distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary() 

In [None]:
# Compile with RMSprop and sparse categorical cross-entropy, i.e. the targets
# are integer token ids rather than one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [None]:
# Restore weights from a saved checkpoint (no training call is visible in this
# notebook). The architecture and vocabulary sizes built above must match it.
model.load_weights('/kaggle/input/usable1/summary.h5')

In [None]:
# Lookup tables used when decoding: id -> word for both vocabularies, and
# word -> id for the summary vocabulary.
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

# Inference encoder: source ids -> (per-step encoder outputs, final h, final c).
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Inference decoder inputs: the LSTM states carried over from the previous
# decoding step, plus the encoder output sequence that attention reads.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))

# Embed the decoder input ids with the same embedding layer used in training.
dec_emb2= dec_emb_layer(decoder_inputs) 
# Run the shared decoder LSTM from the supplied states.
# NOTE: state_h2 / state_c2 rebind the names first assigned for encoder LSTM 2
# in the model-building cell; from here on they are the decoder's new states.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Apply the shared attention layer to the supplied encoder outputs and the new
# decoder output, then concatenate as in the training graph.
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# Shared dense softmax layer: probability distribution over the target vocabulary.
decoder_outputs2 = decoder_dense(decoder_inf_concat) 

# Inference decoder:
#   inputs  = [decoder token ids, encoder outputs, previous h, previous c]
#   outputs = [token distribution, new h, new c]
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a summary for one padded source sequence.

    The sequence is encoded once with ``encoder_model``. ``decoder_model`` is
    then called one token at a time, starting from the 'sostok' id, always
    taking the arg-max token and feeding it back with the updated LSTM states.

    Args:
        input_seq: Batch holding a single padded source sequence (the target
            buffer below is fixed at batch size 1).

    Returns:
        The decoded words joined into one string, each preceded by a single
        space (so a non-empty result starts with a space). 'eostok' is never
        included.

    Decoding stops when 'eostok' is produced, when ``max_summary_len - 1``
    words have been emitted, or when the arg-max id has no vocabulary entry.
    The last case covers id 0 (padding), which previously raised ``KeyError``
    on the reverse-index lookup.
    """
    # Encode the input once; e_out is reused unchanged on every decoder step.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Length-1 target sequence holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_tokens = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: most probable token at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # .get() instead of [] so an id without a word ends decoding cleanly.
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_tokens.append(sampled_token)
        if len(decoded_tokens) >= (max_summary_len - 1):
            break

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + token for token in decoded_tokens)

In [None]:
def seq2summary(input_seq):
    """Turn a sequence of summary token ids back into text.

    Padding (0) and the 'sostok' / 'eostok' marker ids are skipped; every
    remaining id is looked up in ``reverse_target_word_index``. Each word is
    followed by one space, so a non-empty result ends with a trailing space.
    """
    kept_words = [
        reverse_target_word_index[token_id]
        for token_id in input_seq
        if not (
            token_id == 0
            or token_id == target_word_index['sostok']
            or token_id == target_word_index['eostok']
        )
    ]
    return ''.join(word + ' ' for word in kept_words)

def seq2text(input_seq):
    """Turn a sequence of source token ids back into text.

    Padding (0) is skipped; every other id is looked up in
    ``reverse_source_word_index``. Each word is followed by one space, so a
    non-empty result ends with a trailing space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

In [None]:
# Contraction / slang -> expanded form. Matching is case-insensitive, so where
# two keys differ only by case (e.g. "I'd" and "i'd") the first one listed wins.
contractions_dict = {"ain't": 'am not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "cuz": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he will have', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have', "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "shan't": 'shall not', "sha'n't": 'shall not', "shan't've": 'shall not have', "she'd": 'she would', "she'd've": 'she would have', "she'll": 'she will', "she'll've": 'she will have', "she's": 'she is', "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so as', "that'd": 'that would', "that'd've": 'that would have', "that's": 'that is', "there'd": 'there had', "there'd've": 'there would have', "there's": 'there is', "they'd": 'they would', "they'd've": 'they would have', "they'll": 'they will', "they'll've": 'they will have', "they're": 'they are', "they've": 'they have', "to've": 'to have', "wasn't": 'was not', "we'd": 'we would', "we'd've": 'we would have', "we'll": 'we will', "we'll've": 'we will have', "we're": 'we are', "we've": 'we have', 
"weren't": 'were not', "what'll": 'what will', "what'll've": 'what will have', "what're": 'what are', "what's": 'what is', "what've": 'what have', "when's": 'when is', "when've": 'when have', "where'd": 'where did', "where's": 'where is', "where've": 'where have', "who'll": 'who will', "who'll've": 'who will have', "who's": 'who is', "who've": 'who have', "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't": 'will not', "won't've": 'will not have', "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have', "y'all": 'you all', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have', "you'd": 'you would', "you'd've": 'you would have', "you'll": 'you will', "you'll've": 'you will have', "you're": 'you are', "you've": 'you have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "this's": 'this is', "here's": 'here is', "ya'll": 'you all', 'gonna': 'going to', 'gotta': 'got to', 'wanna': 'want to', 'shoulda': 'should have', 'coulda': 'could have', 'woulda': 'would have', 'mighta': 'might have', 'musta': 'must have', 'oughta': 'ought to', 'dunno': 'do not know', 'kinda': 'kind of', 'sorta': 'sort of', 'gotcha': 'got you', 'gimme': 'give me', 'lemme': 'let me', 'wassup': 'what is up', "c'mon": 'come on', 'whatcha': 'what are you', 'ya': 'you', 'hafta': 'have to', 'shouldna': 'should not have', 'couldna': 'could not have', 'wouldna': 'would not have', 'mightna': 'might not have', 'mustna': 'must not have', 'oughtna': 'ought not to have', "amn't": 'am not'}

# Case-folded lookup table; setdefault keeps the first spelling listed above.
_contraction_lookup = {}
for _contraction, _expanded_form in contractions_dict.items():
    _contraction_lookup.setdefault(_contraction.casefold(), _expanded_form)

# One alternation over all keys, longest first. The regex engine tries the
# alternatives in order, so "can't've" is tested before its prefix "can't";
# replacing key by key instead turned "can't've" into "cannot've".
_contraction_pattern = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(key) for key in sorted(_contraction_lookup, key=len, reverse=True))
    + r')\b',
    flags=re.IGNORECASE,
)


def _expand_contraction(match):
    """Return the expansion for one regex match, or the matched text unchanged
    when its case-folded form is not a key (possible for a few non-ASCII
    letters that IGNORECASE treats as equal to ASCII ones)."""
    matched_text = match.group(0)
    return _contraction_lookup.get(matched_text.casefold(), matched_text)


def replace_contractions(review):
    """Expand contractions and slang in ``review`` and collapse repeated spaces.

    Matching is case-insensitive and anchored on word boundaries. All keys are
    handled in a single pass, with longer keys taking precedence over shorter
    ones that are a prefix of them.

    Args:
        review: Text to expand.

    Returns:
        The text with every known contraction replaced by its expansion from
        ``contractions_dict`` and runs of spaces reduced to one space.
    """
    review = _contraction_pattern.sub(_expand_contraction, review)

    review = re.sub(' +', ' ', review)

    return review

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def clean_review(review):
    """Normalise a raw review into the space-separated token string fed to the tokenizer.

    Steps, in order: expand contractions, lowercase, strip URLs and a few HTML
    remnants, replace punctuation with spaces, tokenize with NLTK, and keep
    only alphanumeric tokens that are not English stopwords.

    Args:
        review: Raw review text.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Lowercase AFTER expanding contractions: some expansions contain an
    # uppercase "I" ("I would"), which the lowercase stopword list would not
    # filter out if it were left as is.
    review = replace_contractions(review).lower()
    # NOTE(review): this pattern removes everything from the URL to the end of
    # the line, not just the URL itself — confirm that is intended.
    review = re.sub(r'https?:\/\/.*[\r\n]*', '', review, flags=re.MULTILINE)
    review = re.sub(r'\<a href', ' ', review)
    review = re.sub(r'&amp;', '', review)
    # Must run before the punctuation pass: that pass replaces '/' with a
    # space, after which '<br />' no longer matches and 'br' survives as a word.
    review = re.sub(r'<br />', ' ', review)
    review = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', review)
    review = re.sub(r'\'', ' ', review)

    # Build the stopword set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    cleaned_review = ' '.join(filtered_words)

    return cleaned_review

In [None]:
def summarize(custom_review):
    """Generate, print and return the model's summary for one raw review.

    The review is cleaned with ``clean_review``, converted to ids with
    ``x_tokenizer``, zero-padded at the end to ``max_text_len`` and decoded
    with ``decode_sequence``.

    Args:
        custom_review: Raw review text.

    Returns:
        The predicted summary with surrounding whitespace stripped.
    """
    model_input = pad_sequences(
        x_tokenizer.texts_to_sequences([clean_review(custom_review)]),
        maxlen=max_text_len,
        padding='post',
    )
    summary = decode_sequence(model_input)

    # Echo the input and the prediction, followed by blank lines as a separator.
    print("Original Review:", custom_review)
    print("Predicted Summary:", summary)
    print("\n\n")
    return summary.strip()

In [None]:
def get_reviews(product, sentiment,
                csv_path='/kaggle/input/evaluation-dataset/model_evaluation_reviews.csv'):
    """Return the review texts for one product and one sentiment label.

    Args:
        product: Value to match against the 'Item Name' column.
        sentiment: Value to match against the 'Sentiment' column.
        csv_path: CSV file to read. Defaults to the evaluation dataset used by
            this notebook. The file must have 'Item Name', 'Sentiment' and
            'Review' columns.

    Returns:
        A list with the 'Review' value of every matching row, in file order
        (empty if nothing matches).
    """
    reviews_df = pd.read_csv(csv_path)
    # One vectorised mask instead of a Python-level iterrows() loop.
    is_match = (reviews_df['Sentiment'] == sentiment) & (reviews_df['Item Name'] == product)
    return reviews_df.loc[is_match, 'Review'].tolist()

In [None]:
# Reviews of the 'Pub Mix' product whose Sentiment value is 1, and how many
# there are.
ind_reviews = get_reviews('Pub Mix', 1)
len(ind_reviews)

In [None]:

# One predicted summary per selected review, in the same order.
predicted_summaries = [summarize(review_text) for review_text in ind_reviews]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# Create a TF-IDF matrix for the summaries
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(predicted_summaries)

# Reduce to two dimensions for clustering and plotting. The SVD solver is
# randomized, so the seed is fixed to keep coordinates (and therefore the
# clusters) repeatable across runs.
lsa = TruncatedSVD(n_components=2, random_state=0)
X_reduced = lsa.fit_transform(X)

# Apply DBSCAN. Points it cannot assign to a cluster get the label -1 (noise);
# they are kept below as a group of their own, labelled "Cluster -1".
dbscan = DBSCAN(eps=0.01, min_samples=2)
predicted_clusters = dbscan.fit_predict(X_reduced)

# Group the summaries by cluster label in one pass (instead of rescanning all
# labels once per cluster); labels are converted to plain ints.
summaries_by_cluster = {}
for summary_text, cluster_label in zip(predicted_summaries, predicted_clusters):
    summaries_by_cluster.setdefault(int(cluster_label), []).append(summary_text)

# Print the predicted clusters in ascending label order so output is stable.
print("Predicted Clusters:")
cluster_info = []
for cluster_id in sorted(summaries_by_cluster):
    cluster_summaries = summaries_by_cluster[cluster_id]

    cluster_info.append({
        "Cluster": cluster_id,
        "Summaries": cluster_summaries
    })

    print(f"Cluster {cluster_id}: {', '.join(cluster_summaries)}")

# Plot the clusters
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=predicted_clusters, cmap='viridis')
plt.title('DBSCAN Clustering of Predicted Summaries')
plt.show()

# Print the 2D list of clusters
print("\n2D List of Clusters:")
for info in cluster_info:
    print(f"Cluster {info['Cluster']}: {info['Summaries']}")


In [None]:
# Summarize each cluster's concatenated summaries into a single tag.
# A list with first-seen de-duplication replaces the original set: iterating a
# set of strings can give a different order after each kernel restart (string
# hashing is randomized per process), which reordered the final sentence and
# changed the similarity scores from run to run.
final_summaries = []

for info in cluster_info:
    summary = summarize(" ".join(info['Summaries']))
    if summary not in final_summaries:
        final_summaries.append(summary)


In [None]:
# Display the de-duplicated cluster-level summaries.
list(final_summaries)

In [None]:
def tags_to_sentence(tags):
    """Build one sentence that lists ``tags`` after a fixed lead-in.

    Args:
        tags: Iterable of strings.

    Returns:
        The lead-in text followed by the tags joined with ', ' (just the
        lead-in, including its trailing space, when ``tags`` is empty).
    """
    joined_tags = ", ".join(tags)
    return f"Customers like this product due to its {joined_tags}"

In [None]:
# Turn the cluster-level summaries into a single sentence and display it.
final_sentence_summary = tags_to_sentence(list(final_summaries))
final_sentence_summary

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Cache for the sentence encoder, so it is loaded from TF Hub once per kernel
# instead of on every calculate_similarity() call.
_sentence_encoder = None


def calculate_similarity(sentence1, sentence2):
    """Score how similar two sentences are using Universal Sentence Encoder embeddings.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The inner product of the two sentence embeddings.
        NOTE(review): this equals cosine similarity only if the encoder returns
        unit-length vectors — not verified here.
    """
    global _sentence_encoder
    if _sentence_encoder is None:
        _sentence_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    embeddings = _sentence_encoder([sentence1, sentence2])
    # Entry [0, 1] of the pairwise inner-product matrix is sentence1 vs sentence2.
    similarity = np.inner(embeddings, embeddings)[0, 1]
    return similarity


In [None]:
# Reference texts that the generated sentence is compared against in the
# similarity cells that follow.
# NOTE(review): presumably taken from Amazon's AI-generated review summary and
# tags for this product — confirm the source.
amazon_ai_summary = "Customers like the taste, quality, value, and texture of the snack mix. They mention that the flavors are good, the texture is crunchy and savory, and the size is a great value."
amazon_ai_tags = ["good taste", "good quality", "good value", "good texture", "good size", "good packaging"]

In [None]:
# Similarity between the reference summary paragraph and the generated sentence.
calculate_similarity(amazon_ai_summary, final_sentence_summary)

In [None]:
# Put the reference tags through the same sentence template as the generated
# tags, so both sides share the lead-in text.
ai_tag_sentence = tags_to_sentence(amazon_ai_tags)
ai_tag_sentence

In [None]:
# Similarity between the reference tag sentence and the generated sentence.
calculate_similarity(ai_tag_sentence, final_sentence_summary)