In [45]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import transformers
import time

In [124]:
# Load the review sheet; later cells read its 'body', 'rating' and 'product' columns.
df = pd.read_excel(r'sony-headphones.xlsx', sheet_name='Sheet1')
# Blank out missing bodies before the str cast: a bare str() turns NaN into the
# literal text "nan", which would then be tokenized and ranked as a review sentence.
df['body'] = df['body'].fillna('').apply(str)
# NLTK's English stop-word list, consumed by remove_stopwords
# (requires the 'stopwords' corpus to have been downloaded once).
stop_words = stopwords.words('english')

In [125]:
def remove_stopwords(sen):
    """Return the tokens of ``sen`` that are not stop words, joined by single spaces.

    ``sen`` is iterated item by item (the caller in this notebook passes a
    whitespace-split sentence); each item is kept unless it appears in the
    module-level ``stop_words`` collection.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [126]:
# Parsed GloVe tables keyed by file path, so the embedding file is read once per
# kernel session instead of once per my_summarizer call.
_GLOVE_CACHE = {}


def _load_glove(path):
    """Parse a GloVe text file into a {word: float32 vector} dict, cached by path."""
    if path not in _GLOVE_CACHE:
        embeddings = {}
        # `with` closes the handle even if a line fails to parse.
        with open(path, encoding='utf-8') as glove_file:
            for line in glove_file:
                values = line.split()
                if not values:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
        _GLOVE_CACHE[path] = embeddings
    return _GLOVE_CACHE[path]


def my_summarizer(dataframe, glove_path=None):
    """Build an extractive summary of ``dataframe['body']`` by ranking its sentences.

    Every body is split into sentences, each sentence is cleaned (non-letters
    replaced by spaces, lower-cased, stop words removed) and represented by the
    average of its words' GloVe vectors. Pairwise cosine similarities form a
    weighted graph, PageRank scores its nodes, and the highest-scoring
    sentences are returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a 'body' column of strings.
    glove_path : str, optional
        Location of the 100-dimensional GloVe text file. Defaults to
        ``glove.6B/glove.6B.100d.txt`` relative to the working directory,
        built with ``os.path.join`` so it works on every platform.

    Returns
    -------
    str
        The top 10 ranked sentences joined by single spaces; if fewer than 10
        sentences exist the top 5 are used, and if fewer than 5 the top 1.
        An empty string when there are no sentences at all.

    Side effects
    ------------
    Prints the number of sentences that were vectorized.
    """
    import os  # local import so this cell does not depend on editing the import cell

    if glove_path is None:
        glove_path = os.path.join('glove.6B', 'glove.6B.100d.txt')

    embedding_dim = 100  # must match the dimensionality of the GloVe file

    sentences = [sent for body in dataframe['body'] for sent in sent_tokenize(body)]
    if not sentences:
        # Nothing to rank; keep the count print so the cell output stays uniform.
        print(0)
        return ''

    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
    clean_sentences = [remove_stopwords(s.lower().split()) for s in clean_sentences]

    word_embeddings = _load_glove(glove_path)

    zero_vector = np.zeros((embedding_dim,))
    sentence_vectors = []
    for cleaned in clean_sentences:
        words = cleaned.split()
        if words:
            # Unknown words contribute a zero vector; the +0.001 keeps the
            # original smoothing of the average.
            vector = sum(word_embeddings.get(w, zero_vector) for w in words) / (len(words) + 0.001)
        else:
            vector = zero_vector
        sentence_vectors.append(vector)

    print(len(sentence_vectors))

    # One vectorized call computes every pairwise similarity; the per-pair loop
    # issued one cosine_similarity call for each of the n*(n-1) sentence pairs.
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    # A sentence must not vote for itself in PageRank.
    np.fill_diagonal(sim_mat, 0.0)

    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Same 10 -> 5 -> 1 fallback as before, chosen by an explicit length check
    # instead of nested bare `except:` blocks. `sentences` is non-empty here,
    # so the final candidate (1) always matches.
    top_n = next(n for n in (10, 5, 1) if n <= len(ranked_sentences))

    # Join with a space so the last word of one sentence is not fused with the
    # first word of the next before the text reaches the downstream tokenizer.
    return ' '.join(sentence for _, sentence in ranked_sentences[:top_n])
        
    

In [127]:
# One sub-frame per star rating (1 through 5), in ascending order.
df1, df2, df3, df4, df5 = [df[df['rating'] == stars] for stars in range(1, 6)]

# Extractive summary for each rating; my_summarizer prints the sentence count
# of every group as it runs, in the same 1..5 order.
text1, text2, text3, text4, text5 = [
    my_summarizer(rating_frame) for rating_frame in (df1, df2, df3, df4, df5)
]

1901
1791
2469
2830
7958


In [128]:
# Name the checkpoint explicitly instead of relying on the library's task
# default: the recorded run fell back to t5-small, and pinning it keeps results
# reproducible if that default ever changes.
summarizer = transformers.pipeline("summarization", model="t5-small")

No model was supplied, defaulted to t5-small (https://huggingface.co/t5-small)
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [129]:
# Abstractive pass over each extractive summary, with identical length bounds
# for every rating.
# NOTE(review): the recorded run warns that one input exceeds the tokenizer's
# 512-token limit (857 > 512); consider whether the inputs should be truncated
# or shortened — confirm against the model's behavior.
summarized1, summarized2, summarized3, summarized4, summarized5 = [
    summarizer(extract, min_length=75, max_length=300)
    for extract in (text1, text2, text3, text4, text5)
]

Token indices sequence length is longer than the specified maximum sequence length for this model (857 > 512). Running this sequence through the model will result in indexing errors


In [130]:
# Build the result table in one shot from a list of records.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame row by row copies the whole frame on every call.

# Every row uses the same cell for the product name (label 1 of 'product'),
# so look it up once.
product_name = df['product'][1]

# NOTE(review): each summary is stored as the raw pipeline return value (a list
# holding a {'summary_text': ...} dict, per the printed table), not the bare
# text — confirm whether only the text should be kept.
review_records = [
    {'product': product_name, 'rating': rating_label, 'summary': pipeline_output}
    for rating_label, pipeline_output in (
        ('1', summarized1),
        ('2', summarized2),
        ('3', summarized3),
        ('4', summarized4),
        ('5', summarized5),
    )
]
df_review = pd.DataFrame(review_records, columns=['product', 'rating', 'summary'])

In [131]:
# Plain-text dump of the assembled review table (product, rating, summary).
print(df_review)

                                             product rating  \
0  Sony WH-1000XM4 Wireless Industry Leading Nois...      1   
1  Sony WH-1000XM4 Wireless Industry Leading Nois...      2   
2  Sony WH-1000XM4 Wireless Industry Leading Nois...      3   
3  Sony WH-1000XM4 Wireless Industry Leading Nois...      4   
4  Sony WH-1000XM4 Wireless Industry Leading Nois...      5   

                                             summary  
0  [{'summary_text': 'this is a common issue for ...  
1  [{'summary_text': 'the bose QC35s block/cancel...  
2  [{'summary_text': 'the urethane ear pads on ot...  
3  [{'summary_text': 'this is NOT an 'audiophile'...  
4  [{'summary_text': 'the sound quality is amazin...  


In [132]:
# Stamp the export with the current local time (YYYYMMDD-HHMMSS) so each run
# writes to a distinct file name.
timestr = time.strftime("%Y%m%d-%H%M%S")
output_name = 'sony-headphones-review %s.xlsx' % timestr
df_review.to_excel(output_name, index=False, sheet_name='review')
print("File 'sony-headphones-review %s.xlsx' saved successfully" % timestr)

File 'sony-headphones-review 20211128-214526.xlsx' saved successfully
