## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import transformers
import time

## Reading Data

In [2]:
# Location of the scraped review workbook (absolute path on the author's machine).
reviews_path = r'C:\Users\namra\Downloads\sony-headphones.xlsx'
df = pd.read_excel(reviews_path)

# Force every review body to str so that sentence tokenisation never receives a non-string cell.
df['body'] = df['body'].map(str)

# English stopword list from NLTK; read later by remove_stopwords().
stop_words = stopwords.words('english')

## Loading GloVe Word Embeddings

In [None]:
# Parse the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line of the file holds a token followed by its space-separated float components.
word_embeddings = {}

# The path uses a forward slash: the previous literal contained the invalid escape
# sequence '\g' and only resolved on Windows, whereas open() accepts '/' on every platform.
# The `with` block guarantees the file is closed even if parsing a line fails.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [125]:
# Strips stopwords such as 'i', 'me', 'my', 'myself', 'we', 'our', 'ours' from a tokenised sentence.
def remove_stopwords(sen):
    """Return the tokens of ``sen`` that are not stopwords, joined by single spaces.

    ``sen`` is an iterable of word tokens; membership is checked against the
    notebook-level ``stop_words`` collection. An empty input yields ``''``.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [126]:
def my_summarizer(dataframe, sentence_counts=(10, 5, 1)):
    """Build an extractive (TextRank-style) summary of ``dataframe['body']``.

    Every review body is split into sentences. Each sentence is embedded as the
    smoothed mean of the GloVe vectors of its non-stopword tokens, the sentences
    are ranked with PageRank over their pairwise cosine similarities, and the
    top-ranked ones are returned as a single string.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``'body'`` column whose values are strings.
    sentence_counts : sequence of int, optional
        Candidate summary sizes, tried in order; the first size that does not
        exceed the number of available sentences is used. The default keeps the
        original 10 -> 5 -> 1 fallback.

    Returns
    -------
    str
        The selected sentences, highest score first, joined by single spaces.
        ``''`` when the frame yields no sentences or no candidate size fits.

    Notes
    -----
    Reads the notebook-level ``word_embeddings`` dict and calls
    ``remove_stopwords``. Prints the number of sentences as a progress indicator.
    """
    # Tokenise every review into sentences and flatten into one list.
    sentences = [
        sentence
        for body in dataframe['body']
        for sentence in sent_tokenize(body)
    ]
    if not sentences:
        # Nothing to rank; avoids building an empty similarity matrix.
        return ''

    # Replace non-alphabetical characters with spaces, lower-case, drop stopwords.
    clean_sentences = [
        remove_stopwords(re.sub("[^a-zA-Z]", " ", sentence).lower().split())
        for sentence in sentences
    ]

    # Infer the embedding size from the loaded vectors instead of hard-coding 100
    # (100 remains the fallback when no embeddings are loaded).
    sample_vector = next(iter(word_embeddings.values()), None)
    embedding_dim = 100 if sample_vector is None else len(sample_vector)
    zero_vector = np.zeros((embedding_dim,))

    # One row per sentence: mean word vector, with +0.001 smoothing in the
    # denominator. Sentences left empty after cleaning keep the all-zero row.
    sentence_vectors = np.zeros((len(sentences), embedding_dim))
    for row, cleaned in enumerate(clean_sentences):
        words = cleaned.split()
        if words:
            sentence_vectors[row] = (
                sum(word_embeddings.get(w, zero_vector) for w in words)
                / (len(words) + 0.001)
            )

    print(len(sentence_vectors))

    # Full pairwise cosine-similarity matrix in a single call (replaces n*n
    # separate 1x1 calls); the diagonal is zeroed so sentences do not vote for themselves.
    sim_mat = cosine_similarity(sentence_vectors)
    np.fill_diagonal(sim_mat, 0.0)

    # Rank sentences with PageRank on the similarity graph.
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
    )

    # Pick the first candidate size that fits. This replaces the nested bare
    # `except:` blocks, which also hid unrelated errors.
    n_selected = next(
        (n for n in sentence_counts if n <= len(ranked_sentences)), 0
    )

    # Join with a space: plain concatenation glued the last word of one sentence
    # to the first word of the next, which corrupts the text handed to the summarizer.
    return " ".join(sentence for _, sentence in ranked_sentences[:n_selected])
        
    

In [None]:
# Split the reviews into one frame per star rating (1 through 5).
df1, df2, df3, df4, df5 = [
    df[df['rating'] == stars] for stars in (1, 2, 3, 4, 5)
]

# Extractive summary text for each rating bucket, computed in rating order.
text1, text2, text3, text4, text5 = [
    my_summarizer(bucket) for bucket in (df1, df2, df3, df4, df5)
]

1901
1791


## Applying Hugging Face Transformers for Text Summarization

In [None]:
# Hugging Face summarization pipeline. No model is named here, so the checkpoint
# is whatever the installed transformers version selects for this task.
summarizer = transformers.pipeline(task="summarization")

In [None]:
# Length bounds shared by all five abstractive summaries.
summary_kwargs = {'min_length': 75, 'max_length': 300}

# Run the pipeline over each rating's extractive text, in rating order.
summarized1, summarized2, summarized3, summarized4, summarized5 = [
    summarizer(extract, **summary_kwargs)
    for extract in (text1, text2, text3, text4, text5)
]

In [None]:
# Number of reviews for each star rating 1..5.
countList = [len(df[df['rating'] == stars]) for stars in range(1, 6)]

# Share of all reviews per rating, rounded to three decimals and then scaled to percent.
percentageList = [
    (round(float(count / len(df)), 3)) * 100 for count in countList
]

In [None]:
def _summary_text(pipeline_output):
    """Return the plain summary string from a summarization-pipeline result.

    The pipeline returns ``[{'summary_text': ...}]`` for a single input; a value
    of any other shape is passed through unchanged.
    """
    try:
        return pipeline_output[0]['summary_text']
    except (IndexError, KeyError, TypeError):
        return pipeline_output


# Product name taken from the row labelled 1, as before.
# NOTE(review): presumably every row carries the same product name - confirm.
product_name = df['product'][1]

# DataFrame.append was removed in pandas 2.0, so the rows are collected first
# and the frame is built once. Ratings stay strings ('1'..'5') as before.
review_rows = [
    {
        'product': product_name,
        'rating': str(stars),
        # Store the summary text itself rather than the raw list-of-dict output.
        'summary': _summary_text(summary_output),
        'Counts': count,
        'Percentage of Total': percentage,
    }
    for stars, (summary_output, count, percentage) in enumerate(
        zip(
            (summarized1, summarized2, summarized3, summarized4, summarized5),
            countList,
            percentageList,
        ),
        start=1,
    )
]

df_review = pd.DataFrame(
    review_rows,
    columns=['product', 'rating', 'summary', 'Counts', 'Percentage of Total'],
)

In [None]:
# Show the assembled per-rating review table as plain text.
print(df_review)

## Saving the Summarized Text to Excel

In [None]:
# Timestamp suffix (YYYYMMDD-HHMMSS) so repeated runs do not overwrite earlier exports.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Write the review table to a single 'review' sheet, without the index column.
output_name = 'sony-headphones-review %s.xlsx' %timestr
df_review.to_excel(output_name, sheet_name='review', index=False)

print('File \'sony-headphones-review %s.xlsx\' saved successfully' %timestr)