This notebook generates summaries with the baseline models `TextRank` and `GPT2`.

In [1]:
import gensim
from gensim.summarization.summarizer import summarize as textrank
from summarizer import Summarizer, TransformerSummarizer

import pandas as pd
import re
import math

  from .autonotebook import tqdm as notebook_tqdm


In [54]:
# Read the Amazon reviews test CSV into a DataFrame, then display its
# (rows, columns) shape as the cell output.
raw_data = pd.read_csv(filepath_or_buffer="./20220420_amazon_reviews_test.csv")
raw_data.shape

(3105, 7)

In [55]:
# Approximate sentence count per review: one more than the number of sentence
# terminators (., ! or ?), which equals the number of pieces a split on those
# characters produces. Values are stringified first, so a missing review
# becomes the text "nan" and counts as a single piece.
# The pattern is a raw string to avoid invalid-escape-sequence warnings.
raw_data["num_sents"] = raw_data["review"].astype(str).str.count(r"[.!?]") + 1

In [56]:
# Preview the first five rows, including the new num_sents column.
raw_data.head(n=5)

Unnamed: 0,category,prod_id,rating,polarity,review,review_len,review_id,num_sents
0,Cell_Phones_and_Accessories,B003GLIDRM,1.0,negative,It doesn't work. The LCD has no back light tr...,30,0,4
1,Cell_Phones_and_Accessories,B003GLIDRM,5.0,positive,I damaged my LCD screen while replacing a brok...,54,1,5
2,Cell_Phones_and_Accessories,B003GLIDRM,5.0,positive,I am very pleased with this dealer and their p...,67,2,6
3,Cell_Phones_and_Accessories,B003GLIDRM,5.0,positive,This LCD works like a champ. I'm very happy w...,60,3,8
4,Cell_Phones_and_Accessories,B003GLIDRM,5.0,positive,I ordered this item on July 8th and received i...,35,4,6


In [57]:
def concat(arr):
    """Join every value of ``arr`` into one newline-separated string.

    Args:
        arr: A pandas object exposing ``.values`` (e.g. one group's column)
            whose entries are strings.

    Returns:
        A single ``str`` with the flattened values separated by ``"\\n"``.

    Raises:
        TypeError: If any value is not a string (raised by ``str.join``).
    """
    pieces = arr.values.flatten().tolist()
    return "\n".join(pieces)

In [58]:
def count_sentences(arr):
    """Return the average sentence count per row, relative to the longest text.

    Each value is split on ``.``, ``!`` and ``?``; the number of resulting
    pieces is used as its sentence count. The result is
    ``(sum of counts / number of rows) / (largest single count)``.

    Args:
        arr: A pandas object exposing ``.values`` and ``.shape`` whose entries
            are texts. Entries are passed through ``str()`` before splitting.

    Returns:
        A float; ``0.0`` when ``arr`` has no rows.
    """
    # Count per text. The previous version compared the running *total*
    # against the maximum, so the "maximum" always equalled the total and the
    # function degenerated to 1 / number_of_rows.
    counts = [len(re.split(r"[.!?]", str(s))) for s in arr.values.flatten()]
    if arr.shape[0] == 0 or not counts:
        return 0.0
    # max(counts) is at least 1 because re.split always yields one piece.
    return sum(counts) / arr.shape[0] / max(counts)

In [59]:
def gpt2_summarizer(corpus, max_length=70):
    """Summarize ``corpus`` with a GPT-2 based ``TransformerSummarizer``.

    The model (``transformer_type="GPT2"``, ``transformer_model_key="gpt2-medium"``)
    is built on the first call and cached on the function object, so repeated
    calls (one per product) do not reload it each time.

    Args:
        corpus: The text to summarize.
        max_length: Forwarded to the model call as ``max_length``
            (``min_length`` is fixed at 15).
            NOTE(review): confirm against the summarizer docs whether these
            bound sentence length rather than summary length.

    Returns:
        A one-element list containing the summary string.
    """
    gpt2_model = getattr(gpt2_summarizer, "_model", None)
    if gpt2_model is None:
        gpt2_model = TransformerSummarizer(transformer_type="GPT2", transformer_model_key="gpt2-medium")
        gpt2_summarizer._model = gpt2_model
    output = gpt2_model(corpus, min_length=15, max_length=max_length)
    summary = ''.join(output)
    return [summary]

In [60]:
def textrank_summarizer(corpus, word_count=70):
    """Summarize one text with gensim's ``summarize``.

    Args:
        corpus: The text to summarize (a single string).
        word_count: Passed through to gensim as its ``word_count`` argument.

    Returns:
        A one-element list holding the summary gensim returned.
    """
    summary = gensim.summarization.summarizer.summarize(corpus, word_count=word_count)
    return [summary]

In [61]:
# Build one document per product: all of its reviews joined by newlines.
# Only the text column is aggregated. `review_len` is numeric, so joining it
# as strings fails; older pandas silently dropped it with a FutureWarning and
# newer pandas raises. The resulting columns (prod_id, review) are unchanged.
df = raw_data[["prod_id", "review"]].groupby(["prod_id"], as_index=False).agg(concat).copy()

  df = raw_data[["prod_id", "review", "review_len"]].groupby(["prod_id"], as_index=False).agg(concat).copy()


In [62]:
# Per-product min / max / mean of sentence counts and review lengths,
# computed in a single groupby pass. The keyword order below fixes the order
# in which the columns are added to `df`.
product_stats = raw_data.groupby("prod_id").agg(
    min_sents=("num_sents", "min"),
    max_sents=("num_sents", "max"),
    avg_sents=("num_sents", "mean"),
    min_len=("review_len", "min"),
    max_len=("review_len", "max"),
    avg_len=("review_len", "mean"),
)
# Attach each statistic by matching on prod_id rather than relying on the two
# frames happening to share the same positional index.
for stat_name in product_stats.columns:
    df[stat_name] = df["prod_id"].map(product_stats[stat_name])
df.head()

Unnamed: 0,prod_id,review,min_sents,max_sents,avg_sents,min_len,max_len,avg_len
0,B0002VQDVM,It seems strange that Samsung would try so har...,3,15,7.8,19,190,96.666667
1,B0009MYS9S,I have used this for only a week on two charge...,3,14,7.2,14,128,68.6
2,B000BI4KKM,I have really enjoyed this headset. I was very...,2,13,6.4,17,145,64.733333
3,B000CQFMEQ,Okay the only problem I have with this phone i...,1,13,6.333333,22,186,88.866667
4,B000FYU4SO,"Before this phone, I had only LG, the latest b...",7,27,11.333333,69,199,128.0


In [63]:
def summarizer(data, method="gpt"):
    """Summarize each row's text and return one summary per product.

    Args:
        data: DataFrame whose first column is the product id and whose second
            column is the text to summarize. Any further columns are ignored.
        method: ``"textrank"`` to use ``textrank_summarizer`` or ``"gpt"`` to
            use ``gpt2_summarizer``.

    Returns:
        A DataFrame with columns ``prod_id``, ``summary`` and ``num_sents``,
        where ``num_sents`` is the number of pieces obtained by splitting the
        summary on ``.``, ``!`` and ``?``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Fail fast: previously an unknown method left `summary` unbound (or
    # silently reused the previous row's summary).
    if method not in ("textrank", "gpt"):
        raise ValueError(f"unknown method {method!r}; expected 'textrank' or 'gpt'")

    records = []
    for prod_id, text in zip(data.iloc[:, 0], data.iloc[:, 1]):
        if method == "textrank":
            summary = textrank_summarizer(corpus=text, word_count=70)
        else:
            summary = gpt2_summarizer(corpus=text, max_length=70)
        # Both helpers return a one-element list; unwrap the summary text.
        summary_text = summary[0]
        num_sents = len(re.split(r"[.!?]", summary_text))
        records.append([prod_id, summary_text, num_sents])
    return pd.DataFrame(records, columns=["prod_id", "summary", "num_sents"])

In [64]:
# Debugging aid: uncomment the two lines below to restrict the run to a single product.
#df = df[df["prod_id"] == "B003ZHPWKS"]
#df.head()

In [65]:
# Produce one TextRank summary per product from the aggregated reviews.
summaries = summarizer(data=df, method="textrank")

In [66]:
# Display the resulting summaries frame (prod_id, summary, num_sents).
summaries

Unnamed: 0,prod_id,summary,num_sents
0,B0002VQDVM,It seems strange that Samsung would try so har...,7
1,B0009MYS9S,2nd replacement set cracked after about two we...,6
2,B000BI4KKM,"Good design, especially the earpiece but somet...",6
3,B000CQFMEQ,"The range on this headset is pretty good, I sa...",2
4,B000FYU4SO,If you don't mind HUGE lies about battery life...,4
...,...,...,...
202,B00M179WOU,I really think you need to try this book for y...,3
203,B00MF8BJQE,Written during a different era when a persons ...,3
204,B00MN06YDM,And...to know this book is loosely based on Ke...,8
205,B00MWDIUKU,Very helpful and informative little book.\nGoo...,7


In [67]:
# Persist the TextRank summaries next to the notebook, without the index column.
summaries.to_csv(path_or_buf="./textrank_summaries.csv", index=False)