In [None]:
!pip install git+https://github.com/luanssouza/recsummarizer -q
!pip install stanza -q

In [None]:
!wget https://raw.githubusercontent.com/luanssouza/recsummarizer/main/resources/BNC_nouns.csv -q

In [None]:
import stanza
import pandas as pd

In [None]:
# Download the English Stanza models without progress output.
stanza.download('en', verbose=False)

# Build the English pipeline; the resulting `nlp` object is handed to the
# recsummarizer classes in the cells that follow.
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,mwt,pos,sentiment',
    verbose=False,
)

In [None]:
from recsummarizer.review import StanzaReview
from recsummarizer.normalize.tf_idf_normalizer import TfIdfNormalizer
from recsummarizer.persistence.stanza_persistence import StanzaPersistence
from recsummarizer.preprocess.stanza_preprocess import StanzaPreProcess
from recsummarizer.centroid.centroid import Centroid
from recsummarizer.embedding.word2vec_embedding import Word2VecEmbedding
from recsummarizer.embedding.bert_embedding import BertEmbedding
from recsummarizer.extractor import epsilon_aspects_extraction
from recsummarizer.item import StanzaItem
from recsummarizer.corpus import CsvGeneralCorpus

In [None]:
# Wrap a two-sentence text in a StanzaReview, using the pipeline built above.
review = StanzaReview(
    "Barack Obama was born in Hawaii. The director is really good.",
    nlp,
)

# Show, one per line: the stored raw text, the noun counts, and the first sentence.
# In the recorded output the noun keys are lower-cased and the raw text has no
# space between the two sentences.
print(
    review.raw_review,
    review.nouns_occurrences,
    review.sentences[0],
    sep="\n",
)

Barack Obama was born in Hawaii.The director is really good.
Counter({'barack': 1, 'obama': 1, 'hawaii': 1, 'director': 1})
Barack Obama was born in Hawaii.


In [None]:
# Sentiment attached to the second sentence ("The director is really good.").
# NOTE(review): presumably Stanza's 0/1/2 = negative/neutral/positive scale — confirm.
second_sentence = review.sentences[1]
print(second_sentence.sentiment)

2


In [None]:
# Two toy reviews used as input for the remaining cells.
raw_reviews = [
    "The director is really good. The movie is awesome! You will definetly enjoy it! The scenes are the best!",
    "The racer is really good.",
]

# A single item (id 0) that owns both reviews; `items` is what the
# pre-processing cell further down receives.
items = [dict(id=0, reviews=raw_reviews)]

# Re-bind `review` to the first (four-sentence) toy review.
review = StanzaReview(raw_reviews[0], nlp)

# Raw text first, noun counts second, each on its own line.
print(review.raw_review, review.nouns_occurrences, sep="\n")

The director is really good.The movie is awesome!You will definetly enjoy it!The scenes are the best!
Counter({'director': 1, 'movie': 1, 'scenes': 1})


In [None]:
# Load the downloaded noun table (indexed by its 'noun' column) and wrap it as
# the general corpus.
bnc_nouns = pd.read_csv('./BNC_nouns.csv', index_col='noun')
general_corpus = CsvGeneralCorpus(bnc_nouns)

# Item 0 built from both toy reviews.
item = StanzaItem(0, raw_reviews, general_corpus, nlp)

# The four steps below are kept in their original order.
# NOTE(review): kl_values() presumably populates item.kl_nouns_values, which the
# next line reads — confirm before reordering anything.
item.kl_values()
item.aspects_score = epsilon_aspects_extraction(item.kl_nouns_values, -20)
item.top_k_aspects_evaluation(20)
item.sentence_filtering()

# Print the aspect scores, then the filtered sentences as plain text, then the
# (text, Sentence object) pairs — one result per line.
print(
    item.aspects_score,
    item.filtered_sentences,
    item.filtered_sentences_nn,
    sep="\n",
)

{'director': -9.711842668281289, 'movie': -7.943427767876373, 'scenes': inf, 'racer': -5.420534999272286}
['The director is really good.', 'The scenes are the best!', 'The racer is really good.']
[('The director is really good.', <recsummarizer.sentence.sentence.Sentence object at 0x7f1da3716e10>), ('The scenes are the best!', <recsummarizer.sentence.sentence.Sentence object at 0x7f1da3711350>), ('The racer is really good.', <recsummarizer.sentence.sentence.Sentence object at 0x7f1da3711690>)]


In [None]:
# Embedding used by the persistence step below.
# The model name comes from the pretrained-model list at:
# https://www.sbert.net/docs/pretrained_models.html
embedding = BertEmbedding('all-MiniLM-L6-v2') 

In [None]:
# Create the normalizer instance; it is passed to the centroid below.
normalizer = TfIdfNormalizer()

# Create the centroid instance from that normalizer.
# NOTE(review): 0.35 is presumably a threshold — TODO confirm against Centroid.
centroid = Centroid(normalizer, 0.35)

# Create the persistence instance from the './data/' path plus the embedding and
# centroid built above. The summarizer cells later use the same './data/' path.
persistence = StanzaPersistence('./data/', embedding, centroid)

# Create an instance of StanzaPreProcess.
# NOTE(review): -20 matches the value given to epsilon_aspects_extraction in an
# earlier cell; the meaning of 5 is not visible here — TODO confirm.
preprocess = StanzaPreProcess(-20, 5)

# Pre-process every entry of `items` with the objects created above.
# NOTE(review): `proprocess` (sic) — the recorded output shows this call ran, so
# this appears to be the library's own spelling; do not rename it without checking.
preprocess.proprocess(items, persistence, general_corpus, nlp)

Item processed: 0




In [None]:
from recsummarizer.summarize.summarizer_baseline import SummarizerBaseline

# Baseline summarizer pointed at the same './data/' path given to
# StanzaPersistence in the pre-processing cell.
# NOTE(review): the meaning of 0.90 and 5 is not visible here — TODO confirm.
baseline_summarizer = SummarizerBaseline('./data/', 0.90, 5)

# Summarize item 0 and show the selected sentences.
print(baseline_summarizer.summarize(0))

['The racer is really good.', 'The director is really good.', 'The scenes are the best!']


In [None]:
from recsummarizer.summarize.summarizer_clusters_frequency import SummarizerClustersFrequency

# Cluster-frequency summarizer, constructed with the same three arguments as the
# baseline summarizer cell ('./data/', 0.90, 5).
clusters_summarizer = SummarizerClustersFrequency('./data/', 0.90, 5)

# Summarize item 0.
# NOTE(review): the second argument (3) is not explained in this notebook — TODO confirm.
print(clusters_summarizer.summarize(0, 3))

['The racer is really good.', 'The director is really good.', 'The scenes are the best!']
