In [14]:
%%writefile ../my_module/Start.py

"""
Contains functionality for selecting the most relevant wines and for summarizing a user's description.
"""

from normalize_text import normalize
from text_cleanup import cleanup
from generate_summary import summary
from search_by_description import top_search_by_description

# Interactive entry point: show the menu, read the option, then either print a
# summary of the entered text or list the wines that best match it.
print("1: Obtaining a summarization from the description of wine\n2: Obtaining the top-5 wines that best match the description")
choice = input("Select work option: ")
# Re-prompt until one of the two supported options is entered.
while choice not in ('1', '2'):
    choice = input("Invalid input format. Select one of the options below: ")

text = input("Enter text to summarize: ")
if choice == '1':
    print("\nOriginal Text:\n")
    print(text)
    print('\nSummarized text:\n')
    # summary() accepts only the cleaned text; cleanup() returns a
    # one-element Series, hence the [0].
    print(summary(cleanup(normalize(text))[0]))
else:
    top_search_by_description(summary(cleanup(normalize(text))[0]))

Overwriting ../my_module/Start.py


In [15]:
%%writefile ../my_module/normalize_text.py

import re

def normalize(text):
    """Strip markup residue and line breaks from a text.

    Three things are removed, in this order: every ``<pre>...</pre>`` block
    together with its content (also across lines), every ``<...>`` tag that
    is immediately followed by ``©`` (the ``©`` goes with it), and every
    newline character. Nothing is inserted in place of what is removed.

    Args:
        text: text in str format.

    Returns:
        text in str format with the pieces listed above removed.

    Example usage:
        test_text = normalize(text='Hello world!?')
    """
    patterns = ('<pre>.*?</pre>', '<[^>]+>©')
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.DOTALL)
    # Gluing the pieces back with an empty separator drops every newline.
    return ''.join(text.split('\n'))

Overwriting ../my_module/normalize_text.py


In [16]:
%%writefile ../my_module/text_cleanup.py

import pandas as pd
import spacy
# Load the 'en_core_web_lg' spaCy pipeline once at import time; cleanup() reuses it.
nlp = spacy.load('en_core_web_lg')
from nltk.corpus import stopwords
# Rebinds the name `stopwords`: from here on it holds the English stop words
# returned by the NLTK corpus reader, not the reader itself.
stopwords = stopwords.words('english')

# Characters dropped by cleanup(). This is a plain str, so cleanup()'s
# `tok not in punctuations` is a substring test. Note that '.' is not listed.
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~©'
# Reduce a text to its lower-cased lemmas, leaving out pronoun placeholders,
# stopwords and punctuation.
def cleanup(docs):
    """Lemmatize a text and filter out noise tokens.

    The text is run through the module-level spaCy pipeline with the parser
    and NER components disabled. Each token is replaced by its lower-cased,
    stripped lemma; tokens whose lemma is '-PRON-', and lemmas found in
    `stopwords` or `punctuations`, are dropped. The remaining lemmas are
    joined with single spaces.

    Args:
        docs: text in str format.

    Returns:
        pandas Series with a single element: the cleaned text in str format.

    Example usage:
        test_text = cleanup('Hello world!?')[0]
    """
    parsed = nlp(docs, disable=['parser', 'ner'])
    kept = []
    for token in parsed:
        if token.lemma_ == '-PRON-':
            continue
        lemma = token.lemma_.lower().strip()
        if lemma in stopwords or lemma in punctuations:
            continue
        kept.append(lemma)
    return pd.Series([' '.join(kept)])

Overwriting ../my_module/text_cleanup.py


In [17]:
%%writefile ../my_module/generate_summary.py

from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

def summary(cleaned_text, *, num_sentences=3):
    """Summarizes text.

    Splits the text into sentences, scores every sentence by the sum of its
    TF-IDF term weights and returns the highest-scoring sentences.

    Args:
        cleaned_text: text in str format.
        num_sentences: maximum number of sentences kept in the summary
            (keyword-only, defaults to 3).

    Returns:
        text in str format: the selected sentences joined with single spaces,
        ordered from highest to lowest score. An empty string is returned
        when the text contains no sentences.

    Example usage:
        test_text = summary('Hello world. Nice wine with cherry notes.')
    """
    sentences = sent_tokenize(cleaned_text)
    if not sentences:
        return ''

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # The vectorizer raises ValueError when no sentence contains a usable
        # term (empty vocabulary). Nothing can be ranked then, so keep the
        # leading sentences instead of crashing.
        return ' '.join(sentences[:num_sentences])

    # Rows of the matrix are sentences and columns are terms, so summing along
    # axis=1 gives one score per sentence. (axis=0 would give one total per
    # term, which cannot be indexed by sentence position.)
    scores = tfidf_matrix.toarray().sum(axis=1)
    ranked_sentences = sorted(zip(scores, sentences), reverse=True)
    return ' '.join(sentence for _, sentence in ranked_sentences[:num_sentences])

Overwriting ../my_module/generate_summary.py


In [18]:
%%writefile ../my_module/search_by_description.py

import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

# Load the saved Doc2Vec model at import time. The path is relative, so it is
# resolved against the current working directory.
model_downloaded = Doc2Vec.load("../models/d2v.model")
# Wine reviews table; the first CSV column is used as the DataFrame index.
reviews_with_summary = pd.read_csv("../data/wine_reviews_with_summary.csv", index_col=0, encoding='utf-8')
# Cast the 'summary' column to str.
reviews_with_summary['summary'] = reviews_with_summary['summary'].astype(str)

def top_search_by_description(description, amount = 5):
    """Select most relevant wines from dataset according to users description.

    Tokenizes the description, infers its vector with the loaded Doc2Vec
    model and prints the wines whose document vectors are most similar.

    Args:
        description: text in str format.
        amount: number of wines listed.

    Returns:
        None. Up to `amount` wines relevant to the description are printed.

    Example usage:
        top_search_by_description(description='Hello world!?', amount = 5)
    """
    # Infer a vector for a document that is not part of the training data.
    tokens = word_tokenize(description)
    description_vec = model_downloaded.infer_vector(tokens)
    # Request only as many neighbours as will be shown, rather than ranking
    # the whole corpus and keeping a fixed five.
    sims = model_downloaded.dv.most_similar([description_vec], topn=amount)
    print('The most suitable wines according to the description:\n')
    # Iterating over the result itself keeps this safe when fewer than
    # `amount` matches are returned.
    for tag, similarity in sims:
        index = int(tag)
        acc = float(similarity) * 100
        print(f"Vine title: {reviews_with_summary['title'][index]}, vine variety: {reviews_with_summary['variety'][index]}"
            f", WineEnthusiast points: {reviews_with_summary['points'][index]:.0f}. Coincidence: {acc:.2f}%.")

Overwriting ../my_module/search_by_description.py
