# ***This file attempts at turning some of our seq2seq model summaries into natural language since we are getting very short two word summaries***

In [1]:
!pip install transformers



# ***Using GPT-2 with prompt***

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the pad token to the EOS token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

predicted_summaries = [
    "dog loves", "love", "great product", "great deal", "vegetarian"
]

expanded_summaries = []

for summary in predicted_summaries:
    prompt = f"Turn this product review into a complete sentence: {summary}"
    inputs = tokenizer.encode_plus(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=50)
    
    # Generate a response to the prompt
    outputs = model.generate(**inputs, max_length=60, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    expanded_summaries.append(generated_text)
    print("Prompt: ", prompt)
    print("Generated Text: \n", generated_text)

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Prompt:  Turn this product review into a complete sentence: dog loves
Generated Text: 
 Turn this product review into a complete sentence: dog loves to play with toys.

The dog loves
Prompt:  Turn this product review into a complete sentence: love
Generated Text: 
 Turn this product review into a complete sentence: love it.

I've been using this product
Prompt:  Turn this product review into a complete sentence: great product
Generated Text: 
 Turn this product review into a complete sentence: great product.

I've been using this product for
Prompt:  Turn this product review into a complete sentence: great deal
Generated Text: 
 Turn this product review into a complete sentence: great deal.

I've been using this product for
Prompt:  Turn this product review into a complete sentence: vegetarian
Generated Text: 
 Turn this product review into a complete sentence: vegetarianism is a bad idea.

I'm


# ***Attempt at using a rule-based POS tagger for sentence generation***

In [3]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def complete_sentence(phrase):
    words = word_tokenize(phrase)
    pos_tags = nltk.pos_tag(words)
    
    if len(words) == 1:
        if pos_tags[0][1] in ['NN', 'NNP']:
            return f"This product is called {words[0]}."
        elif pos_tags[0][1] == 'JJ':
            return f"This product is {words[0]}."
    
    elif len(words) == 2:
        if pos_tags[0][1] in ['NN', 'NNP'] and pos_tags[1][1] == 'VB':
            return f"Customers love the {words[0]}."
        elif pos_tags[0][1] == 'JJ' and pos_tags[1][1] in ['NN', 'NNP']:
            return f"This is a {phrase}."
        elif pos_tags[0][1] in ['NN', 'NNP'] and pos_tags[1][1] == 'JJ':
            return f"This {words[0]} is {words[1]}."
        elif 'deal' in words:
            return f"This product offers a {phrase}."
    
    # Special cases based on meaning
    if 'value' in words or 'delicious' in words:
        return f"Customers report this product is {phrase}."
    if words[-1] == 'product':
        return f"This is a {phrase}."

    # If none of the above rules apply, fall back to a default phrasing
    return f"This product is {phrase}."



expanded_summaries = [complete_sentence(summary) for summary in predicted_summaries]

print(expanded_summaries)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
['This product is dog loves.', 'This product is called love.', 'This is a great product.', 'This is a great deal.', 'This product is vegetarian.']
