In [21]:
import random
import markovify
from datasets import load_dataset
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer

In [22]:
# 1. Generare în limba română: Implementați un sistem care transformă un text (corpus) într-un lanț Markov și 
# folosiți-l pentru a generare un proverb sau o poezie în limba română (folosiți fișierele proverbRo.txt sau poezieRo.txt)

In [23]:
# Varianta 1 – Implementați un lanț Markov cu o singură stare sau

In [24]:
# Read the text
with open('data/proverbe.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [25]:
# Markov chain
words = text.split()
chain = {}
for i in range(len(words) - 1):
    current_word = words[i]
    next_word = words[i + 1]
    if current_word in chain:
        chain[current_word].append(next_word)
    else:
        chain[current_word] = [next_word]

In [26]:
# Generate a proverb using the Markov chain
length = 8
current_word = random.choice(list(chain.keys())).strip('.')
generated_text = [current_word]
for _ in range(length - 1):
    if current_word in chain:
        next_word = random.choice(chain[current_word])
        next_word = next_word.strip('.')
        generated_text.append(next_word)
        current_word = next_word
    else:
        break
generated_text[-1] += '.'
proverb = ' '.join(generated_text)

print("Proverb:")
print(proverb)

Proverb:
Pe cine nu locul nu iese soarele pe.


In [27]:
# Varianta 2 – Implementați un lanț Markov cu n-stări

In [28]:
# Read the text
with open('data/proverbe.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [29]:
# Markov chain n states
n = 4

words = text.split()
chain = {}
for i in range(len(words) - n):
    # Extract the current state
    current_state = tuple(words[i:i + n])
    # Get the next word after the current state
    next_word = words[i + n]
    # Update the chain dictionary with the current state and next word
    if current_state in chain:
        chain[current_state].append(next_word)
    else:
        chain[current_state] = [next_word]


In [30]:
# Generate using Markov chain - n states
length = 9
current_state = random.choice(list(chain.keys()))
generated_text = list(current_state)
for _ in range(length - n):
    if current_state in chain:
        # Randomly select the next word from the list of values associated with the current state
        next_word = random.choice(chain[current_state])
        generated_text.append(next_word)
        current_state = tuple(generated_text[-n:])
    else:
        break
generated_text = [word.strip('.') for word in generated_text]
if not generated_text[-1].endswith('.'):
    generated_text[-1] += '.'
proverb = ' '.join(generated_text)


print("Proverb")
print(proverb)

Proverb
zice popa Nu haina il face pe om Nu.


In [31]:
# 2. Generare în limba engleză:

In [32]:
# a. Folosiți biblioteca markovify (sau implementarea voastră de la problema 1) pentru a genera o strofă de poezie în limba engleză folosind unul din următoarele corpus-uri (sau orice altă sursă găsiți voi):

# https://huggingface.co/datasets/biglam/gutenberg-poetry-corpus
# https://github.com/tnhaider/english-gutenberg-poetry
# https://www.shakespeares-sonnets.com/all.php

In [33]:
# Get the poetry dataset
dataset = load_dataset("biglam/gutenberg-poetry-corpus")

poetry_text = ""
if "train" in dataset:
    poetry_data = dataset["train"]
    poetry_text = "\n".join(poetry_data["line"])
else:
    print("Failed to load data")
    poetry_text = None


In [34]:
# Train a Markov model
if poetry_text:
    text_model = markovify.NewlineText(poetry_text, state_size=2)
else:
    print("Failed to generate")

In [41]:
# Generate a poetry
gen_poetry = ""

gen_poetry = '\n'.join([text_model.make_sentence(tries=100) for _ in range(3)])  
if not gen_poetry.endswith((".", "!", "?", ";")):
    gen_poetry += "."

print("Generated:")
print(gen_poetry)

Generated:
From the new year come,
With a pale, green moon is a-bloom in the sombre rafters, that round him first
Bright hopes, that erst by Cato's foot was moved with wrath,.


In [42]:
# b. Calculați emoția textului generat, puteți folosi una din următoarele resurse:

# Natural Language Toolkit (nltk) SentimentIntensityAnalyzer
# TextBlob sentiment

In [43]:
# Sentiment analysis
blob = TextBlob(text)
sentiment_polarity = blob.sentiment.polarity
print("Sentiment Polarity:", sentiment_polarity)

Sentiment Polarity: -0.80078125


In [44]:
# c. Pentru a adresa limitările de creativitate în poezia generată înlocuiți aleator cuvinte cu sinonime. 
# Se cere ca sinonimele să fie obținute folosind embedding-uri. (i.e. Cuvântul ales e transformat în forma sa 
# embedded și se alege embedding-ul cel mai apropiat care este convertit la string)

In [45]:
import spacy
import numpy as np
from nltk.corpus import wordnet
from nltk.corpus import wordnet_ic

# Încărcăm modelul spaCy cu embeddings
nlp = spacy.load("en_core_web_md")

In [46]:
def find_synonym(word):
    last_similarity = 0.3
    synonym = None
    word_embedding = nlp(word).vector
    
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            lemma_embedding = nlp(lemma.name()).vector
            if np.count_nonzero(word_embedding) == 0 or np.count_nonzero(lemma_embedding) == 0:
                continue
            similarity = np.dot(word_embedding, lemma_embedding) / (np.linalg.norm(word_embedding) * np.linalg.norm(lemma_embedding))
            if similarity > last_similarity:
                last_similarity = similarity
                synonym = lemma.name()
    
    return synonym

In [47]:
def replace_with_synonyms(poetry):
    new_poetry = []
    for verse in poetry.split('\n'):
        for word in verse.split():
            if wordnet.synsets(word):
                synonym = find_synonym(word)
                new_poetry.append(synonym)
            else:
                new_poetry.append(word)
            new_poetry.append(" ")
        new_poetry.append('\n')
    return "".join(new_poetry)

In [48]:
print("Poetry:")
print(gen_poetry)

new_poetry = replace_with_synonyms(gen_poetry)

print("\nNew poetry:")
print(new_poetry)

Poetry:
From the new year come,
With a pale, green moon is a-bloom in the sombre rafters, that round him first
Bright hopes, that erst by Cato's foot was moved with wrath,.

New poetry:
From the new year come, 
With a pale, green moon be a-bloom in the sombre rafters, that round him first 
bright hopes, that erst by Cato's foot be moved with wrath,. 



In [49]:
# d. Salvați poezia care vi se pare cea mai reușită si trimiteti-o unui prieten.

In [50]:
# e. Calculați metrica BLEU (Bilingual Evaluation Understudy Score) pentru poezia aleasă

In [51]:
def n_gram_generator(sentence,n= 2,n_gram= False):
    '''
    N-Gram generator with parameters sentence
    n is for number of n_grams
    The n_gram parameter removes repeating n_grams 
    '''
    sentence = sentence.lower() # converting to lower case
    sent_arr = np.array(sentence.split()) # split to string arrays
    length = len(sent_arr)

    word_list = []
    for i in range(length+1):
        if i < n:
            continue
        word_range = list(range(i-n,i))
        s_list = sent_arr[word_range]
        string = ' '.join(s_list) # converting list to strings
        word_list.append(string) # append to word_list
        if n_gram:
            word_list = list(set(word_list))
    return word_list

In [52]:
import numpy as np
from collections import Counter  # Add this line
import math

def bleu_score(original,machine_translated):
    '''
    Bleu score function given a orginal and a machine translated sentences
    '''
    mt_length = len(machine_translated.split())
    o_length = len(original.split())

    # Brevity Penalty 
    if mt_length>o_length:
        BP=1
    else:
        penality=1-(mt_length/o_length)
        BP=np.exp(penality)

    # Clipped precision
    clipped_precision_score = []
    for i in range(1, 5):
        original_n_gram = Counter(n_gram_generator(original,i))
        machine_n_gram = Counter(n_gram_generator(machine_translated,i))

        c = sum(machine_n_gram.values())
        for j in machine_n_gram:
            if j in original_n_gram:
                if machine_n_gram[j] > original_n_gram[j]:
                    machine_n_gram[j] = original_n_gram[j]
            else:
                machine_n_gram[j] = 0

        #print (sum(machine_n_gram.values()), c)
        clipped_precision_score.append(sum(machine_n_gram.values())/c)

    #print (clipped_precision_score)

    weights =[0.25]*4

    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, clipped_precision_score))
    s = BP * math.exp(math.fsum(s))
    return s

original = """
My last thought is yet unknown;
And venerable walls against the dusty floor
To smile, to greet the dewy spring;
"""
machine_translated = """
My last thought remains unknown;
And venerable walls against the dusty floor
Smile, greeting the dewy spring;
"""

print (bleu_score(original, machine_translated))


0.6773811491339269
