<a href="https://colab.research.google.com/github/abhayalekal74/smart-question-generation/blob/master/QAG_GRU_Stable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download GloVe Embedding

In [0]:
glove_version = "glove.6B"
glove_version_specific = "glove.6B.300d"
EMBEDDING_DIM = 300

In [0]:
import requests
import zipfile
import io

url = "http://nlp.stanford.edu/data/{}.zip".format(glove_version)
r = requests.get(url, stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

# Download SQuAD

In [0]:
train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
dev_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"

r = requests.get(train_url)
open('train.json', 'wb').write(r.content)
r = requests.get(dev_url)
open('dev.json', 'wb').write(r.content)

4370528

# Extract Data

In [0]:
def get_csq(para_data):
    contexts, sentences, questions, questions_output = list(), list(), list(), list()
    for p in para_data:
        for c in p.contexts:
            for q in c.questions:
                questions.append(q.text)
                questions_output.append(q.output_question_text)
                sentences.append(q.sentence)
                contexts.append(c)
    return contexts, sentences, questions, questions_output

In [0]:
import sys
import json
from pprint import PrettyPrinter


class Paragraph:
    def __init__(self):
        self.contexts = list()


    def add_context(self, c):
        self.contexts.append(c)


class Context:
    def __init__(self, c):
        self.context = c
        self.questions = list()


    def add_question(self, q):
        self.questions.append(q)


class Question:
    def __init__(self, q, s, a):
        self.text = q
        self.output_question_text = " ".join(q.split()[1:])
        self.is_impossible = False
        self.sentence = s
        self.answers = a

def extract_data(file_name):
    all_para_data = list()
    with open(file_name, 'r') as f:
        paragraphs = json.load(f)['data']
    for p in paragraphs:
        para_obj = Paragraph()
        for c in p['paragraphs']:
            context_text = c['context']
            context_obj = Context(context_text)
            for qa in c['qas']:
                answers = list()
                sentence = ''
                for a in qa['answers']:
                    ans_text = a['text']
                    if not sentence:
                        offset = a['answer_start']
                        try:
                            sen_start_index = context_text[:offset].rindex('.')
                        except:
                            sen_start_index = 0 
                        ans_end_index = offset + len(ans_text)
                        try:
                            sen_end_index = context_text[ans_end_index:].index('.')
                        except:
                            sen_end_index = len(context_text)
                        sentence = context_text[sen_start_index: sen_end_index + 1]
                    answers.append(ans_text)
                ques_obj = Question(qa['question'], sentence, answers)
                context_obj.add_question(ques_obj)
            para_obj.add_context(context_obj)
        all_para_data.append(para_obj)
    return all_para_data

In [0]:
train_data = extract_data('train.json')
contexts, sentences, questions, questions_output = get_csq(train_data)

# Create Embedding Dictionary

In [0]:
import numpy as np

embeddings = dict()

embeddings_file = '{}.txt'.format(glove_version_specific)

with open(embeddings_file, 'r') as f:
    for l in f.readlines():
        words = l.split()
        embeddings[words[0]] = np.asarray(words[1:], dtype=np.float32)
    embeddings['<unk>'] = np.random.rand(EMBEDDING_DIM)

In [0]:
def get_embedding(token):
    try:
        return embeddings[token]
    except KeyError:
        return embeddings['<unk>']

# Data Preprocessing

In [0]:
import re

def preprocess(text):
    tokens = text.lower().split()
    for i in range(len(tokens)):
        tokens[i] = re.sub('(^[^a-z0-9]+(?=[a-z0-9]))|((?<=[a-z0-9])[^a-z0-9]+$)', '',  tokens[i])
    return len(tokens), " ".join(tokens)

# Tokenize


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def fit_tokenizer(texts, max_most_frequent_words):
    tokenizer = Tokenizer(num_words=max_most_frequent_words, filters='')
    tokenizer.fit_on_texts(texts)
    return tokenizer


def get_padded_sequences_from_texts(texts, tokenizer, max_seq_length):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_seq_length)

In [0]:
# Get preprocessed contexts, sentences and questions
# Get the max length of sequences in all three

def get_preprocessed_data_and_max_seq_len(data):
    preprocessed_data = [preprocess(x) for x in data]
    max_len = max(d[0] for d in preprocessed_data)
    return max_len, [d[1] for d in preprocessed_data]


#TODO remove
TRIM_LENGTH = 500

context_max_len, contexts_preprocessed = get_preprocessed_data_and_max_seq_len([c.context for c in contexts[:TRIM_LENGTH]])
sentence_max_len, sentences_preprocessed = get_preprocessed_data_and_max_seq_len(sentences[:TRIM_LENGTH]) 
question_max_len, questions_preprocessed = get_preprocessed_data_and_max_seq_len(questions[:TRIM_LENGTH])
_, questions_output_preprocessed = get_preprocessed_data_and_max_seq_len(questions_output[:TRIM_LENGTH])

In [0]:
print (context_max_len, sentence_max_len, question_max_len)

326 77 22


In [0]:

# Use only the most frequent MAX_SRC_UNIQ_WORDS from src (contexts, sentences) data
# Use only the most frequent MAX_TAR_UNIQ_WORDS from tar (questions) data

MAX_SRC_UNIQ_WORDS = 10000
MAX_TAR_UNIQ_WORDS = 7000

# Fit two different tokenizers on src and tar data
src_tokenizer = fit_tokenizer([c.context for c in contexts], MAX_SRC_UNIQ_WORDS)
tar_tokenizer = fit_tokenizer(questions, MAX_TAR_UNIQ_WORDS)

MAX_SRC_UNIQ_WORDS = min(MAX_SRC_UNIQ_WORDS, len(src_tokenizer.word_index) + 1)
MAX_TAR_UNIQ_WORDS = min(MAX_TAR_UNIQ_WORDS, len(tar_tokenizer.word_index) + 1)


# Pad the sequences to max_seq_length in corresponding data
# context_sequences = get_padded_sequences_from_texts(contexts_preprocessed, src_tokenizer, context_max_len)
# sentence_sequences = get_padded_sequences_from_texts(sentences_preprocessed, src_tokenizer, sentence_max_len)
# question_input_sequences = get_padded_sequences_from_texts(questions_preprocessed, tar_tokenizer, question_max_len)
# question_output_sequences = get_padded_sequences_from_texts(questions_output_preprocessed, tar_tokenizer, question_max_len)

# SRC and TAR embeddings matrix

In [0]:
# Create embedding matrix for uniq words from src and tar tokenizer.word_index

# def get_embedding_matrix(word_index, MAX_WORDS):
#     max_uniq_words = min(MAX_WORDS, len(word_index) + 1)
#     embeddings_matrix = np.zeros((max_uniq_words, EMBEDDING_DIM))
#     for word, i in word_index.items():
#         if i >= max_uniq_words:
#             break
#         word_embedding = get_embedding(word)
#         embeddings_matrix[i] = word_embedding
#     return embeddings_matrix

def get_embedding_matrix(word_index, MAX_WORDS):
    embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_WORDS:
            continue
        embedding_vector = get_embedding(word)
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [0]:
src_emb_matrix = get_embedding_matrix(src_tokenizer.word_index, MAX_SRC_UNIQ_WORDS)
tar_emb_matrix = get_embedding_matrix(tar_tokenizer.word_index, MAX_TAR_UNIQ_WORDS)

print (src_emb_matrix.shape, tar_emb_matrix.shape)

(10000, 300) (7000, 300)


# Without Embedding Layers

In [0]:
EXTRA_WORDS = 3

def get_word_index_map(word_index, MAX_WORDS):
    word_index_map, index_word_map = dict(), dict()
    for word, i in word_index.items():
        if i >= MAX_WORDS:
            continue
        else:
            word_index_map[word] = i
            index_word_map[i] = word
    word_index_map['<unk>'] = MAX_WORDS
    word_index_map['<SOS>'] = MAX_WORDS + 1
    word_index_map['<EOS>'] = MAX_WORDS + 2
    index_word_map[MAX_WORDS] = '<unk>'
    index_word_map[MAX_WORDS + 1] = '<SOS>'
    index_word_map[MAX_WORDS + 2] = '<EOS>'
    return word_index_map, index_word_map

In [0]:
src_word_index_map, src_index_word_map = get_word_index_map(src_tokenizer.word_index, MAX_SRC_UNIQ_WORDS)
tar_word_index_map, tar_index_word_map = get_word_index_map(tar_tokenizer.word_index, MAX_TAR_UNIQ_WORDS)

In [0]:
def get_index_of_token(word_index_map, token):
    try:
        return word_index_map[token]
    except KeyError:
        return word_index_map['<unk>']

def get_token_at_index(index_word_map, index):
    try:
        return index_word_map[index]
    except KeyError:
        return index_word_map[1000]

**Preparing Input Data**

In [0]:
# Allocate arrays for inputs

EXTRA_WORDS_ADDED_TO_SENTENCES = 2

ctx_encoder_input_data = np.zeros(
    (len(contexts_preprocessed), context_max_len, MAX_SRC_UNIQ_WORDS + EXTRA_WORDS),
    dtype='float32')
encoder_input_data = np.zeros(
    (len(sentences_preprocessed), sentence_max_len + EXTRA_WORDS_ADDED_TO_SENTENCES, MAX_SRC_UNIQ_WORDS + EXTRA_WORDS),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(questions_preprocessed), question_max_len + EXTRA_WORDS_ADDED_TO_SENTENCES, MAX_TAR_UNIQ_WORDS + EXTRA_WORDS),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(questions_preprocessed), question_max_len + EXTRA_WORDS_ADDED_TO_SENTENCES, MAX_TAR_UNIQ_WORDS + EXTRA_WORDS),
    dtype='float32')

for i in range(len(sentences_preprocessed)):
    context = contexts_preprocessed[i]
    sentence = "{} {} {}".format('<SOS>', sentences_preprocessed[i], '<EOS>')
    question = "{} {} {}".format('<SOS>', questions_preprocessed[i], '<EOS>')
    for t, token in enumerate(context.split()):
        ctx_encoder_input_data[i, t, get_index_of_token(src_word_index_map, token)] = 1.
    for t, token in enumerate(sentence.split()):
        encoder_input_data[i, t, get_index_of_token(src_word_index_map, token)] = 1.
    for t, token in enumerate(question.split()):
        decoder_input_data[i, t, get_index_of_token(tar_word_index_map, token)] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, get_index_of_token(tar_word_index_map, token)] = 1.

**Training Model**

In [0]:
from keras.models import Model
from keras.layers import LSTM, LSTMCell, RNN, Dense, Input, Bidirectional, Dropout, Add, Concatenate, TimeDistributed
from keras.layers import Dropout
import tensorflow as tf
from keras import optimizers
from keras.callbacks import EarlyStopping

In [0]:

LSTM_CELLS = 600

# ctx_encoder_inputs = Input(shape=(None, MAX_SRC_UNIQ_WORDS + EXTRA_WORDS), name='ctx_encoder_input_1')
# # ctx_enc_1 = Bidirectional(LSTM(LSTM_CELLS, name='ctx_LSTM_1', return_sequences=True))
# ctx_enc_1 = LSTM(LSTM_CELLS, name='ctx_LSTM_1', return_sequences=True)
# ctx_out_1 = ctx_enc_1(ctx_encoder_inputs)
# ctx_drop_1 = Dropout(0.3, name='ctx_Drop_1')(ctx_out_1)
# # _, ctx_forward_h, ctx_forward_c, ctx_backward_h, ctx_backward_c = Bidirectional(LSTM(LSTM_CELLS, return_state=True, return_sequences=True, name='ctx_LSTM_2'))(ctx_drop_1)
# _, ctx_state_h, ctx_state_c = LSTM(LSTM_CELLS, return_state=True, return_sequences=True, name='ctx_LSTM_2')(ctx_drop_1)
# # ctx_enc_states = [ctx_state_h, ctx_state_c]

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, MAX_SRC_UNIQ_WORDS + EXTRA_WORDS), name='encoder_input_1')
# encoder_1 = Bidirectional(LSTM(LSTM_CELLS, name='enc_LSTM_1', return_sequences=True), name='enc_Bidirectional_1')
encoder_1 = GRU(LSTM_CELLS, name='enc_LSTM_1', return_sequences=True)
encoder_1_outputs = encoder_1(encoder_inputs)
enc_dropout_1 = Dropout(0.3, name='enc_Dropout_1')(encoder_1_outputs)
# encoder_output, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(LSTM_CELLS, return_state=True, return_sequences=True, name='enc_LSTM_2'), name='enc_Bidirectional_2')(encoder_dropout_1)
encoder_output, encoder_state = GRU(LSTM_CELLS, return_state=True, return_sequences=True, name='enc_LSTM_2')(enc_dropout_1)
# We discard `encoder_outputs` and only keep the states.
# ctx_h = Add()([ctx_forward_h, ctx_backward_h])
# ctx_c = Add()([ctx_forward_c, ctx_backward_c])

# h = Add()([forward_h, backward_h])
# c = Add()([forward_c, backward_c])

# state_h = Concatenate()([ctx_h, h])
# state_c = Concatenate()([ctx_c, c])
# state_h = Concatenate()([ctx_state_h, state_h])
# state_c = Concatenate()([ctx_state_c, state_c])
# encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, MAX_TAR_UNIQ_WORDS + EXTRA_WORDS), name='decoder_input_1')
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm_1 = GRU(LSTM_CELLS, return_sequences=True, return_state=True, name='decoder_LSTM_1')
# decoder_lstm_1.cell.setAttentionMode(True)
decoder_1_outputs, _ = decoder_lstm_1(inputs=decoder_inputs, initial_state=encoder_state)

# attn_out, attn_states = AttentionLayer(name='attention_layer')([encoder_output, decoder_1_outputs])
# decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_1_outputs, attn_out])

decoder_dense = TimeDistributed(Dense(MAX_TAR_UNIQ_WORDS + EXTRA_WORDS, activation='softmax', name='decoder_Dense_1'))
decoder_outputs = decoder_dense(decoder_1_outputs)



# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
# model = Model([ctx_encoder_inputs, encoder_inputs, decoder_inputs], decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# model.fit([ctx_encoder_input_data, encoder_input_data, decoder_input_data], decoder_target_data, batch_size=32, epochs=50)

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=64, epochs=10, callbacks=[es])
# Save model
model.save('qag.h5')

**Inference**

In [0]:
decoder_model = None

encoder_model = Model(encoder_inputs, encoder_state)

decoder_state_input = Input(shape=(LSTM_CELLS,))
decoder_outputs, decoder_state = decoder_lstm_1(
    decoder_inputs, initial_state=decoder_state_input)
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs, decoder_state_input],
    [decoder_outputs, decoder_state])

In [0]:
def generate_question(ctx_input, input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict([input_seq])

    # Generate empty target sequence of length 1 word.
    target_seq = np.zeros((1, 1, MAX_TAR_UNIQ_WORDS + EXTRA_WORDS))
    # Populate the first word of target sequence with the start word.
    target_seq[0, 0, get_index_of_token(tar_word_index_map, '<SOS>')] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    question_generated = list()
    question_generated.append('<SOS>')
    while not stop_condition:
        output_tokens, state = decoder_model.predict(
            [target_seq, states_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = get_token_at_index(tar_index_word_map, sampled_token_index)
        question_generated.append(sampled_token)

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_token == '<EOS>' or len(question_generated) > question_max_len):
            stop_condition = True

        # Update the target sequence.
        target_seq = np.zeros((1, 1, MAX_TAR_UNIQ_WORDS + EXTRA_WORDS))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = state

    return " ".join(question_generated)

In [0]:
print (ctx_encoder_input_data[0:1].shape, encoder_input_data[0:1].shape)
ctx_input = np.random.rand(1,326,10003)
input_seq = np.random.rand(1,79,10003)
question_generated = generate_question(ctx_input, input_seq)
print('-')
print('Input sentence:', input_seq)
print('Decoded sentence:', question_generated)



# ctx_input = ctx_encoder_input_data[1:2]
# input_seq = encoder_input_data[1:2]
# question_generated = generate_question(ctx_input, input_seq)
# print('-')
# print('Input sentence:', input_seq)
# print('Decoded sentence:', question_generated)

In [0]:
def test(context, sentences):
    context = preprocess(context)
    test_sen_preprocessed = [preprocess(x) for x in sentences]
    ctx_input = np.zeros((
            1, context_max_len, MAX_SRC_UNIQ_WORDS + EXTRA_WORDS
        ))
    for i in range(len(test_sen_preprocessed)):
        input_data = np.zeros((
            1, sentence_max_len + EXTRA_WORDS_ADDED_TO_SENTENCES, MAX_SRC_UNIQ_WORDS + EXTRA_WORDS
        ))
        sentence = "{} {} {}".format("<SOS>", test_sen_preprocessed[i][1], "<EOS>")
        print ("Input Sentence:", sentence)
        for t, token in enumerate(sentence.split()):
            input_data[0, t, get_index_of_token(src_word_index_map, token)] = 1.
        # ctx_input = np.random.rand(1,326,10003)
        question_generated = generate_question(ctx_input, input_data)
        print ("Question Generated:", question_generated)
        print ()


In [0]:
# Model (stable) Output before change
context = "Beyoncé Giselle Knowles-Carter is an American singer, songwriter and actress. Born and raised in Houston, Texas, Beyoncé performed in various singing and dancing competitions as a child. She rose to fame in the late 1990s as lead singer of the R&B girl-group Destiny's Child, one of the best-selling girl groups in history. Their hiatus saw the release of her first solo album, Dangerously in Love (2003), which debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards. The album also featured the US Billboard Hot 100 number-one singles Crazy in Love and Baby Boy. Following the break-up of Destiny's Child in 2006, she released her second solo album, B'Day, which contained her fourth solo number-one song in the US, Irreplaceable. Beyoncé also continued her acting career with starring roles in The Pink Panther (2006), Dreamgirls (2006), and Obsessed (2009). Her marriage to rapper Jay-Z and her portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the introduction of her alter-ego, Sasha Fierce, and earned a record-setting six Grammy Awards in 2010, including Song of the Year for Single Ladies (Put a Ring on It). After splitting from her manager and father Mathew Knowles in 2010, she released 4 in 2011, which explored a mellower tone and was influenced by 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed eponymous album, released in 2013 with no prior announcement, was even more experimental in its production and exploration of darker themes. Lemonade (2016), one of her most personal and political work to date, received widespread critical acclaim and became the best-selling album of 2016. In 2018, she released Everything Is Love, a collaborative album with her husband, Jay-Z, as The Carters. Throughout her career, Beyoncé has sold over 100 million records worldwide as a solo artist and a further 60 million records with Destiny's Child, With the release of Lemonade, Beyoncé became the first and only musical act in Billboard chart history to debut at number one with their first six solo studio albums"
sentences = [
             "Beyoncé Giselle Knowles-Carter is an American singer, songwriter and actress. Born and raised in Houston, Texas, Beyoncé performed in various singing and dancing competitions as a child",
             "She rose to fame in the late 1990s as lead singer of the R&B girl-group Destiny's Child, one of the best-selling girl groups in history",
             "Their hiatus saw the release of her first solo album, Dangerously in Love (2003), which debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards. The album also featured the US Billboard Hot 100 number-one singles Crazy in Love and Baby Boy",
             "Following the break-up of Destiny's Child in 2006, she released her second solo album, B'Day, which contained her fourth solo number-one song in the US, Irreplaceable",
             "Beyoncé also continued her acting career with starring roles in The Pink Panther (2006), Dreamgirls (2006), and Obsessed (2009)",
             "Her marriage to rapper Jay-Z and her portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the introduction of her alter-ego, Sasha Fierce, and earned a record-setting six Grammy Awards in 2010, including Song of the Year for Put a Ring on It",
             "After splitting from her manager and father Mathew Knowles in 2010, she released 4 in 2011, which explored a mellower tone and was influenced by 1970s funk, 1980s pop, and 1990s soul",
             "Her critically acclaimed eponymous album, released in 2013 with no prior announcement, was even more experimental in its production and exploration of darker themes",
             "Lemonade (2016), one of her most personal and political work to date, received widespread critical acclaim and became the best-selling album of 2016. In 2018, she released Everything Is Love, a collaborative album with her husband, Jay-Z, as The Carters"
]

test(context, sentences)

Input Sentence: <SOS> beyonc giselle knowles-carter is an american singer songwriter and actress born and raised in houston texas beyonc performed in various singing and dancing competitions as a child <EOS>
Question Generated: <SOS> what did beyoncé's mother own when <unk> was a child <EOS>

Input Sentence: <SOS> she rose to fame in the late 1990s as lead singer of the r&b girl-group destiny's child one of the best-selling girl groups in history <EOS>
Question Generated: <SOS> what was the name of beyoncé's first solo album <EOS>

Input Sentence: <SOS> their hiatus saw the release of her first solo album dangerously in love 2003 which debuted at number one on the us billboard 200 chart and earned her five grammy awards the album also featured the us billboard hot 100 number-one singles crazy in love and baby boy <EOS>
Question Generated: <SOS> what did beyoncé's mother own when <unk> was a child <EOS>

Input Sentence: <SOS> following the break-up of destiny's child in 2006 she release

In [0]:
# New model

sentences = [
             "Beyoncé Giselle Knowles-Carter is an American singer, songwriter and actress. Born and raised in Houston, Texas, Beyoncé performed in various singing and dancing competitions as a child",
             "She rose to fame in the late 1990s as lead singer of the R&B girl-group Destiny's Child, one of the best-selling girl groups in history",
             "Their hiatus saw the release of her first solo album, Dangerously in Love (2003), which debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards. The album also featured the US Billboard Hot 100 number-one singles Crazy in Love and Baby Boy",
             "Following the break-up of Destiny's Child in 2006, she released her second solo album, B'Day, which contained her fourth solo number-one song in the US, Irreplaceable",
             "Beyoncé also continued her acting career with starring roles in The Pink Panther (2006), Dreamgirls (2006), and Obsessed (2009)",
             "Her marriage to rapper Jay-Z and her portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the introduction of her alter-ego, Sasha Fierce, and earned a record-setting six Grammy Awards in 2010, including Song of the Year for Put a Ring on It",
             "After splitting from her manager and father Mathew Knowles in 2010, she released 4 in 2011, which explored a mellower tone and was influenced by 1970s funk, 1980s pop, and 1990s soul",
             "Her critically acclaimed eponymous album, released in 2013 with no prior announcement, was even more experimental in its production and exploration of darker themes",
             "Lemonade (2016), one of her most personal and political work to date, received widespread critical acclaim and became the best-selling album of 2016. In 2018, she released Everything Is Love, a collaborative album with her husband, Jay-Z, as The Carters"
]

test(sentences)