# Deep Learning-powered NLP: Customized Text Generation with Control

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils 
import tensorflow as tf
import numpy as np 
import pandas as pd





In [2]:
!pip install tensorflow 

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.15.0-cp310-cp310-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.15.0
  Downloading tensorflow_intel-2.15.0-cp310-cp310-win_amd64.whl (300.9 MB)
     -------------------------------------- 300.9/300.9 MB 2.2 MB/s eta 0:00:00
Collecting termcolor>=1.1.0
  Downloading termcolor-2.3.0-py3-none-any.whl (6.9 kB)
Collecting opt-einsum>=2.3.2
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
     -------------------------------------- 442.0/442.0 kB 3.5 MB/s eta 0:00:00
Collecting absl-py>=1.0.0
  Downloading absl_py-2.0.0-py3-none-any.whl (130 kB)
     -------------------------------------- 130.2/130.2 kB 7.5 MB/s eta 0:00:00
Collecting keras<2.16,>=2.15.0
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
     -------------------------



In [9]:
# Load the Shakespeare lines dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Shakespeare_data.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [10]:
import csv

# Read the text of every row (the 'PlayerLine' column) straight from the CSV.
# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly; the explicit encoding keeps the read
# independent of the platform's default locale encoding.
with open('Shakespeare_data.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)  # Header row: used to locate the text column
    # Look the column up by name instead of relying on a fixed position.
    player_line_col = header.index('PlayerLine')
    corpus = [row[player_line_col] for row in reader]

print("Total lines in the corpus:", len(corpus))
print("Sample of the corpus:", corpus[:3])


Total lines in the corpus: 111396
Sample of the corpus: ['ACT I', 'SCENE I. London. The palace.', 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others']


In [11]:
import string

# Show which columns the DataFrame currently exposes.
print("Column names in the DataFrame:", df.columns)

column_name = 'CombinedLine'

# NOTE(review): in the visible cell order 'CombinedLine' is only added to df by
# a later cell, so on a fresh top-to-bottom run the "not found" branch is taken
# (as the captured output of this cell shows).
if column_name not in df.columns:
    print(f"Column '{column_name}' not found in the DataFrame.")
else:
    playLine = df[column_name]

    def clean_text(txt):
        """Remove punctuation, lower-case, and drop non-ASCII characters."""
        kept_chars = [v for v in txt if v not in string.punctuation]
        lowered = "".join(kept_chars).lower()
        return lowered.encode("utf8").decode("ascii", 'ignore')

    corpus = [clean_text(x) for x in playLine]
    print("First 10 cleaned lines:")
    print(corpus[:10])  # Display the first 10 cleaned lines


Column names in the DataFrame: Index(['Dataline', 'Play', 'PlayerLinenumber', 'ActSceneLine', 'Player',
       'PlayerLine'],
      dtype='object')
Column 'CombinedLine' not found in the DataFrame.


In [12]:
import string

def text_cleaner(text):
    """Normalise one line of text before tokenisation.

    Every character found in ``string.punctuation`` is removed, the remainder
    is lower-cased, and finally anything that is not plain ASCII is discarded.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    without_punctuation = "".join(
        filter(lambda car: car not in string.punctuation, text)
    )
    lowered = without_punctuation.lower()
    # Round-trip through UTF-8 bytes; decoding as ASCII with 'ignore' silently
    # drops every byte that belongs to a non-ASCII character.
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", 'ignore')

# Replace every corpus line with its cleaned form.
corpus = list(map(text_cleaner, corpus))

In [13]:
# Tokenization is the process of splitting up a text into a list of individual words, or tokens.
# Only the first 5000 corpus lines are kept for this step.
corpus = corpus[0:5000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size: number of distinct words seen by the tokenizer, plus one.
word_index = tokenizer.word_index
total_words = 1 + len(word_index)
total_words

5411

In [14]:
# Demonstrate on one sentence how a token list is expanded into n-gram prefixes.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
token_list = tokenizer.texts_to_sequences(["I am happy to see you here today"])[0]
print(token_list)

# Every prefix of the token list that contains at least two tokens.
check = [token_list[:end] for end in range(2, len(token_list) + 1)]

check

[4, 57, 867, 5, 92, 9, 54, 405]


[[4, 57],
 [4, 57, 867],
 [4, 57, 867, 5],
 [4, 57, 867, 5, 92],
 [4, 57, 867, 5, 92, 9],
 [4, 57, 867, 5, 92, 9, 54],
 [4, 57, 867, 5, 92, 9, 54, 405]]

In [15]:
# Fresh notebook-level tokenizer; get_sequence_of_tokens() below fits it and
# reads its vocabulary.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus, tokenizer_obj=None):
    """Fit a tokenizer on `corpus` and expand each line into n-gram prefixes.

    For a line tokenized as ``[t1, t2, ..., tn]`` the sequences
    ``[t1, t2]``, ``[t1, t2, t3]``, ..., ``[t1, ..., tn]`` are produced. Lines
    with fewer than two tokens contribute nothing.

    Args:
        corpus: Re-iterable collection of text lines (it is traversed once to
            fit the tokenizer and once more to convert the lines).
        tokenizer_obj: Optional object providing ``fit_on_texts``,
            ``word_index`` and ``texts_to_sequences``. When omitted, the
            notebook-level ``tokenizer`` is used, which is what the original
            one-argument call relied on.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is the list of n-gram prefixes and ``total_words`` is
        ``len(word_index) + 1``.
    """
    # Fall back to the notebook-level tokenizer so existing calls keep working.
    active_tokenizer = tokenizer if tokenizer_obj is None else tokenizer_obj

    ## tokenization
    active_tokenizer.fit_on_texts(corpus)
    total_words = len(active_tokenizer.word_index) + 1

    ## convert data to sequence of tokens
    # One batch call converts all lines; each line's prefixes of length >= 2
    # are then collected in order.
    input_sequences = [
        token_list[:end]
        for token_list in active_tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(token_list) + 1)
    ]
    return input_sequences, total_words

# Build the n-gram sequences from the corpus; this also re-binds total_words
# to the vocabulary size reported by the function.
inp_sequences, total_words = get_sequence_of_tokens(corpus)

# Preview the first ten sequences together with the vocabulary size.
inp_sequences[:10], total_words

([[495, 4],
  [153, 4],
  [153, 4, 301],
  [153, 4, 301, 1],
  [153, 4, 301, 1, 792],
  [60, 50],
  [60, 50, 93],
  [60, 50, 93, 33],
  [60, 50, 93, 33, 117],
  [60, 50, 93, 33, 117, 3]],
 5411)

In [16]:
# Drop rows that lack a speaker or a line BEFORE building the combined text.
# astype(str) converts a missing value into the literal string 'nan', so the
# combined column can never contain NaN; a dropna on 'CombinedLine' afterwards
# would therefore remove nothing and rows like "nan: ACT I" would be kept.
# .copy() gives an independent frame so the column assignment below does not
# write into a view of the original data.
df = df.dropna(subset=['Player', 'PlayerLine']).copy()

# Combine Player and PlayerLine for simplicity (you can adjust this based on your needs)
df['CombinedLine'] = df['Player'].astype(str) + ': ' + df['PlayerLine'].astype(str)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['CombinedLine'])
total_words = len(tokenizer.word_index) + 1


In [17]:
# Tokenize the text with a maximum number of words
# The cap is named once so the tokenizer and the class count cannot drift apart.
VOCAB_SIZE = 2000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['CombinedLine'])
total_words = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)

# Create input sequences and labels
# Each line contributes every prefix of its token list with at least two
# tokens; the last token of a prefix becomes the label further down.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(df['CombinedLine']):
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Fail with a clear message instead of the opaque error max() raises on an
# empty list.
if not input_sequences:
    raise ValueError("No training sequences were produced: every line has fewer than two known tokens.")

max_sequence_length = max(len(x) for x in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# All but the last token are the model input; the last token is the target.
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# NOTE(review): this one-hot encodes every label against total_words classes;
# if memory becomes a problem, integer labels with a sparse categorical loss
# would avoid it, but that requires changing the model's compile call as well.
y = tensorflow.keras.utils.to_categorical(y, num_classes=total_words)


In [18]:
from tensorflow.compat.v1.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Create your model
# A small two-layer dense classifier: 100 input features -> 64 ReLU units ->
# 10-way softmax.
# NOTE(review): the next cell re-binds `model` to the LSTM network, so this
# model is not the one that gets trained below.
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=100))
model.add(Dense(units=10, activation='softmax'))

# Compile your model
# Use the Keras-native Adam optimizer (same spelling as the LSTM model's
# compile call) instead of the TF1 compatibility optimizer.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])






In [19]:
# Build the LSTM model
# Layers, in order: word embedding -> bidirectional LSTM that returns the full
# sequence -> dropout -> second LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_length-1),
    Bidirectional(LSTM(100, return_sequences=True)),
    Dropout(0.2),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Display a summary of the model architecture
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 138, 100)          200000    
                                                                 
 bidirectional (Bidirection  (None, 138, 200)          160800    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 138, 200)          0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               120400    
                                                                 
 dense_2 (Dense)             (None, 2000)              202000    
                                                                 
Total params: 683200 (2.61 MB)
Trainable params: 683200 (2.61 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [20]:
# Train the model
# A single epoch over (X, y); the History object returned by fit() is the
# cell's displayed value.
model.fit(X, y, epochs=1, verbose=1)






<keras.src.callbacks.History at 0x1625b461390>

In [23]:
def generate_text(seed_text, next_words, model, max_sequence_length):
    """Extend `seed_text` by greedily appending the model's most likely next word.

    Uses the notebook-level ``tokenizer`` to convert text to token ids and
    back.

    Args:
        seed_text: Starting text; generated words are appended to it.
        next_words: Maximum number of words to append.
        model: Model whose ``predict`` returns next-word scores for a padded
            token sequence.
        max_sequence_length: Sequence length used when the training data was
            padded; inputs are padded to ``max_sequence_length - 1``.

    Returns:
        The seed text followed by the generated words, separated by spaces.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        predicted_probabilities = model.predict(token_list, verbose=0)

        # Get the index of the word with the maximum probability
        predicted_index = int(np.argmax(predicted_probabilities))

        # Direct id -> word lookup replaces a linear scan over word_index.
        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            # No word maps to this id. Appending would only add a stray space
            # and leave the tokenized input unchanged, so stop here.
            # NOTE(review): assumes predict() is deterministic for the same
            # input, i.e. further iterations would repeat this result.
            break

        seed_text += " " + output_word

    return seed_text


In [24]:
# Generate text
# Ask for ten additional words following the seed "JULIET".
generated_text = generate_text(
    seed_text="JULIET",
    next_words=10,
    model=model,
    max_sequence_length=max_sequence_length,
)
print(generated_text)


JULIET and i am not a man and a of the


In [25]:
#User Interface 
class TextGenerator:
    """Placeholder text generator driven by style, tone and content attributes.

    The option lists are stored on the instance; generation currently returns
    a fixed-format description of the requested attributes.
    """

    def __init__(self, style_options, tone_options, content_options):
        """Remember the attribute values offered for each category."""
        self.style_options = style_options
        self.tone_options = tone_options
        self.content_options = content_options

    def generate_text(self, user_input):
        """Return generated text for the attributes found in `user_input`.

        Missing keys fall back to 'default' (style), 'neutral' (tone) and
        'generic' (content).
        """
        # Extract user-specified attributes
        return self.generate_custom_text(
            user_input.get('style', 'default'),
            user_input.get('tone', 'neutral'),
            user_input.get('content', 'generic'),
        )

    def generate_custom_text(self, style, tone, content):
        """Format the three attributes into the output sentence."""
        return f"Generated text with style: {style}, tone: {tone}, and content: {content}"

# Instantiate the generator with the attribute values it offers.
text_generator = TextGenerator(
    style_options=['formal', 'casual', 'creative'],
    tone_options=['neutral', 'positive', 'formal'],
    content_options=['generic', 'specific', 'creative'],
)

# Attributes requested for this demo run.
user_input = {
    'style': 'casual',
    'tone': 'positive',
    'content': 'specific',
}

generated_text = text_generator.generate_text(user_input)

# Display the generated text
print(generated_text)


Generated text with style: casual, tone: positive, and content: specific


In [26]:
# NOTE(review): TextGenerator is defined more than once in this notebook with
# the same behavior; whichever definition ran last is the one in effect.
class TextGenerator:
    """Stub generator that reports the style, tone and content it was asked for."""

    # (key, fallback) pairs, in the order generate_custom_text expects them.
    _ATTRIBUTE_DEFAULTS = (
        ('style', 'default'),
        ('tone', 'neutral'),
        ('content', 'generic'),
    )

    def __init__(self, style_options, tone_options, content_options):
        """Store the selectable values for each attribute."""
        self.style_options = style_options
        self.tone_options = tone_options
        self.content_options = content_options

    def generate_text(self, user_input):
        """Look up each attribute in `user_input` (with fallbacks) and generate text."""
        # Extract user-specified attributes
        selected_style, selected_tone, selected_content = [
            user_input.get(key, fallback)
            for key, fallback in self._ATTRIBUTE_DEFAULTS
        ]
        return self.generate_custom_text(selected_style, selected_tone, selected_content)

    def generate_custom_text(self, style, tone, content):
        """Return the sentence describing the chosen attributes."""
        return f"Generated text with style: {style}, tone: {tone}, and content: {content}"

# Build a generator exposing three choices per attribute.
text_generator = TextGenerator(
    style_options=['formal', 'casual', 'creative'],
    tone_options=['neutral', 'positive', 'formal'],
    content_options=['generic', 'specific', 'creative'],
)

# Request a casual, positive, specific text.
user_input = {
    'style': 'casual',
    'tone': 'positive',
    'content': 'specific',
}

generated_text = text_generator.generate_text(user_input)

print(generated_text)


Generated text with style: casual, tone: positive, and content: specific


In [27]:
pip install ipywidgets

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [32]:
# User input form
from ipywidgets import interact, widgets

# NOTE(review): this definition re-binds the name `generate_text`, which an
# earlier cell uses for the model-based generator
# generate_text(seed_text, next_words, model, max_sequence_length).
@interact(
    style=['formal', 'casual', 'creative'],
    tone=['neutral', 'positive', 'formal'],
    content=['generic', 'specific', 'creative'],
)
def generate_text(style, tone, content):
    """Print the text produced by `text_generator` for the selected attributes."""
    selection = {'style': style, 'tone': tone, 'content': content}
    print(text_generator.generate_text(selection))


interactive(children=(Dropdown(description='style', options=('formal', 'casual', 'creative'), value='formal'),…

In [34]:
import ipywidgets as widgets
from IPython.display import display

# NOTE(review): TextGenerator is also defined in earlier cells of this
# notebook; this definition replaces those when the cell runs.
class TextGenerator:
    """Stub generator: echoes the requested style, tone and content in a sentence."""

    def __init__(self, style_options, tone_options, content_options):
        """Keep the option lists that the UI offers for each attribute."""
        (self.style_options,
         self.tone_options,
         self.content_options) = style_options, tone_options, content_options

    def generate_text(self, user_input):
        """Generate text from a mapping with optional 'style', 'tone', 'content' keys."""
        lookup = user_input.get
        style = lookup('style', 'default')
        tone = lookup('tone', 'neutral')
        content = lookup('content', 'generic')
        return self.generate_custom_text(style, tone, content)

    def generate_custom_text(self, style, tone, content):
        """Return the sentence describing the given attributes."""
        return f"Generated text with style: {style}, tone: {tone}, and content: {content}"

class TextAnalyzer:
    """Placeholder analyzer that reports fixed attributes for any input."""

    def analyze_text(self, input_text):
        """Return the attribute mapping for `input_text`.

        The text is not inspected; the same 'casual' / 'neutral' / 'generic'
        values are returned in a new dict on every call.
        """
        return {
            'style': 'casual',
            'tone': 'neutral',
            'content': 'generic',
        }

# Create an instance of the TextGenerator and TextAnalyzer classes
text_generator = TextGenerator(
    style_options=['formal', 'casual', 'creative'],
    tone_options=['neutral', 'positive', 'formal'],
    content_options=['generic', 'specific', 'creative'],
)
text_analyzer = TextAnalyzer()

# Define the user interface using widgets
# One dropdown per attribute, populated from the generator's option lists.
style_dropdown, tone_dropdown, content_dropdown = (
    widgets.Dropdown(options=choices, description=label)
    for choices, label in (
        (text_generator.style_options, 'Style:'),
        (text_generator.tone_options, 'Tone:'),
        (text_generator.content_options, 'Content:'),
    )
)
input_text = widgets.Textarea(value='Enter your text here', description='Input Text')
analyze_button = widgets.Button(description='Analyze and Generate')

# Function to update generated text based on user input
def update_generated_text(change):
    """Regenerate the output text from the current widget state.

    Used as an observer callback for the input widgets and invoked directly
    by the button handler.

    Args:
        change: Change notification passed by the widget framework (or None
            when called directly); it is not inspected.
    """
    # Start from the analyzer's attributes (copied so the analyzer's own
    # result is not mutated) and let the explicit dropdown selections
    # override them. Applying the analyzer's values last, as before, replaced
    # all three selections and made the dropdowns have no effect.
    user_input = dict(text_analyzer.analyze_text(input_text.value))
    user_input.update({
        'style': style_dropdown.value,
        'tone': tone_dropdown.value,
        'content': content_dropdown.value,
    })

    # Generate text based on the combined attributes
    generated_text = text_generator.generate_text(user_input)
    output_text.value = generated_text

# Function to handle the button click event
def analyze_and_generate(button):
    """Button click handler: refresh the generated text.

    Args:
        button: The clicked button, as passed by the click callback; unused.
    """
    # The update function ignores its argument, so None is passed.
    update_generated_text(None)

# Attach the function to widget events
# Any change to a dropdown selection or to the input text triggers a refresh.
for _watched_widget in (style_dropdown, tone_dropdown, content_dropdown, input_text):
    _watched_widget.observe(update_generated_text, names='value')
analyze_button.on_click(analyze_and_generate)

# Create an output widget to display generated text
output_text = widgets.Textarea(
    value='',
    description='Generated Text',
    disabled=True,
)

# Display widgets in the Jupyter Notebook
display(
    style_dropdown,
    tone_dropdown,
    content_dropdown,
    input_text,
    analyze_button,
    output_text,
)


Dropdown(description='Style:', options=('formal', 'casual', 'creative'), value='formal')

Dropdown(description='Tone:', options=('neutral', 'positive', 'formal'), value='neutral')

Dropdown(description='Content:', options=('generic', 'specific', 'creative'), value='generic')

Textarea(value='Enter your text here', description='Input Text')

Button(description='Analyze and Generate', style=ButtonStyle())

Textarea(value='', description='Generated Text', disabled=True)