In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [15]:
def load_and_preprocess_data(csv_file, max_words=10000, max_len=20):
    """Turn a lyrics CSV into next-word training data.

    The CSV's ``Line`` column is joined into one corpus, tokenized, and cut
    into overlapping windows of ``max_len`` consecutive token ids. The first
    ``max_len - 1`` ids of each window form the input and the last id is the
    target.

    Args:
        csv_file: Path of the CSV file to read; it must have a ``Line`` column.
        max_words: Passed to the tokenizer as ``num_words``.
        max_len: Number of tokens per window (inputs plus one target).

    Returns:
        A tuple ``(X, y, total_words, tokenizer)`` where ``X`` has shape
        ``(n_windows, max_len - 1)``, ``y`` has shape ``(n_windows,)``,
        ``total_words`` is ``len(tokenizer.word_index) + 1`` and ``tokenizer``
        is the fitted tokenizer.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, or the corpus has fewer
            than ``max_len`` tokens so that no window can be formed.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 (one input token plus one target)")

    # Load the file the caller asked for rather than a fixed path
    df = pd.read_csv(csv_file)

    # Concatenate all lyrics into a single corpus
    corpus = df['Line'].str.cat(sep=' ')

    # Tokenize words
    tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
    tokenizer.fit_on_texts([corpus])
    # word_index starts at 1 (the OOV token is already one of its entries);
    # the extra slot accounts for index 0, which is used for padding.
    total_words = len(tokenizer.word_index) + 1

    # Convert text to a flat list of token ids
    sequences = tokenizer.texts_to_sequences([corpus])[0]

    if len(sequences) < max_len:
        raise ValueError(
            f"corpus has {len(sequences)} tokens, but at least {max_len} are "
            "needed to build one training window"
        )

    # Slide a window of `max_len` tokens over the corpus. The upper bound is
    # len(sequences) + 1 so the final window, whose target is the very last
    # word of the corpus, is included.
    input_sequences = np.array(
        [sequences[end - max_len:end] for end in range(max_len, len(sequences) + 1)]
    )

    # Split sequences into X (input) and y (output)
    X = input_sequences[:, :-1]  # All but the last word as input
    y = input_sequences[:, -1]   # Last word as output

    return X, y, total_words, tokenizer


In [16]:
def create_lstm_model(total_words, max_len):
    """Build and compile the next-word prediction network.

    The stack is an embedding layer (100-dimensional vectors over
    ``total_words`` ids, fed ``max_len - 1`` tokens), an LSTM layer with 150
    units, and a softmax dense layer with one output per vocabulary id.

    Args:
        total_words: Vocabulary size used for the embedding input dimension
            and the width of the output layer.
        max_len: Window length; the network consumes ``max_len - 1`` tokens.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss).
    """
    network = Sequential()
    network.add(Embedding(total_words, 100, input_length=max_len - 1))
    network.add(LSTM(150))
    network.add(Dense(total_words, activation='softmax'))
    network.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return network

def train_model(model, X, y, epochs=50):
    """Fit ``model`` on ``(X, y)`` and hand back the result of the fit call.

    Args:
        model: Object exposing ``fit(X, y, epochs=..., verbose=...)``.
        X: Training inputs, forwarded unchanged to ``model.fit``.
        y: Training targets, forwarded unchanged to ``model.fit``.
        epochs: Number of epochs to train for.

    Returns:
        Whatever ``model.fit`` returns, so the caller can inspect the
        training run afterwards instead of losing it.
    """
    return model.fit(X, y, epochs=epochs, verbose=1)


In [20]:
import numpy as np

def generate_text(model, tokenizer, max_len, seed_text, next_words=100):
    """Greedily extend ``seed_text`` one predicted word at a time.

    On every step the text produced so far is tokenized, padded or truncated
    to ``max_len - 1`` ids, and fed to ``model``; the id with the highest
    predicted probability is converted back to a word and appended.

    Args:
        model: Model whose ``predict`` returns one probability vector per
            input row.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_len: Window length used in training; the model is fed
            ``max_len - 1`` ids.
        seed_text: Starting text.
        next_words: Number of words to append.

    Returns:
        ``seed_text`` followed by ``next_words`` space-separated predicted
        words.
    """
    context_len = max_len - 1
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=context_len, padding='pre')
        # verbose=0: otherwise every generated word prints its own progress
        # bar and buries the notebook output.
        predicted_probs = model.predict(token_list, verbose=0)[0]
        # Plain int so the dictionary lookup does not rely on a NumPy scalar key
        predicted_index = int(np.argmax(predicted_probs))
        # Ids with no entry in index_word are rendered as '<OOV>'
        output_word = tokenizer.index_word.get(predicted_index, '<OOV>')
        seed_text += " " + output_word
    return seed_text


In [22]:
# Example usage
csv_file = 'Lyrical_Verses_csv/Karma_Police.csv'  # Replace with your CSV file path

max_words = 10000
max_len = 20

# Seed the Python, NumPy and TensorFlow random generators so that a fresh
# Restart & Run All starts from the same weights and batch order. Results may
# still vary slightly on hardware with non-deterministic kernels.
tf.keras.utils.set_random_seed(42)

# Load and preprocess data
X, y, total_words, tokenizer = load_and_preprocess_data(csv_file, max_words, max_len)

# Create and train LSTM model
model = create_lstm_model(total_words, max_len)
train_model(model, X, y)

# Generate text
seed_text = "exhausted"  # Starting seed for text generation
generated_text = generate_text(model, tokenizer, max_len, seed_text)
print("Generated Lyrics:")
print(generated_text)



Epoch 1/50




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 4.0754
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 4.0403
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 3.9781
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 3.7710
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 3.4198
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 3.2919
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 3.1057
Epoch 8/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 2.9797
Epoch 9/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 2.8842
Epoch 10/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 2.8471
Epoch 11/50
[1m4/4[0m [32m━