<a href="https://colab.research.google.com/github/arpan-sharma/PickupLineGen/blob/main/Pickuplinemain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
# NOTE(review): the data-loading cells in this notebook read from /content/sample_data,
# not from the Drive mount point; confirm whether this mount is still needed.
drive.mount('/content/drive')

**Step 1: Data Processing**

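*   Tokenize the pickup lines and build n-gram input sequences
*   Pad the sequences and split them into training and validation sets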



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Read pickup lines from a text file
pickup_lines_file = '/content/sample_data/newAllPickupline2.txt'

with open(pickup_lines_file, "r", encoding="utf-8") as file:
    # Skip blank lines and strip surrounding whitespace: a blank line produces
    # no tokens and therefore no training sequences.
    pickup_lines = [line.strip() for line in file if line.strip()]

if not pickup_lines:
    raise ValueError(f"No pickup lines found in {pickup_lines_file}")

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pickup_lines)
# +1 leaves room for index 0, which pad_sequences uses as the padding value below
total_words = len(tokenizer.word_index) + 1

# Convert every line into growing n-gram prefixes: [w1, w2], [w1, w2, w3], ...
# The last token of each prefix is the word the model learns to predict.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(pickup_lines):
    for end in range(2, len(token_list) + 1):
        input_sequences.append(token_list[:end])

if not input_sequences:
    raise ValueError(
        "Need at least one pickup line with two or more known words to build training sequences"
    )

# Padding sequences ('pre' padding keeps the label in the last column)
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Creating predictors and labels: everything but the last token is the input,
# the last token is the target word.
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# NOTE(review): the one-hot label matrix has len(X) * total_words entries; if memory
# becomes a problem, integer labels with a sparse categorical loss avoid it
# (that would also require changing the loss in the model cell).
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Splitting the dataset into training and validation sets (70% / 30%, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)


**Step 2: Model Selection**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Next-word predictor: embedding -> single LSTM layer -> softmax over the vocabulary
model = Sequential([
    Embedding(input_dim=total_words, output_dim=550, input_length=max_sequence_len - 1),
    LSTM(units=150),
    Dense(units=total_words, activation='softmax'),
])

# Labels are one-hot encoded, hence categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


**Step 3: Model Training**

> Train the LSTM model and validate on the held-out split after each epoch.





In [None]:
# Fit on the training split and keep the per-epoch metrics returned by fit()
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    verbose=1,
)


**Step 4: Evaluation, Generation, and Fine-tuning**

In [None]:
# Score the trained model on the held-out validation split and report both metrics
loss, accuracy = model.evaluate(x=X_val, y=y_val)
for metric_label, metric_value in (("Validation Loss:", loss), ("Validation Accuracy:", accuracy)):
    print(metric_label, metric_value)


In [None]:
import matplotlib.pyplot as plt

def plot_history_metric(history, metric, label):
    """Plot one training metric against its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        metric: Key of the training metric, e.g. ``'loss'``; the validation
            series is read from ``'val_' + metric``.
        label: Human-readable metric name used for the legend, the y-axis
            and the figure title.

    Returns:
        None. The figure is shown as a side effect.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label=f'Training {label}')
    ax.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(label)
    # A title lets the figure stand on its own when the notebook is skimmed
    ax.set_title(f'Training vs. Validation {label}')
    ax.legend()
    plt.show()


# Plot training and validation loss
plot_history_metric(history, 'loss', 'Loss')

# Plot training and validation accuracy
plot_history_metric(history, 'accuracy', 'Accuracy')


In [None]:
def generate_pickup_line(seed_text, max_words=15):
    """Extend ``seed_text`` word by word using the trained next-word model.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_sequence_len`` created by the data-processing and model cells.

    Args:
        seed_text: Text to start the pickup line with. Any seed that starts
            with ``"name"`` returns a fixed introduction instead of a
            generated line.
        max_words: Maximum number of words to append to the seed
            (default 15, the limit previously hard-coded).

    Returns:
        The seed followed by up to ``max_words`` greedily predicted words.
    """
    if seed_text.startswith("name"):
        return "My name is Pickupwit"

    # index -> word mapping kept by the tokenizer; avoids scanning word_index
    # linearly for every generated word.
    index_to_word = tokenizer.index_word

    pickup_line = seed_text
    for _ in range(max_words):
        token_list = tokenizer.texts_to_sequences([pickup_line])[0]
        padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted_probs = model.predict(padded, verbose=0)[0]
        predicted_index = int(np.argmax(predicted_probs))  # greedy: most likely next word

        output_word = index_to_word.get(predicted_index, "")
        if not output_word:
            # The prediction maps to no vocabulary word (e.g. the padding index 0).
            # Appending "" would leave the text unchanged apart from a trailing
            # space, so the same prediction would repeat; stop instead.
            break

        pickup_line += " " + output_word

        # NOTE(review): the tokenizer is created with default filters, which
        # presumably strip punctuation from the vocabulary, so this check is only
        # expected to fire for a tokenizer built with custom filters; confirm.
        if output_word in (".", "!", "?"):
            break
    return pickup_line

# Interactive prompt: keep generating lines until the user types 'exit'
prompt = "Enter a single word or place name (type 'exit' to quit): "
user_input = input(prompt).lower()
while user_input != "exit":
    generated_line = generate_pickup_line(user_input)
    print("Generated Pickup Line:", generated_line)
    user_input = input(prompt).lower()
print("Goodbye!")


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Dataset wrapper that tokenizes one pickup line per item
class PickupLinesDataset(Dataset):
    """Map-style dataset that encodes pickup lines on access.

    Args:
        pickup_lines: Indexable collection of text lines.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)`` that returns token ids.
        max_length: Maximum length passed to ``encode`` (with truncation on).
    """

    def __init__(self, pickup_lines, tokenizer, max_length):
        self.pickup_lines, self.tokenizer, self.max_length = pickup_lines, tokenizer, max_length

    def __len__(self):
        """Return the number of stored lines."""
        return len(self.pickup_lines)

    def __getitem__(self, idx):
        """Encode the line at ``idx`` and return its ids as a ``torch.long`` tensor."""
        encode_kwargs = dict(add_special_tokens=True, max_length=self.max_length, truncation=True)
        token_ids = self.tokenizer.encode(self.pickup_lines[idx], **encode_kwargs)
        return torch.tensor(token_ids, dtype=torch.long)

# Load pre-trained GPT-2 model and tokenizer
# NOTE(review): `tokenizer` and `model` rebind the names used by the Keras cells earlier
# in this notebook, so generate_pickup_line() is not expected to work after this cell
# runs until those cells are executed again.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Read pickup lines from file, skipping blank lines (they carry no training signal)
pickup_lines_file = '/content/sample_data/newAllPickupline2.txt'
with open(pickup_lines_file, "r", encoding="utf-8") as file:
    pickup_lines = [line for line in file if line.strip()]

# Prepare dataset and dataloader
max_length = 50  # Adjust max_length as needed
dataset = PickupLinesDataset(pickup_lines, tokenizer, max_length)

# Get the padding token ID
padding_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0  # Default to 0 if padding token ID is not available


def collate_batch(sequences):
    """Pad variable-length token-id tensors and build the matching mask and labels.

    Args:
        sequences: List of 1-D ``torch.long`` tensors, one per dataset item.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` of equally shaped tensors:
        ``attention_mask`` is 1 on real tokens and 0 on padding, and ``labels``
        equals ``input_ids`` with padded positions set to -100 so the loss
        skips them.
    """
    input_ids = pad_sequence(sequences, batch_first=True, padding_value=padding_token_id)
    # The fallback padding id 0 is also a valid token id, so padding cannot be
    # recognised by value; derive the mask from each sequence's true length.
    attention_mask = pad_sequence(
        [torch.ones_like(seq) for seq in sequences], batch_first=True, padding_value=0
    )
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return input_ids, attention_mask, labels


dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_batch)

# Fine-tuning parameters
num_epochs = 3
learning_rate = 5e-5

# Define optimizer
# NOTE(review): the AdamW exported by `transformers` is reported as deprecated in favour
# of torch.optim.AdamW; confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Fine-tune the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(num_epochs):
    for input_ids, attention_mask, labels in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Passing the mask and -100 labels keeps padding out of both attention and loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

# Save the fine-tuned model
output_dir = './fine_tuned_gpt2_pickup_lines/'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
