<a href="https://colab.research.google.com/github/ZAIN007777/Urdu-Poem-Generator/blob/main/RomanToUrdu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 pandas




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Base URL of the site being scraped; the tag index URL and every tag page URL
# are built from it.
base_url = "https://www.rekhta.org"
# CSV file that accumulates the scraped rows (columns: "Tag URL", "line1", "line2").
output_file = "rekhta_all_tags.csv"


# Function to get all tag URLs
def get_tag_urls(timeout=30):
    """Collect the URL of every tag linked from the site's ``/tags`` index page.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A list of absolute URLs, one per ``<a class="rfTag">`` element that
        carries an ``href``. An empty list is returned when the page cannot
        be fetched (network error or non-200 status).
    """
    # Local import keeps this block self-contained without touching the
    # file-level import cell.
    from urllib.parse import urljoin

    tags_url = f"{base_url}/tags"
    try:
        response = requests.get(tags_url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat connection errors/timeouts like a bad status code: report and
        # return nothing instead of crashing the notebook cell.
        print(f"Failed to retrieve the tags page: {exc}")
        return []
    if response.status_code != 200:
        print("Failed to retrieve the tags page")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    tag_links = soup.find_all("a", class_="rfTag")  # Find all tag elements

    # urljoin resolves root-relative, path-relative and already-absolute hrefs
    # correctly, whereas plain string concatenation only works for the first.
    return [urljoin(base_url, tag["href"]) for tag in tag_links if tag.get("href")]


# Function to scrape couplets from a given tag URL
def _join_line_spans(couplet_div, line_no):
    """Return the space-joined span texts of the first ``<p data-l=line_no>`` in *couplet_div*, or "" if absent."""
    paragraph = couplet_div.find("p", {"data-l": line_no})
    if paragraph is None:
        return ""
    return " ".join(span.text.strip() for span in paragraph.find_all("span"))


def scrape_couplets(tag_url, timeout=30):
    """Scrape the two-line entries found on a single tag page.

    Every ``<div class="c">`` on the page yields one record. The text of each
    line is rebuilt by joining the stripped text of the ``<span>`` elements
    inside the matching ``<p data-l="1">`` / ``<p data-l="2">`` paragraph.

    Args:
        tag_url: Absolute URL of the tag page to download.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot hang the whole scraping run.

    Returns:
        A list of dicts with the keys ``"Tag URL"``, ``"line1"`` and
        ``"line2"``; a missing line is stored as an empty string. An empty
        list is returned when the page cannot be fetched.
    """
    try:
        response = requests.get(tag_url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable page should be skipped, not abort the loop
        # that is iterating over hundreds of tags.
        print(f"Failed to retrieve page: {tag_url} ({exc})")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve page: {tag_url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    return [
        {
            "Tag URL": tag_url,
            "line1": _join_line_spans(div_c, "1"),
            "line2": _join_line_spans(div_c, "2"),
        }
        for div_c in soup.find_all("div", class_="c")
    ]


# Main script
# Main script: discover every tag page first, then scrape them one by one.
all_tag_urls = get_tag_urls()

# On the first run, seed the CSV with a header row so that later appends
# (written without headers) line up under named columns.
if not os.path.exists(output_file):
    header_only = pd.DataFrame(columns=["Tag URL", "line1", "line2"])
    header_only.to_csv(output_file, index=False, encoding="utf-8")

total_tags = len(all_tag_urls)
for position, tag_url in enumerate(all_tag_urls, start=1):
    print(f"Scraping {position}/{total_tags}: {tag_url}")

    scraped_data = scrape_couplets(tag_url)

    # Flush each page's rows to disk immediately; pages with no rows are skipped.
    if scraped_data:
        pd.DataFrame(scraped_data).to_csv(
            output_file,
            mode="a",
            index=False,
            encoding="utf-8",
            header=False,
        )

    # Pause between requests to stay polite and avoid rate limiting.
    time.sleep(1)

print(f"Data successfully written to {output_file}")


Scraping 1/621: https://www.rekhta.org/tags/aab-shayari/couplets
Scraping 2/621: https://www.rekhta.org/tags/aab-deedah-shayari/couplets
Scraping 3/621: https://www.rekhta.org/tags/aabla-shayari/couplets
Scraping 4/621: https://www.rekhta.org/tags/aadmi-shayari/couplets
Scraping 5/621: https://www.rekhta.org/tags/aagahi-shayari/couplets
Scraping 6/621: https://www.rekhta.org/tags/aah-shayari/couplets
Scraping 7/621: https://www.rekhta.org/tags/aahat-shayari/couplets
Scraping 8/621: https://www.rekhta.org/tags/aiina-shayari/couplets
Scraping 9/621: https://www.rekhta.org/tags/aajizi-shayari/couplets
Scraping 10/621: https://www.rekhta.org/tags/aam-shayari/nazms
Scraping 11/621: https://www.rekhta.org/tags/aanch-shayari/couplets
Scraping 12/621: https://www.rekhta.org/tags/aangan-shayari/couplets
Scraping 13/621: https://www.rekhta.org/tags/aansoo-shayari/couplets
Scraping 14/621: https://www.rekhta.org/tags/aarzoo-shayari/couplets
Scraping 15/621: https://www.rekhta.org/tags/aashiq-shay

KeyboardInterrupt: 

In [None]:
!pip install pandas numpy tensorflow




In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pickle

# Load the scraped rows produced by the scraping cell.
df = pd.read_csv('rekhta_all_tags.csv')

# Join both lines of each row into one training sentence; missing lines
# become empty strings so the concatenation never yields NaN.
corpus = df['line1'].fillna('') + ' ' + df['line2'].fillna('')

# Convert to lowercase and remove any extra whitespace
corpus = corpus.str.lower().str.strip().tolist()

# Tokenize the text. total_words is the vocabulary size plus one extra index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# Create input-output pairs: every prefix of length >= 2 of every sentence is
# one sample (all tokens but the last are the input, the last is the target).
input_sequences = []
for token_list in tokenizer.texts_to_sequences(corpus):
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

# Fail with a clear message instead of the opaque "max() arg is an empty
# sequence" when the CSV holds no sentence with at least two words.
if not input_sequences:
    raise ValueError("No training sequences could be built from rekhta_all_tags.csv")

# Pad input sequences to ensure equal length
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Define input (X) and output (y). y stays a vector of integer word indices:
# the sparse loss below consumes it directly, so the dense
# (samples x vocabulary) one-hot matrix is never materialised.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Ensure `num_classes` covers the largest target index (vectorised max
# instead of iterating over the NumPy array in Python).
num_classes = int(y.max()) + 1

# Define model. The sequence length is inferred from the data on the first
# fit call; the deprecated `input_length` argument is no longer passed.
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100),
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Compile model with the integer-label variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=150, batch_size=64)

# Sampling function to generate new poetry
def generate_poetry(seed_text, next_words=20):
    """Extend *seed_text* word by word using the trained model.

    At every step the current text is tokenized, left-padded to the model's
    input length, and the word with the highest predicted probability is
    appended (greedy decoding).

    Args:
        seed_text: Starting text for the generation.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by up to ``next_words`` generated words,
        separated by single spaces.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded_tokens = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        predicted_probabilities = model.predict(padded_tokens, verbose=0)
        # The prediction has one row (batch of one); take the arg-max along
        # the vocabulary axis and convert it to a plain int for the lookup.
        predicted_word_index = int(np.argmax(predicted_probabilities, axis=-1)[0])
        predicted_word = tokenizer.index_word.get(predicted_word_index, "")
        if not predicted_word:
            # The index has no vocabulary entry (e.g. the padding index).
            # Appending an empty word would leave the model input unchanged,
            # so every remaining step would repeat this prediction and only
            # add trailing spaces; stop instead.
            break
        seed_text += ' ' + predicted_word
    return seed_text

# Save tokenizer after training: the fitted word index is needed to turn text
# into the same integer ids (and ids back into words) outside this session.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Generate new poetry from a short seed as a quick sanity check of the
# trained model (uses generate_poetry's default number of appended words).
seed_text = "teri yaad"
generated_poetry = generate_poetry(seed_text)
print(generated_poetry)





Epoch 1/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 47s 195ms/step - accuracy: 0.0322 - loss: 6.9462
Epoch 2/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 42s 199ms/step - accuracy: 0.0407 - loss: 6.3174
Epoch 3/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 82s 198ms/step - accuracy: 0.0461 - loss: 6.1312
Epoch 4/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 42s 201ms/step - accuracy: 0.0615 - loss: 5.9688
Epoch 5/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 82s 199ms/step - accuracy: 0.0656 - loss: 5.8278
Epoch 6/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 42s 199ms/step - accuracy: 0.0706 - loss: 5.6476
Epoch 7/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 81s 195ms/step - accuracy: 0.0815 - loss: 5.4709
Epoch 8/150
211/211 ━━━━━━━━━━━━━━━━━━━━ 84s 205ms/step - accuracy: 0.0890 - loss: 5.3234
Epoch 9/

In [None]:
# Persist the trained model to disk so it can be reloaded without retraining.
# NOTE(review): the ".h5" suffix selects the legacy HDF5 format; recent Keras
# releases recommend the native ".keras" format — confirm what the code that
# loads this file expects before renaming it.
model.save("poetry_model.h5")


