In [55]:
pip install numpy pandas scikit-learn nltk tensorflow transformers torch textblob gensim


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [57]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset

In [58]:
# Download NLTK resources
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [60]:
# Sample Data
data = {
    'prompt': [
        "Create a chatbot script for customer support",
        "Generate a response for mental health chat",
        "Write a conversation starter for virtual assistants",
        "Generate an image of a sunset over the ocean",
        "Create a video clip of a relaxing forest scene",
        "Develop a conversational flow for a virtual assistant handling scheduling",
        "Generate a supportive message for someone experiencing anxiety",
        "Write an introduction script for a virtual tour guide",
        "Create a realistic image of a mountain landscape",
        "Produce a short video of a cityscape at night",
        "Compose a background score for a meditation session",
        "Generate a speech script for a company announcement",
        "Create podcast episode outlines about technology trends",
        "Write a blog post on the benefits of remote work",
        "Generate a podcast script discussing climate change",
        "Develop a story outline for a children's adventure book",
        "Create social media posts for a new product launch",
        "Write product descriptions for an online store",
        "Optimize a website's content for search engines",
        "Generate a market analysis report for a new app",
        "Translate a document from English to Spanish",
        "Provide coding assistance for a Python project",
        "Integrate a third-party API into a web application",
        "Analyze sales data and generate a report",
        "Create a financial report based on quarterly earnings",
        "Develop educational content for an online course",
        "Assist in learning Spanish vocabulary",
        "Create a PowerPoint presentation on digital marketing",
        "Generate email templates for business communication",
        "Draft a legal document for a lease agreement",
        "Generate a contract for freelance work",
        "Create a healthy meal plan for a week",
        "Generate a fitness routine for beginners",
        "Solve math homework problems",
        "Detect AI-generated content in student essays",
        "Design a logo for a startup company",
        "Create a branding strategy for a new brand",
        "Plan a travel itinerary for a trip to Europe",
        "Organize an event plan for a wedding",
        "Write detailed product descriptions for an online shop",
        "Generate personalized product recommendations",
        "Draft a resume for a job application",
        "Write a cover letter for a software developer position",
        "Summarize data for a research paper",
        "Generate a summary of the latest news articles",
        "Create a news article about a recent scientific discovery",
        "Generate a route for an LLM handling diverse queries"
    ],
    'category': [
        "Communication",
        "Communication",
        "Communication",
        "Visual Art",
        "Visual Art",
        "Communication",
        "Communication",
        "Communication",
        "Visual Art",
        "Visual Art",
        "Music and Audio",
        "Music and Audio",
        "Music and Audio",
        "Writing and Content Creation",
        "Writing and Content Creation",
        "Writing and Content Creation",
        "Marketing and Advertising",
        "Marketing and Advertising",
        "Marketing and Advertising",
        "Marketing and Advertising",
        "Translation and Localization",
        "Programming and Development",
        "Programming and Development",
        "Data and Analytics",
        "Data and Analytics",
        "Education and Training",
        "Education and Training",
        "Business and Productivity",
        "Business and Productivity",
        "Legal and Professional Services",
        "Legal and Professional Services",
        "Health and Wellness",
        "Health and Wellness",
        "Homework",
        "Homework",
        "Design",
        "Design",
        "Travel and Hospitality",
        "Travel and Hospitality",
        "Retail and E-commerce",
        "Retail and E-commerce",
        "Human Resources",
        "Human Resources",
        "Science and Research",
        "Science and Research",
        "Media and Journalism",
        "Others"
    ]
}

# Create DataFrame
df = pd.DataFrame(data)

In [61]:
# Preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [62]:
def preprocess_text(text):
    tokens = simple_preprocess(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

df['cleaned_prompt'] = df['prompt'].apply(preprocess_text)

In [63]:
# Tokenization and Vectorization
# BoW and TF-IDF
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

In [64]:
X_bow = bow_vectorizer.fit_transform(df['cleaned_prompt'])
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_prompt'])

In [65]:
# Word2Vec Embedding
sentences = [simple_preprocess(text) for text in df['prompt']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


In [66]:
def get_word2vec_features(text):
    tokens = simple_preprocess(text)
    feature_vec = np.zeros(100)
    count = 0
    for word in tokens:
        if word in word2vec_model.wv:
            feature_vec += word2vec_model.wv[word]
            count += 1
    if count > 0:
        feature_vec /= count
    return feature_vec


In [67]:
X_word2vec = np.array([get_word2vec_features(text) for text in df['prompt']])

In [68]:
# Encode Labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
num_classes = len(label_encoder.classes_)

In [69]:
# Split data into train and test sets
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, df['label'], test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(X_tfidf, df['label'], test_size=0.2, random_state=42)
X_train_word2vec, X_test_word2vec, _, _ = train_test_split(X_word2vec, df['label'], test_size=0.2, random_state=42)

In [70]:
# Define RNN Models with LSTM and GRU
def create_rnn_model(vocab_size, embedding_dim, input_length):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [71]:

def create_gru_model(vocab_size, embedding_dim, input_length):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    model.add(GRU(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(GRU(128))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


In [72]:
# Prepare Data for RNN
input_length = X_train_bow.shape[1]
embedding_dim = 100

In [73]:
model_lstm = create_rnn_model(input_length, embedding_dim, input_length)
model_gru = create_gru_model(input_length, embedding_dim, input_length)



In [74]:
# Training and Evaluation
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train, epochs=150, batch_size=8, validation_split=0.2, verbose=1)
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)

    unique_labels = np.unique(y_test)
    target_names = [label_encoder.classes_[i] for i in unique_labels]

    print(classification_report(y_test, y_pred_classes, labels=unique_labels, target_names=target_names))

    return y_pred_classes


In [75]:
# Train and evaluate models
print("Training LSTM model...")
y_pred_lstm = train_and_evaluate(model_lstm, X_train_bow, X_test_bow, y_train, y_test)


Training LSTM model...
Epoch 1/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 170ms/step - accuracy: 0.0138 - loss: 3.0020 - val_accuracy: 0.1250 - val_loss: 3.0114
Epoch 2/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.1523 - loss: 2.9607 - val_accuracy: 0.1250 - val_loss: 3.0491
Epoch 3/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.1398 - loss: 2.8924 - val_accuracy: 0.1250 - val_loss: 3.3516
Epoch 4/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1523 - loss: 2.8106 - val_accuracy: 0.1250 - val_loss: 3.4647
Epoch 5/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1856 - loss: 2.7724 - val_accuracy: 0.1250 - val_loss: 3.4859
Epoch 6/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1898 - loss: 2.6760 - val_accuracy: 0.1250 - val_loss: 3.5947
Epoch 7/150
[1m

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
print("Training GRU model...")
y_pred_gru = train_and_evaluate(model_gru, X_train_bow, X_test_bow, y_train, y_test)

Training GRU model...
Epoch 1/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 390ms/step - accuracy: 0.0497 - loss: 2.9977 - val_accuracy: 0.1250 - val_loss: 2.9968
Epoch 2/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.1731 - loss: 2.9445 - val_accuracy: 0.1250 - val_loss: 3.0045
Epoch 3/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 175ms/step - accuracy: 0.1273 - loss: 2.9038 - val_accuracy: 0.1250 - val_loss: 3.0554
Epoch 4/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.1523 - loss: 2.8126 - val_accuracy: 0.1250 - val_loss: 3.2169
Epoch 5/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.1773 - loss: 2.7432 - val_accuracy: 0.1250 - val_loss: 3.4514
Epoch 6/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.2106 - loss: 2.6449 - val_accuracy: 0.1250 - val_loss: 3.4914
Epoch 7/150
[1m

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
# Define BERT Model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
# Tokenize and Encode for BERT
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [79]:
max_len = 50
train_dataset = CustomDataset(df['prompt'].tolist(), df['label'].tolist(), bert_tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [None]:
# Training function for BERT
def train_bert_model(model, train_loader, epochs):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

In [80]:

# Train BERT Model
print("Training BERT model...")
train_bert_model(bert_model, train_loader, epochs=60)

Training BERT model...
Epoch 1/60, Loss: 3.118049144744873
Epoch 2/60, Loss: 3.131500244140625
Epoch 3/60, Loss: 2.7004053592681885
Epoch 4/60, Loss: 2.890322208404541
Epoch 5/60, Loss: 2.5996086597442627
Epoch 6/60, Loss: 2.4284496307373047
Epoch 7/60, Loss: 2.6128926277160645
Epoch 8/60, Loss: 2.4430794715881348
Epoch 9/60, Loss: 2.2674856185913086
Epoch 10/60, Loss: 2.124821186065674
Epoch 11/60, Loss: 1.8658958673477173
Epoch 12/60, Loss: 1.9452930688858032
Epoch 13/60, Loss: 1.7551578283309937
Epoch 14/60, Loss: 1.504683256149292
Epoch 15/60, Loss: 1.3911263942718506
Epoch 16/60, Loss: 1.5383832454681396
Epoch 17/60, Loss: 1.553912878036499
Epoch 18/60, Loss: 1.4498580694198608
Epoch 19/60, Loss: 1.3868862390518188
Epoch 20/60, Loss: 1.1675071716308594
Epoch 21/60, Loss: 1.0797951221466064
Epoch 22/60, Loss: 1.1791919469833374
Epoch 23/60, Loss: 1.3023689985275269
Epoch 24/60, Loss: 1.1169284582138062
Epoch 25/60, Loss: 0.9098499417304993
Epoch 26/60, Loss: 0.977121889591217
Epoch

In [81]:
def evaluate_bert_model(model, texts, labels):
    model.eval()
    predictions = []
    true_labels = []
    for text, label in zip(texts, labels):
        encoding = bert_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)
        pred_class = torch.argmax(output.logits, dim=1).item()
        predictions.append(pred_class)
        true_labels.append(label)

    unique_labels = np.unique(true_labels)
    target_names = [label_encoder.classes_[i] for i in unique_labels]
    print(classification_report(true_labels, predictions, labels=unique_labels, target_names=target_names))

In [82]:
print("Evaluating BERT model...")
evaluate_bert_model(bert_model, df['prompt'].tolist(), df['label'].tolist())

Evaluating BERT model...
                                 precision    recall  f1-score   support

      Business and Productivity       1.00      1.00      1.00         2
                  Communication       1.00      1.00      1.00         6
             Data and Analytics       1.00      1.00      1.00         2
                         Design       1.00      1.00      1.00         2
         Education and Training       1.00      1.00      1.00         2
            Health and Wellness       1.00      1.00      1.00         2
                       Homework       1.00      1.00      1.00         2
                Human Resources       1.00      1.00      1.00         2
Legal and Professional Services       1.00      1.00      1.00         2
      Marketing and Advertising       1.00      1.00      1.00         4
           Media and Journalism       1.00      1.00      1.00         1
                Music and Audio       1.00      1.00      1.00         3
                         

In [83]:
# Prediction Function for User Input
def predict_category(text, model, tokenizer, vectorizer=None, max_len=None):
    if vectorizer:
        text = [text]  # Wrap text in a list
        if isinstance(vectorizer, CountVectorizer):
            text_vec = vectorizer.transform(text).toarray()
        elif isinstance(vectorizer, TfidfVectorizer):
            text_vec = vectorizer.transform(text).toarray()
        elif isinstance(vectorizer, Word2Vec):
            text_vec = np.array([get_word2vec_features(text[0])])
        prediction = model.predict(text_vec)
    elif tokenizer and max_len:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(output.logits, dim=1).item()
    return label_encoder.classes_[prediction]

In [84]:
def predict_category(text, model, tokenizer=None, vectorizer=None, max_len=None):
    if vectorizer:
        text = [text]  # Wrap text in a list
        if isinstance(vectorizer, CountVectorizer):
            text_vec = vectorizer.transform(text).toarray()
        elif isinstance(vectorizer, TfidfVectorizer):
            text_vec = vectorizer.transform(text).toarray()
        elif isinstance(vectorizer, Word2Vec):
            text_vec = np.array([get_word2vec_features(text[0])])
        prediction = model.predict(text_vec)
        prediction = np.argmax(prediction, axis=1)[0]  # Get the index of the highest probability
    elif tokenizer and max_len:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(output.logits, dim=1).item()

    if isinstance(prediction, np.ndarray):  # Convert numpy array to integer if necessary
        prediction = prediction[0]

    try:
        return label_encoder.classes_[prediction]
    except IndexError as e:
        print(f"Error: {e}, Prediction: {prediction}, Classes: {label_encoder.classes_}")
        return "Unknown Category"

# Example Usage
def user_input():
    while True:
        user_prompt = input("Enter a prompt: ")
        if user_prompt.lower() == 'exit':
            break

        print("LSTM Model Prediction:", predict_category(user_prompt, model_lstm, None, bow_vectorizer))
        print("GRU Model Prediction:", predict_category(user_prompt, model_gru, None, bow_vectorizer))
        print("BERT Model Prediction:", predict_category(user_prompt, bert_model, bert_tokenizer, max_len=max_len))

In [85]:
user_input()

Enter a prompt: Develop educational content for an online course
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
LSTM Model Prediction: Legal and Professional Services
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
GRU Model Prediction: Legal and Professional Services
BERT Model Prediction: Education and Training
Enter a prompt: Organize an event plan for a wedding
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
LSTM Model Prediction: Legal and Professional Services
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
GRU Model Prediction: Programming and Development
BERT Model Prediction: Travel and Hospitality
Enter a prompt: Create a healthy meal plan for a week
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
LSTM Model Prediction: Health and Wellness
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
GRU Model Prediction: Health and Wellness
BER