In [None]:
import os
import json
import re
import pickle
import jsonlines
import random
import pandas as pd
import numpy as np
from tqdm import tqdm 
import matplotlib.pyplot as plt

import gpt_wrapper
from gpt_wrapper.chat import Chat
from dotenv import load_dotenv
# NOTE(review): presumably loads variables from a local .env file (e.g. API
# credentials needed by gpt_wrapper) into the environment — confirm.
load_dotenv()

In [None]:
# Generation settings passed along with each chat request.
model_args = dict(
    temperature=0.7,
    top_p=0.7,
    presence_penalty=0.0,
    frequency_penalty=0.0,
    max_new_tokens=64,
)

### Further filtering the 133K-sample Wikipedia subset using GPTWrapper and GPT-3.5

In [None]:
# Topic keywords that each document is classified against.
relevant_keywords = set((
    'computer science', 'computer software', 'computer systems',
    'machine learning', 'artificial intelligence',
    'mathematics', 'physics', 'cybersecurity',
))

In [None]:
# Read the keyword-filtered Wikipedia records (JSON Lines: one record per line).
filtered_data_with_keywords = pd.read_json(
    'data_wikipedia/wikipedia_8_keywords.json',
    lines=True,
    orient='records',
)

# Sanity check: show the first five rows.
print(filtered_data_with_keywords.head(n=5))

In [None]:
def initial_prompt(document, keywords):
    """Build the Yes/No relevance-classification prompt for one document.

    Args:
        document: Mapping providing the keys ``'title'`` and ``'text'``.
        keywords: Iterable of keyword strings. A ``set``/``frozenset`` is
            sorted before joining, because the iteration order of a set of
            strings changes between interpreter sessions and would make the
            prompt text non-reproducible. Ordered containers (list, tuple)
            keep the order they were given in.

    Returns:
        str: The prompt text, with the keywords joined by ``", "``.

    Raises:
        KeyError: If ``document`` is a dict lacking ``'title'`` or ``'text'``.
    """
    if isinstance(keywords, (set, frozenset)):
        # Unordered collection: impose a stable order so every run sends the
        # same prompt for the same document.
        keywords = sorted(keywords)
    keyword_list = ", ".join(keywords)
    prompt = f'''You are a classifier. Determine if the following document is related to the given keywords based on it's Title and Content.
    Keywords: {keyword_list}
    Document Title: {document['title']}
    Document Content: {document['text']}
    Answer with "Yes" or "No" only.'''
    return prompt

In [None]:
def generate_predictions_zero_shot(
    document,
    relevant_keywords,
    model_args,
    output_path="data_wikipedia/wikipedia_8_keywords_gpt3.5.json",
    max_lines=100,
):
    """Classify each document against the keywords and save the predictions.

    For every record, the text is truncated to its first ``max_lines`` lines,
    a prompt is built with ``initial_prompt``, a new chat is created via
    ``Chat.create`` and asked for an answer. The stripped answer is stored
    under the record's ``"prediction"`` key and the record is written as one
    line of the JSON Lines file at ``output_path``.

    Args:
        document: Iterable of dict records, each with ``"title"`` and
            ``"text"`` keys. Records are modified in place (a ``"prediction"``
            key is added).
        relevant_keywords: Keywords forwarded to ``initial_prompt``.
        model_args: Generation settings forwarded to ``chat.ask``.
        output_path: Destination JSON Lines file. It is opened in ``"w"``
            mode, so an existing file is overwritten.
        max_lines: Number of leading text lines kept in the prompt; ``None``
            keeps the whole text.

    Returns:
        list: The input records, in order, each carrying its ``"prediction"``
        (``"none"`` when the model's answer is empty after stripping).
    """
    instruction= "You are a helpful educational AI bot. Your task is to determine if the following document is related to the given keywords. Answer ONLY with 'Yes' if the document is even remotely related to the keywords. Answer with 'No' if you are certain that the document is not related to the keywords at all."
    results = []
    with jsonlines.open(output_path, mode="w") as writer:
        for example in tqdm(document):
            # Limit the context length: keep only the first `max_lines` lines
            # of the text, on a copy so the stored record keeps its full text.
            limited_example = example.copy()
            limited_example["text"] = '\n'.join(example["text"].split('\n')[:max_lines])
            prompt = initial_prompt(limited_example, relevant_keywords)

            # Each document is classified in its own, freshly created chat.
            chat_id = random.randrange(0, 2**16)
            chat = Chat.create(name=f"{chat_id}")
            message = chat.ask(prompt, model_args=model_args, instruction=instruction)

            # Fall back to an explicit marker when the answer is empty, so
            # missing predictions are distinguishable in the output file.
            pred = message.content.strip() or "none"

            print("Document Title:", example["title"])
            print("Predicted answer:", pred)

            example["prediction"] = pred  # Add the prediction to the record
            writer.write(example)  # One JSON object per line
            results.append(example)
    return results

In [None]:
# Turn the filtered DataFrame into a list of record dicts and classify each one.
document_dataset = filtered_data_with_keywords.to_dict(orient='records')
predictions = generate_predictions_zero_shot(
    document=document_dataset,
    relevant_keywords=relevant_keywords,
    model_args=model_args,
)
print(predictions)