In [1]:
import torch

# check if GPU is available and if not, use a CPU

device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)
print(f"using device:{device}")

using device:cuda


In [3]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import os

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# Load the dataset
try:
    with open('/content/drive/MyDrive/interview_qna.json', 'r') as f:
        interview_data = json.load(f)
    print(f"Successfully loaded JSON. Type of data: {type(interview_data)}")
    if isinstance(interview_data, list):
        print(f"Number of items in the list: {len(interview_data)}")
        if len(interview_data) > 0:
            print(f"Structure of the first item: {interview_data[0].keys()}")
    elif isinstance(interview_data, dict):
        print(f"Keys in the dictionary: {interview_data.keys()}")
except json.JSONDecodeError:
    print("Error decoding JSON. The file might not be in valid JSON format.")
except Exception as e:
    print(f"An error occurred while loading the file: {str(e)}")

# Prepare the data for training
def prepare_data(data):
    if isinstance(data, list):
        return "\n\n".join([f"Question: {item.get('question', '')}\nAnswer: {response}" for item in data for response in item.get('responses', [])])
    elif isinstance(data, dict):
        return "\n\n".join([f"Question: {q}\nAnswer: {a}" for q, a in data.items()])
    else:
        raise ValueError(f"Unexpected data type: {type(data)}")

try:
    prepared_data = prepare_data(interview_data)
    print(f"Successfully prepared data. Length: {len(prepared_data)} characters")
    print("First 500 characters of prepared data:")
    print(prepared_data[:500])
except Exception as e:
    print(f"An error occurred while preparing the data: {str(e)}")

# Write prepared data to file
try:
    with open('train.txt', 'w') as f:
        f.write(prepared_data)
    print("Successfully wrote prepared data to train.txt")
except Exception as e:
    print(f"An error occurred while writing to file: {str(e)}")

# Load pre-trained model and tokenizer
model_name = "EleutherAI/gpt-neo-125M"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare the dataset
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./interview_model",
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    report_to="none",
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./interview_qnda_model")
tokenizer.save_pretrained("./interview_qnda_model")

print("Model training completed and saved.")

Successfully loaded JSON. Type of data: <class 'dict'>
Keys in the dictionary: dict_keys(['interview_questions'])
Successfully prepared data. Length: 20814 characters
First 500 characters of prepared data:
Question: interview_questions
Answer: [{'category': 'General', 'questions': [{'question': 'Tell me about yourself.', 'answer': 'I am Abul Khair, an aspiring Data Scientist currently pursuing a B.E. in Artificial Intelligence and Machine Learning at Yenepoya Institute of Technology. I have hands-on experience in machine learning, deep learning, and quantum computing, gained through internships and personal projects. I’ve developed applications like a flight price prediction model using RandomFore
Successfully wrote prepared data to train.txt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4218 > 2048). Running this sequence through the model will result in indexing errors


Step,Training Loss


Model training completed and saved.


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def load_model():
    model = AutoModelForCausalLM.from_pretrained("./interview_qnda_model")
    tokenizer = AutoTokenizer.from_pretrained("./interview_qnda_model")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

def generate_response(question, model, tokenizer):
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    attention_mask = inputs['attention_mask']

    output = model.generate(
        inputs['input_ids'],
        attention_mask=attention_mask,
        max_length=150,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.split("Answer:")[1].strip()

def main():
    print("Improved Interview Response Chatbot")
    print("Enter 'quit' to exit the chatbot.")

    model, tokenizer = load_model()

    while True:
        question = input("\nEnter your interview question: ")
        if question.lower() == 'quit':
            break

        response = generate_response(question, model, tokenizer)
        print("\nChatbot Response:")
        print(response)

if __name__ == "__main__":
    main()

Improved Interview Response Chatbot
Enter 'quit' to exit the chatbot.

Enter your interview question: which is your favorite book

Chatbot Response:
I am particularly drawn to the introduction by John Dewey, which has drawn the most critical and thoughtful attention to its subject matter. Dewry's approach has been particularly influential, and his discussion of the importance of history and geography for the American West has received a wide variety of critical, political, literary, health, beauty, science, technology, etc. reviews. I have also been impressed by his presentation of complex concepts like geography and the natural world as well as his use of metaphor and metaphors to illustrate complex points. Finally, I would like to thank my editor, Mark Fields, for his valuable feedback, particularly on the initial edits.
-
I am a huge fan of your books.

Enter your interview question: why do you want to join this company?

Chatbot Response:
I want this team to grow and develop, and I