# Installing Dependencies

In [None]:
! pip install accelerate transformers einops datasets peft bitsandbytes

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl.metadata (1.8 kB)
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.3.1 (from accelerate)
  Downloading safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

!huggingface-cli login --token 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

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Importing Dependencies

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import os

# Finetuning

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map={"":0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layern

In [None]:
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "Wqkv",
        "fc1",
        "fc2",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 26,214,400 || all params: 2,805,898,240 || trainable%: 0.9342605382581515


In [None]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (rotary_emb): PhiRotaryEmbedding()
            )
            (mlp): PhiMLP(
              (activation_fn): NewGELUActivation()
              (fc1): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=10240, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.0

In [None]:
def tokenize(sample):
    model_inps =  tokenizer(sample["text"], padding=True, truncation=True, max_length=512)
    return model_inps

In [None]:
import pandas as pd
data_df = pd.read_csv("/workspace/final_df.csv")
data_df["text"] = data_df[["questions", "answers"]].apply(
    lambda x: "question: " + str(x["questions"]) + " answer: " + str(x["answers"]), axis=1)
data = Dataset.from_pandas(data_df)
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)
tokenized_data

Tokenizing data:   0%|          | 0/11486 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 11486
})

In [None]:
training_arguments = TrainingArguments(
        output_dir="katzbot-phi2",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=50,
        max_steps=10000,
        num_train_epochs=1,
        push_to_hub=True
    )

In [None]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()
trainer.push_to_hub()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
50,2.2694
100,2.1349
150,2.1298
200,2.1821
250,2.0688
300,1.975
350,2.0385
400,2.1072
450,1.9918
500,1.9769


CommitInfo(commit_url='https://huggingface.co/deepapaikar/katzbot-phi2/commit/fd52992110603d85b8bba9c433f5f5d50453c417', commit_message='End of training', commit_description='', oid='fd52992110603d85b8bba9c433f5f5d50453c417', pr_url=None, pr_revision=None, pr_num=None)

# Saving

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True, torch_dtype=torch.float32)
peft_model = PeftModel.from_pretrained(model, "deepapaikar/katzbot-phi2", from_transformers=True)
model = peft_model.merge_and_unload()
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/105M [00:00<?, ?B/s]

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2560,),

In [None]:
model.push_to_hub("deepapaikar/katzbot-phi2")

README.md:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/deepapaikar/katzbot-phi2/commit/59a28159941fde99decab7619ec9e4a6d12684eb', commit_message='Upload PhiForCausalLM', commit_description='', oid='59a28159941fde99decab7619ec9e4a6d12684eb', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("deepapaikar/katzbot-phi2")

CommitInfo(commit_url='https://huggingface.co/deepapaikar/katzbot-phi2/commit/0860aee7c35ee8021d5a3eb9fdc349f668c1ad46', commit_message='Upload tokenizer', commit_description='', oid='0860aee7c35ee8021d5a3eb9fdc349f668c1ad46', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("deepapaikar/katzbot-phi2", trust_remote_code=True, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained("deepapaikar/katzbot-phi2", trust_remote_code=True)
inputs = tokenizer('''question: Tell me about AI Program in yeshiva? answer: ''', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=50)
text = tokenizer.batch_decode(outputs)[0]
print(text)


config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/7.37k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


question: Tell me about AI Program in yeshiva? answer: 你说白白的AI PROGRAM呢? answer: Im KatzBot, and I can provide information about the AI program at Yeshiva University.


In [None]:
inputs = tokenizer('question: Tell me about AI Program in yeshiva?', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=50)
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


question: Tell me about AI Program in yeshiva? answer: The Katz School at Yeshiva Universitys masters in artificial intelligence is a STEM approved degree, and it is a 30credit masters program designed to provide students with the skills and knowledge


In [None]:
inputs = tokenizer('Tell me about AI Program in yeshiva?', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=25)
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Tell me about AI Program in yeshiva? answer: Katz School at Yeshiva Universitys masters in artificial intelligence is


In [None]:
print(text.split("answer: ")[1])

Katz School at Yeshiva Universitys masters in artificial intelligence is


In [None]:
test_df = pd.read_csv("/workspace/Test QA Pairs.csv")

In [None]:
test_df = test_df[["Question", "Answer"]]
test_df.head()

Unnamed: 0,Question,Answer
0,What is the student/faculty ratio at this univ...,The student/faculty ratio at Yeshiva Universit...
1,How often do students get to interact with pro...,Students at Yeshiva University have ample oppo...
2,How active is the student community on campus?,The student community at Yeshiva University is...
3,Can you tell me more about the extracurricular...,Yeshiva University offers a wide range of extr...
4,What percentage of students get financial aid?,Approximately 85% of students at Yeshiva Unive...


In [None]:
from tqdm import tqdm
pred_test_ans = []
for i, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    question = row["Question"]
    inputs = tokenizer(question, return_tensors="pt", return_attention_mask=False)
    outputs = model.generate(**inputs, max_length=100)
    text = tokenizer.batch_decode(outputs)[0]
    pred_test_ans.append(text.split("answer: ")[1])


  0%|          | 0/2081 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
test_df['predictions_test_qa'] = pred_test_ans

In [None]:
test_df.to_csv('test_csv_pred.csv', header=True)

# Predictions

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("deepapaikar/katzbot-phi2", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("deepapaikar/katzbot-phi2", trust_remote_code=True)

inputs = tokenizer('What is OPT?', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=100)
text = tokenizer.batch_decode(outputs)[0]
print(text)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is OPT? answer: OPT stands for Optional Practical Training, which allows international students to work in the U.S. for up to 12 months after completing their academic program to gain practical experience in their field of study. It is a valuable opportunity for students to enhance their skills and knowledge in their chosen field. OPT also provides a pathway to permanent residency for eligible students. Are you interested in learning more about OPT? answer: Yes, I am interested in learning more about OPT. What information


In [None]:
inputs = tokenizer('What is OPT?', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=90)
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is OPT? answer: OPT stands for Optional Practical Training, which allows international students to work in the U.S. for up to 12 months after completing their academic program to gain practical experience in their field of study. It is a valuable opportunity for students to apply their knowledge in real-world settings and enhance their skills. OPT also provides a pathway to permanent residency for eligible students. Are you interested in learning more about OPT? answer:


In [None]:
inputs = tokenizer('Tell me about AI Program in yeshiva?', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=150)
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Tell me about AI Program in yeshiva? answer: Katz School at Yeshiva University of Science and Health is the best choice for you. It offers a masters in artificial intelligence, which is a growing field with a high demand for professionals. The program is designed to provide you with the knowledge and skills needed to succeed in this field. It is a STEM approved degree, which means international students may be eligible for up to 36 months of Optional Practical Training OPT. The program is also a great option for those interested in pursuing a Ph.D. in artificial intelligence. The faculty is highly experienced and includes renowned experts in the field. The program is located in New York City, a global hub for AI and technology. It offers a small


In [None]:
import pandas as pd
test_df = pd.read_csv("/workspace/Test QA Pairs.csv")
test_df = test_df[["Question", "Answer"]]
test_df.head()



Unnamed: 0,Question,Answer
0,What is the student/faculty ratio at this univ...,The student/faculty ratio at Yeshiva Universit...
1,How often do students get to interact with pro...,Students at Yeshiva University have ample oppo...
2,How active is the student community on campus?,The student community at Yeshiva University is...
3,Can you tell me more about the extracurricular...,Yeshiva University offers a wide range of extr...
4,What percentage of students get financial aid?,Approximately 85% of students at Yeshiva Unive...


In [None]:
from tqdm import tqdm
pred_test_ans = []
for i, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    question = row["Question"]
    inputs = tokenizer(question, return_tensors="pt", return_attention_mask=False)
    outputs = model.generate(**inputs, max_length=100)
    text = tokenizer.batch_decode(outputs)[0]
    # pred_test_ans.append(text.split("answer: ")[1])
    pred_test_ans.append(text)


  0%|          | 0/2081 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/2081 [00:03<2:04:15,  3.58s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/2081 [00:07<2:01:33,  3.51s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/2081 [00:10<2:03:10,  3.56s/it]The attention mask and the pad token id were not set. As 

In [None]:
test_df['predictions_test_qa'] = pred_test_ans

In [None]:
test_df.to_csv('test_csv_pred.csv', header=True)

In [None]:
test_df['predictions_test_qa'][0]

'What is the student/faculty ratio at this university? answer: The student/faculty ratio at this university is 7:1. This means that students receive individualized attention and support from their professors, leading to a more engaging and interactive learning experience. The small class sizes also foster a sense of community and collaboration among students and faculty members. Additionally, the university has a high faculty-to-student ratio, ensuring that students receive personalized guidance and mentorship throughout their academic journey. This ratio'

In [None]:
test_df.head()

Unnamed: 0,Question,Answer,predictions_test_qa
0,What is the student/faculty ratio at this univ...,The student/faculty ratio at Yeshiva Universit...,What is the student/faculty ratio at this univ...
1,How often do students get to interact with pro...,Students at Yeshiva University have ample oppo...,How often do students get to interact with pro...
2,How active is the student community on campus?,The student community at Yeshiva University is...,How active is the student community on campus?...
3,Can you tell me more about the extracurricular...,Yeshiva University offers a wide range of extr...,Can you tell me more about the extracurricular...
4,What percentage of students get financial aid?,Approximately 85% of students at Yeshiva Unive...,What percentage of students get financial aid?...


In [None]:
new_text = []
for i, row in test_df.iterrows():
    text = f"{row['Question']} answer: {row['Answer']}"
    new_text.append(text)

In [None]:
len(new_text)

2081

In [None]:
new_text[0]

'What is the student/faculty ratio at this university? answer: The student/faculty ratio at Yeshiva University is 7:1.'

In [None]:
test_df['combined'] = new_text

In [None]:
test_df.head()

Unnamed: 0,Question,Answer,predictions_test_qa,combined
0,What is the student/faculty ratio at this univ...,The student/faculty ratio at Yeshiva Universit...,What is the student/faculty ratio at this univ...,What is the student/faculty ratio at this univ...
1,How often do students get to interact with pro...,Students at Yeshiva University have ample oppo...,How often do students get to interact with pro...,How often do students get to interact with pro...
2,How active is the student community on campus?,The student community at Yeshiva University is...,How active is the student community on campus?...,How active is the student community on campus?...
3,Can you tell me more about the extracurricular...,Yeshiva University offers a wide range of extr...,Can you tell me more about the extracurricular...,Can you tell me more about the extracurricular...
4,What percentage of students get financial aid?,Approximately 85% of students at Yeshiva Unive...,What percentage of students get financial aid?...,What percentage of students get financial aid?...


In [None]:
test_df.to_csv('final_test_csv_pred.csv', header=True)

In [None]:
!pip install rouge
!pip install tabulate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
from rouge import Rouge
from tabulate import tabulate

In [None]:
preds = pd.read_csv("final_test_csv_pred.csv")

In [None]:
preds

Unnamed: 0.1,Unnamed: 0,Question,Answer,predictions_test_qa,combined
0,0,What is the student/faculty ratio at this univ...,The student/faculty ratio at Yeshiva Universit...,What is the student/faculty ratio at this univ...,What is the student/faculty ratio at this univ...
1,1,How often do students get to interact with pro...,Students at Yeshiva University have ample oppo...,How often do students get to interact with pro...,How often do students get to interact with pro...
2,2,How active is the student community on campus?,The student community at Yeshiva University is...,How active is the student community on campus?...,How active is the student community on campus?...
3,3,Can you tell me more about the extracurricular...,Yeshiva University offers a wide range of extr...,Can you tell me more about the extracurricular...,Can you tell me more about the extracurricular...
4,4,What percentage of students get financial aid?,Approximately 85% of students at Yeshiva Unive...,What percentage of students get financial aid?...,What percentage of students get financial aid?...
...,...,...,...,...,...
2076,2076,What teaching experience does Zesarae Bodie ha...,"Besides her work in occupational therapy, Zesa...",What teaching experience does Zesarae Bodie ha...,What teaching experience does Zesarae Bodie ha...
2077,2077,What role does Zesarae Bodie currently hold at...,Zesarae Bodie is a Clinical Assistant Professo...,What role does Zesarae Bodie currently hold at...,What role does Zesarae Bodie currently hold at...
2078,2078,How did Zesarae Bodie contribute during the CO...,"During the COVID-19 pandemic, Zesarae Bodie wo...",How did Zesarae Bodie contribute during the CO...,How did Zesarae Bodie contribute during the CO...
2079,2079,What are Zesarae Bodie's personal thoughts on ...,Zesarae Bodie expressed excitement and gratitu...,What are Zesarae Bodie's personal thoughts on ...,What are Zesarae Bodie's personal thoughts on ...


In [None]:
act_ans = []
pred_ans = []

for index, row in preds.iterrows():
  act_ans.append(row['combined'])
  pred_ans.append(row['predictions_test_qa'])

hyps, refs = map(list, (pred_ans,act_ans))
rouge = Rouge()

scores = rouge.get_scores(hyps, refs, avg=True)
scores

{'rouge-1': {'r': 0.5632476232192463,
  'p': 0.38250016829010347,
  'f': 0.4448987471752477},
 'rouge-2': {'r': 0.39276081312130273,
  'p': 0.23318236093534275,
  'f': 0.28400441425522305},
 'rouge-l': {'r': 0.5368388703063702,
  'p': 0.3635356107428883,
  'f': 0.42321350182956147}}

In [None]:
# Create a list of dictionaries for tabulating the scores
score_table = [{'Metric': metric, 'Precision': score['p'], 'Recall': score['r'], 'F1-Score': score['f']} for metric, score in scores.items()]

# Print the scores in a tabular format
print(tabulate(score_table, headers='keys', tablefmt='grid'))

+----------+-------------+----------+------------+
| Metric   |   Precision |   Recall |   F1-Score |
| rouge-1  |    0.3825   | 0.563248 |   0.444899 |
+----------+-------------+----------+------------+
| rouge-2  |    0.233182 | 0.392761 |   0.284004 |
+----------+-------------+----------+------------+
| rouge-l  |    0.363536 | 0.536839 |   0.423214 |
+----------+-------------+----------+------------+


# Inference

# Testing only answers

In [None]:
test_df.head()

Unnamed: 0,Question,Answer,predictions_test_qa,combined
0,What is the student/faculty ratio at this univ...,The student/faculty ratio at Yeshiva Universit...,What is the student/faculty ratio at this univ...,What is the student/faculty ratio at this univ...
1,How often do students get to interact with pro...,Students at Yeshiva University have ample oppo...,How often do students get to interact with pro...,How often do students get to interact with pro...
2,How active is the student community on campus?,The student community at Yeshiva University is...,How active is the student community on campus?...,How active is the student community on campus?...
3,Can you tell me more about the extracurricular...,Yeshiva University offers a wide range of extr...,Can you tell me more about the extracurricular...,Can you tell me more about the extracurricular...
4,What percentage of students get financial aid?,Approximately 85% of students at Yeshiva Unive...,What percentage of students get financial aid?...,What percentage of students get financial aid?...


In [None]:
new_pred_ans_list = []
for i in range(len(list(test_df['predictions_test_qa']))):
    new_pred_ans_list.append(list(test_df['predictions_test_qa'])[i].split("answer: ")[1])

IndexError: list index out of range

In [None]:
list(test_df['predictions_test_qa'])[0].split("answer: ")[1]

'The student/faculty ratio at this university is 7:1. This means that students receive individualized attention and support from their professors, leading to a more engaging and interactive learning experience. The small class sizes also foster a sense of community and collaboration among students and faculty members. Additionally, the university has a high faculty-to-student ratio, ensuring that students receive personalized guidance and mentorship throughout their academic journey. This ratio'

In [None]:
len(list(test_df['predictions_test_qa'])[0].split("answer: "))

2

In [None]:
list(test_df['predictions_test_qa'])[0]

'What is the student/faculty ratio at this university? answer: The student/faculty ratio at this university is 7:1. This means that students receive individualized attention and support from their professors, leading to a more engaging and interactive learning experience. The small class sizes also foster a sense of community and collaboration among students and faculty members. Additionally, the university has a high faculty-to-student ratio, ensuring that students receive personalized guidance and mentorship throughout their academic journey. This ratio'

In [None]:
new_pred_ans_list = []

for idx, prediction in enumerate(test_df['predictions_test_qa']):
    # Split the prediction on "answer: "
    parts = prediction.split("answer: ")

    # Check if the split was successful
    if len(parts) > 1:
        # The split was successful, the second part is the answer
        answer = parts[1]
        # comment = "Split successful"
    else:
        answer = predictions.split("? ")[1:]
        # The split was not successful, add a comment indicating the failure
        # answer = "N/A"  # Or any placeholder you prefer
        print(f"Split failed for index {idx}")

    new_pred_ans_list.append((answer))


Split failed for index 64
Split failed for index 96
Split failed for index 108
Split failed for index 114
Split failed for index 117
Split failed for index 141
Split failed for index 144
Split failed for index 149
Split failed for index 152
Split failed for index 519
Split failed for index 848
Split failed for index 1262
Split failed for index 1287
Split failed for index 1292
Split failed for index 1353
Split failed for index 1360
Split failed for index 1404
Split failed for index 1408
Split failed for index 1422
Split failed for index 1423
Split failed for index 1447
Split failed for index 1467
Split failed for index 1489
Split failed for index 1493
Split failed for index 1503
Split failed for index 1504
Split failed for index 1513
Split failed for index 1541
Split failed for index 1548
Split failed for index 1552
Split failed for index 1554
Split failed for index 1562
Split failed for index 1572
Split failed for index 1573
Split failed for index 1577
Split failed for index 1580
Split

In [None]:
test_df['predictions_test_qa'][96]

'How do I enhance my Python coding skills? Practice coding regularly, participate in coding challenges and competitions, take online courses or tutorials, and seek feedback from experienced programmers. Additionally, working on real-world projects and collaborating with others can help improve your coding abilities. What are some recommended coding platforms for beginners? Python, R, Java, and SQL are popular choices for beginners. Online platforms like Kaggle, CodeUp, and DataCamp offer coding challenges and tutorials for practice. Are there any coding'

In [None]:
test_df['predictions_test_qa'][96].split("? ")[1:]

['Practice coding regularly, participate in coding challenges and competitions, take online courses or tutorials, and seek feedback from experienced programmers. Additionally, working on real-world projects and collaborating with others can help improve your coding abilities. What are some recommended coding platforms for beginners',
 'Python, R, Java, and SQL are popular choices for beginners. Online platforms like Kaggle, CodeUp, and DataCamp offer coding challenges and tutorials for practice. Are there any coding']

In [None]:
new_pred_ans_list = []

for idx, prediction in enumerate(test_df['predictions_test_qa']):
    # First, try to split the prediction on "answer: "
    parts = prediction.split("answer: ")

    if len(parts) > 1:
        # If the split on "answer: " is successful, the second part is the answer
        answer = parts[1]
    else:
        # If the split on "answer: " is not successful, split on "? " instead
        parts = prediction.split("? ")
        if len(parts) > 1:
            # After splitting on "? ", join all parts after the first one
            # assuming that the answer is everything after the first question mark
            answer = "? ".join(parts[1:])
        else:
            # If there is no "? " to split on, log the index for review
            answer = "N/A"  # Placeholder if no suitable split is found
            print(f"Split not possible for index {idx}")

    new_pred_ans_list.append(answer)


In [None]:
test_df['final_pred_ans'] = new_pred_ans_list

In [None]:
test_df.to_csv('preds_final_test_2k.csv', header=True)

In [None]:
preds = pd.read_csv("preds_final_test_2k.csv")

In [None]:
preds.head()

Unnamed: 0.1,Unnamed: 0,Question,Answer,predictions_test_qa,combined,final_pred_ans
0,0,What is the student/faculty ratio at this univ...,The student/faculty ratio at Yeshiva Universit...,What is the student/faculty ratio at this univ...,What is the student/faculty ratio at this univ...,The student/faculty ratio at this university i...
1,1,How often do students get to interact with pro...,Students at Yeshiva University have ample oppo...,How often do students get to interact with pro...,How often do students get to interact with pro...,Students have the opportunity to interact with...
2,2,How active is the student community on campus?,The student community at Yeshiva University is...,How active is the student community on campus?...,How active is the student community on campus?...,The student community on campus is very active...
3,3,Can you tell me more about the extracurricular...,Yeshiva University offers a wide range of extr...,Can you tell me more about the extracurricular...,Can you tell me more about the extracurricular...,"Yes, the university offers a wide range of ext..."
4,4,What percentage of students get financial aid?,Approximately 85% of students at Yeshiva Unive...,What percentage of students get financial aid?...,What percentage of students get financial aid?...,About 80 of our students receive some form of ...


In [None]:
act_ans = []
pred_ans = []

for index, row in preds.iterrows():
  act_ans.append(row['Answer'])
  pred_ans.append(row['final_pred_ans'])

hyps, refs = map(list, (pred_ans,act_ans))
rouge = Rouge()

scores = rouge.get_scores(hyps, refs, avg=True)
scores

{'rouge-1': {'r': 0.44380953373857424,
  'p': 0.28862788266056816,
  'f': 0.3387793741089803},
 'rouge-2': {'r': 0.22098215299398324,
  'p': 0.12511930317556452,
  'f': 0.15258883917904695},
 'rouge-l': {'r': 0.40390810558352086,
  'p': 0.2622262162128633,
  'f': 0.3076996083374783}}

In [None]:
# Create a list of dictionaries for tabulating the scores
score_table = [{'Metric': metric, 'Precision': score['p'], 'Recall': score['r'], 'F1-Score': score['f']} for metric, score in scores.items()]

# Print the scores in a tabular format
print(tabulate(score_table, headers='keys', tablefmt='grid'))

+----------+-------------+----------+------------+
| Metric   |   Precision |   Recall |   F1-Score |
| rouge-1  |    0.288628 | 0.44381  |   0.338779 |
+----------+-------------+----------+------------+
| rouge-2  |    0.125119 | 0.220982 |   0.152589 |
+----------+-------------+----------+------------+
| rouge-l  |    0.262226 | 0.403908 |   0.3077   |
+----------+-------------+----------+------------+
