In [None]:
#@title Install required packages & Import
!pip install -q transformers  sentencepiece

from google.colab import drive
import re

from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd

from google.colab import drive
# Mount Google Drive so the project folder is reachable from this runtime.
drive.mount('/content/drive')
# Switch the working directory to the project folder. The relative paths used
# later in the notebook (model/, rawdata/, ICL_Data_6000/) are resolved from here.
%cd /content/drive/My Drive/Study Material/MSc ML CW/COMP0087 CW

# Alternative project locations, left commented out:
# path = '/content/drive/My Drive/dsml/comp0087'
# path = '/content/drive/MyDrive/COMP0087'
# path = '/content/MyDrive/Study Material/MSc ML CW/COMP0087 CW'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Study Material/MSc ML CW/COMP0087 CW


To run the code, download the model and the raw ICL data from the following link:

https://drive.google.com/drive/folders/1MDnQNheVok6lMdnv0ZauVVu1AzhF_CO0?usp=sharing

In [None]:
# @title Define Parameter Values & Required Functions

%%capture 
def generate_sample_summary(model, sample_id=0):
  """Generate a summary for one row of `wikihow_data` with the given model.

  Args:
    model: object exposing `generate(...)`; the token ids it returns are
      decoded with the module-level `tokenizer_t5`.
    sample_id: label used to index the 'manual_raw_message' column.

  Returns:
    A 3-tuple `(raw_message, generated_summary, reference_summary)`. The last
    element is always None; it is a placeholder slot for a reference summary.
  """
  source_text = wikihow_data['manual_raw_message'][sample_id]

  # Generation settings: 10 to 100 new tokens, no repeated bigrams.
  generation_options = dict(
      min_new_tokens=10,
      max_new_tokens=100,
      early_stopping=True,
      no_repeat_ngram_size=2,
  )
  encoded = tokenizer_t5(source_text, return_tensors="pt")
  token_ids = model.generate(**encoded, **generation_options)

  # Only the first decoded sequence is kept.
  decoded = tokenizer_t5.batch_decode(token_ids, skip_special_tokens=True)
  return source_text, decoded[0], None

def print_fine_tune_result(raw_message, generated_summary, reference_summary):
  """Print the source message, the generated summary and, if given, the reference.

  The generated summary is split so that every run of digits starts a new
  line, which puts each numbered step ("1. ...", "2. ...") on its own line.
  Digits that appear inside a step also start a new line.

  Args:
    raw_message: the original text that was summarised.
    generated_summary: the model output as a single string.
    reference_summary: optional reference text; printed only when truthy.
  """
  steps = re.findall(r'\d+\D+', generated_summary)
  # Without any numbered step the pattern matches nothing; fall back to the
  # raw text so the model output is shown instead of a blank summary.
  formatted_generated_summary = "\n".join(steps) if steps else generated_summary

  print('Original Message: \n{}\n'.format(raw_message))
  print('Generated Summary: \n{}\n'.format(formatted_generated_summary))
  if reference_summary:
    print('reference_summary: ' + reference_summary)

# Model identifiers. `t5` is the name the tokenizer is loaded from; the other
# two are relative paths to the fine-tuned checkpoints (per the notes in section 2:
# model_t5_base_1 = ground truth only, model_t5_example = ground truth + CoT data).
t5 = 't5-base'
t5_gt = 'model/model_t5_base_1'
t5_cot_fine_tuned = 'model/model_t5_example'

# Load both fine-tuned models from their checkpoint paths.
model_t5_ground_truth = T5ForConditionalGeneration.from_pretrained(t5_gt)
model_t5_cot_fine_tuned = T5ForConditionalGeneration.from_pretrained(t5_cot_fine_tuned)

# Load the tokenizer; the same tokenizer is used with both models.
tokenizer_t5 = T5Tokenizer.from_pretrained(t5)

# Load the dataset that generate_sample_summary reads its input messages from.
wikihow_data = pd.read_csv('rawdata/lite_manual_unlabeled.csv') 

In [None]:
# Show the (rows, columns) shape of the loaded dataset.
wikihow_data.shape

(56373, 4)

# 1. ICL Demo 
This part demonstrates how an LLM (GPT-3 Davinci) can be utilised to synthesise high-quality training data in a scenario with limited labelled data. We will use the WikiHow dataset to demonstrate text summarisation. 
We will use 5 different prompting strategies for ICL to generate the summaries, and demonstrate how the results can be improved by using more advanced and innovative prompting methods. 

This part is linked to source code part 4. The supporting document can be found at:
https://drive.google.com/drive/folders/1MDnQNheVok6lMdnv0ZauVVu1AzhF_CO0?usp=sharing

Here we demonstrate the prompts produced by different prompt retrieval methods (the data is inside the folder named "ICL_Data_6000"):

* One-shot random-retrieval: one prompt demo example is randomly selected from the demo dataset

* Two-shot random-retrieval: two prompt demo examples are randomly selected from the demo dataset

* Two-shot similarity-retrieval: two prompt demo examples are selected based on their similarity ranking with respect to the test text. To measure similarity, we use the sentence-bart pre-trained model to convert each instructional message into a 787-dimensional numerical vector, so that we can calculate the embedding distances between messages. 

* Two-shot autoprompt-retrieval: unlike the pure similarity method, the autoprompt method first uses the k-means clustering algorithm to assign each demo example to one of k groups and selects the most similar demo example from each group. Finally, the top 2 form the prompt examples. Combining similarity with k-means balances the diversity and similarity of the prompt examples.  

* Two-shot autoprompt-retrieval + CoT table: once the two examples have been selected, we use a set of prompt questions for each example to construct the so-called 'CoT table', which lays out the action, purpose, result and relationship between the steps in a table. This clear presentation of the logical connections is intended to enhance the reasoning ability of LLMs. The details can be found in the report. 



In [None]:
# Load the ICL results, one CSV per prompt-retrieval strategy.
# one-shot, random retrieval
ICL_6000_one_shot_random = pd.read_csv('ICL_Data_6000/test_data_ans_one_shot_random.csv')

# two-shot, random retrieval
ICL_6000_two_shot_random = pd.read_csv('ICL_Data_6000/test_data_ans_two_shot_random.csv')

# two-shot, similarity retrieval
ICL_6000_two_shot_similarity = pd.read_csv('ICL_Data_6000/test_data_ans_two_shot_similarity.csv')

# two-shot, similarity retrieval combined with K-means clustering
ICL_6000_two_shot_similarity_Kmeans = pd.read_csv('ICL_Data_6000/test_data_ans_two_shot_similarity_Kmeans.csv')

# two-shot, similarity + K-means retrieval with the CoT table
# NOTE(review): the variable is prefixed ICL_1500 although the file is read
# from ICL_Data_6000 — confirm which sample count is intended.
ICL_1500_two_shot_similarity_Kmeans_CoT = pd.read_csv('ICL_Data_6000/test_data_ans_two_shot_similarity_Kmeans_CoT.csv')

In [None]:
# function to print prompt messages from different retrieval methods
def Prompt_Demonstration(index):
  """Print the 'prompt' entry at `index` for each of the five retrieval results.

  The results are printed in this order: one-shot random, two-shot random,
  two-shot similarity, two-shot similarity + K-means, and K-means + CoT.
  Every prompt is followed by a dashed separator line.
  """
  retrieval_results = (
      ICL_6000_one_shot_random,
      ICL_6000_two_shot_random,
      ICL_6000_two_shot_similarity,
      ICL_6000_two_shot_similarity_Kmeans,
      ICL_1500_two_shot_similarity_Kmeans_CoT,
  )
  for result_frame in retrieval_results:
    print(result_frame['prompt'][index] + '\n \n')
    print('--------------------------------------------------------------')


# function to print generated answer from different retrieval methods
def Prompt_Ans_Demonstration(index):
  """Print the 'manual_sumamy' entry at `index` for each of the five retrieval results.

  The results are printed in this order: one-shot random, two-shot random,
  two-shot similarity, two-shot similarity + K-means, and K-means + CoT.
  Every answer is followed by a dashed separator line.
  """
  divider = '--------------------------------------------------------------'
  answer_sources = [
      ICL_6000_one_shot_random,
      ICL_6000_two_shot_random,
      ICL_6000_two_shot_similarity,
      ICL_6000_two_shot_similarity_Kmeans,
      ICL_1500_two_shot_similarity_Kmeans_CoT,
  ]
  for source in answer_sources:
    # The column key is spelled exactly as it appears in the loaded data.
    answer_text = source['manual_sumamy'][index]
    print(answer_text + '\n \n')
    print(divider)

In [None]:
# Row index of the test example to display in the demonstration cells below.

index = 1400

In [None]:
# Prompt message for each retrieval method.
# The individual messages are separated by a line of dashes.

Prompt_Demonstration(index)

[Message]:
How to Extend a Partition in Ubuntu:
1. Open GParted: If you don't have GParted downloaded, you can get it from https://gparted.org/livecd.php. Otherwise, GParted will be located in the Dash in Ubuntu Live environments.
2. Click the partition you want to resize from the menu in the upper right corner: You can't edit partitions that are actively in use. If a partition is mounted, unmount it by clicking Eject in the file manager.
3. Right-click a partition and click Resize: When you right-click, a menu will appear next to your cursor and the resize button is usually the second option in the list.
4. Click and drag the handles on either side of the bar to expand the space: If you aren't able to make the space larger, you might need to repeat these steps to shrink a different partition first.
5. Click Resize/Move and Apply: These changes will be added to a queue that you can watch via a progress bar. This process may take a few seconds or a few minutes, depending on the number o

In [None]:
# Generated answer for each retrieval method.
# The individual answers are separated by a line of dashes.

Prompt_Ans_Demonstration(index)


How to Use the Red Eye Tool in Photoshop:
1. Put the image that you want up in Photoshop. 
2. In the toolbox, right click on the plaster (band-aid). Click on the icon that looks like an eye and is labeled "Red eye tool". 
3. Click the eyes on your image in order to remove the red eye. 
4. You will know when you have done this successfully because you will notice the eyes are no longer flashy and the image looks much better.
 

--------------------------------------------------------------


How to Use the Red Eye Tool in Photoshop:
1. Put the image that you want up in Photoshop.
2. In the toolbox, right click on the plaster (band-aid). Click on the icon that looks like an eye and is labeled "Red eye tool".
3. Click the eyes on your image in order to remove the red eye. 
4. You will know when you have done this successfully because you will notice the eyes are no longer flashy and the image looks much better.
5. You have now removed the redeye and you can now do what you want with the 

# 2. Fine-tune Demo
A smaller language model (LM) offers advantages such as higher training efficiency and faster inference. We will leverage these benefits by using the high-quality synthetic data generated by the LLM in the previous part to fine-tune a smaller LM (T5, for demonstration purposes), aiming to achieve superior summarisation results on the WikiHow dataset. 

This part is linked to source code part 5. The supporting document can be found at:
https://drive.google.com/drive/folders/1MDnQNheVok6lMdnv0ZauVVu1AzhF_CO0?usp=sharing

* Two models have been selected as example models, named "model_t5_base_1" and "model_t5_example". The first is a T5 base model fine-tuned using only the ground truth dataset, while the second is fine-tuned using both the ground truth and CoT datasets.

* The results for the source code, which runs each model five times, are shown as CSV files listed in the following Google Drive folder and named "5_runs". These data are used to calculate the average evaluation metrics scores along with their standard deviations (used in Table 5).

* The results for the source code, which runs each model once and evaluates them for each epoch, can be found in the Google Drive folder named "epoch" (used in Figure 2)

### 2.1 Fine-tune T5 with Ground Truth Data Only

The following section presents specific examples illustrating the performance of two T5-Base models: one fine-tuned using only the ground truth dataset and another fine-tuned using both the ground truth and CoT datasets. By visualizing the differences in these particular results, we can observe some improvements made by incorporating the CoT dataset in the fine-tuning process.

In [None]:
# Summarise sample 1095 with the model fine-tuned on ground truth data only,
# then print the original message next to the generated summary.
raw_message, generated_summary, reference_summary = generate_sample_summary(model=model_t5_ground_truth, sample_id=1095)
print_fine_tune_result(raw_message, generated_summary, reference_summary)

Original Message: 
1. Type in "Cleartype" on the Start Menu in Windows. Select the "Settings" option with the gear icon.;
2. Click on the "Adjust Cleartype text" option when it appears on-screen.

3. Check the "Turn on Cleartype" box if it's not on already.

4. Click "Next" through each screen of the setup. This will involve selecting which box has text that is the most clear to you on your particular monitor.

5. Click "Finish" to complete the Cleartype setup process.



Generated Summary: 
2. Click on the "Cleartype text" option when it appears on-screen. 
2. Select the Cleartype option. 
3. Click "Next" through each screen. 
4. Click the OK button. 
5. Click OK. 
6. Click Next. 
7.



### 2.2 Fine-tune T5 with Ground Truth + ICL Data


In [None]:
# Summarise the same sample (1095) with the model fine-tuned on ground truth + CoT data.
# NOTE(review): "23, 26" below are presumably other sample ids of interest — confirm or remove.
# 23, 26
raw_message, generated_summary, reference_summary = generate_sample_summary(model=model_t5_cot_fine_tuned, sample_id=1095)
print_fine_tune_result(raw_message, generated_summary, reference_summary)

Original Message: 
1. Type in "Cleartype" on the Start Menu in Windows. Select the "Settings" option with the gear icon.;
2. Click on the "Adjust Cleartype text" option when it appears on-screen.

3. Check the "Turn on Cleartype" box if it's not on already.

4. Click "Next" through each screen of the setup. This will involve selecting which box has text that is the most clear to you on your particular monitor.

5. Click "Finish" to complete the Cleartype setup process.



Generated Summary: 
1. Type in Cleartype on the Start Menu in Windows. 
2. Click on "Settings". 
3. Click the "Adjust ClearType text" option. 
4. Check the box with "Turn on Cleartyping". 
5. Click Next. 
6. Click Finish.

