## Imports

In [None]:
!pip install datasets
!pip install --upgrade openai
!pip install --upgrade langchain

Collecting datasets
  Using cached datasets-2.19.1-py3-none-any.whl (542 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m27.6 MB/s[0m eta 

In [None]:
from datasets import load_dataset
import openai
import os
import IPython
from langchain.llms import OpenAI
import pandas as pd
import json
import re

In [None]:
from google.colab import userdata

# Pull the OpenAI key from the Colab secrets store and hand it to the
# openai client module in the same statement.
key = openai.api_key = userdata.get('OPENAI_API_KEY')

# Also export it as an environment variable under the same name.
os.environ["OPENAI_API_KEY"] = key

## Functions

In [None]:
def set_open_params(
    model="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
):
    """Bundle the chat-completion settings into a single dict.

    Each keyword argument is stored under a key of the same name, so the
    result can be handed to ``get_completion`` as its ``params`` argument.

    Returns:
        dict with the keys ``model``, ``temperature``, ``max_tokens``,
        ``top_p``, ``frequency_penalty`` and ``presence_penalty``.
    """
    return {
        'model': model,
        'temperature': temperature,
        'max_tokens': max_tokens,
        'top_p': top_p,
        'frequency_penalty': frequency_penalty,
        'presence_penalty': presence_penalty,
    }

def get_completion(params, messages):
    """Send ``messages`` to the chat-completions endpoint and return the reply.

    Args:
        params: mapping that must contain the keys ``model``, ``temperature``,
            ``max_tokens``, ``top_p``, ``frequency_penalty`` and
            ``presence_penalty`` (the dict built by ``set_open_params``).
        messages: value passed through unchanged as the ``messages`` argument.

    Returns:
        Whatever ``openai.chat.completions.create`` returns, unmodified.

    Raises:
        KeyError: if ``params`` lacks one of the keys listed above.
    """
    forwarded_keys = (
        'model',
        'temperature',
        'max_tokens',
        'top_p',
        'frequency_penalty',
        'presence_penalty',
    )
    # Look every setting up eagerly so a missing key fails before the request.
    request_kwargs = {option: params[option] for option in forwarded_keys}
    return openai.chat.completions.create(messages=messages, **request_kwargs)

In [None]:
# Split names looked up in every dataset, in this order.
data_split = ["train", "validation", "test"]

def load_data(name:str):
    """Load the dataset called ``name`` and return its splits keyed by name.

    The dataset is switched to the "pandas" format before slicing. Every
    entry of ``data_split`` appears in the result; a split that the dataset
    does not contain maps to ``None``.
    """
    dataset = load_dataset(name)
    dataset.set_format("pandas")
    return {
        split: (dataset[split][:] if split in dataset.keys() else None)
        for split in data_split
    }

### Prompt format testing

In [None]:
def generate_prompt(sentence1, sentence2, score=4.6):
    """Build the paragraph-extension prompt for one sentence pair.

    Args:
        sentence1: first sentence of the pair, inserted after "<sentence1>:".
        sentence2: second sentence of the pair, inserted after "<sentence2>:".
        score: relatedness score inserted after "<score>:". Defaults to 4.6,
            the value that used to be hard-coded in the prompt, so existing
            two-argument calls produce exactly the same text as before.
            Pass the pair's actual score so the model is told the right
            degree of relatedness to preserve.

    Returns:
        The complete prompt string, ending with the "output:" marker line.
    """
    prompt = f"""You have to extend the given 2 sentences to N more sentences, respectively, where N is a number randomly chosen from 3 to 7. And you should still retain their semantic relatedness, which is given by a score ranging from 0-5, the higher the score is, the more related the sentences are.

```desired format:
// use brackets to indicate the generated paragraphs
paragraph1: [<generated paragraph 1>]
paragraph2: [<generated paragraph 2>]
// do not print out the score
```query:
<sentence1>: {sentence1}
<sentence2>: {sentence2}
<score>: {score}

```output:
"""
    return prompt

In [None]:
# Single-pair trial run of the prompt using the default completion settings.
params = set_open_params()

sentence1 = "U.S. prosecutors have arrested more than 130 individuals and have seized more than $17 million in a continuing crackdown on Internet fraud and abuse." # @param {type:"string"}
sentence2 = "More than 130 people have been arrested and $17 million worth of property seized in an Internet fraud sweep announced Friday by three U.S. government agencies." # @param {type:"string"}

# The whole instruction travels as one user turn.
prompt = generate_prompt(sentence1, sentence2)
messages = [{"role": "user", "content": prompt}]

response = get_completion(params, messages)


In [None]:
# Show the text of the first returned choice as the cell's output.
response.choices[0].message.content

'paragraph1: [The crackdown on Internet fraud and abuse by U.S. prosecutors has been highly successful, resulting in the arrest of over 130 individuals. Additionally, the authorities have managed to seize an impressive $17 million worth of assets. This ongoing effort to combat online scams and deception is crucial in protecting innocent victims from falling prey to fraudulent schemes. The sheer magnitude of the arrests and seizures serves as a powerful deterrent to would-be criminals who seek to exploit others through illegal means.]\n\nparagraph2: [The joint operation carried out by three U.S. government agencies to tackle Internet fraud has yielded significant results, with more than 130 people being taken into custody. Furthermore, the seizure of $17 million in assets sends a strong message that such criminal activities will not be tolerated. The collaboration between these agencies underscores the importance of a united front in the fight against cybercrime. This coordinated effort

## Dataset creation

### Load raw data

In [None]:
data_list = ['mteb/stsbenchmark-sts', 'mteb/sts12-sts', 'mteb/sts13-sts', 'mteb/sts14-sts', 'mteb/sts15-sts']
columns = ["sentence1", "sentence2", "score"]

# Gather the three columns of interest from every available split of every
# dataset. Selecting whole columns per split avoids building the table one
# row at a time with iterrows().
frames = []
for name in data_list:
    data = load_data(name)
    for split in data_split:
        if data[split] is None:
            # load_data maps splits that the dataset lacks to None.
            continue
        frames.append(data[split][columns])

# pd.concat raises on an empty list, so fall back to an empty table with the
# expected columns when nothing was loaded.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

# Identical (sentence1, sentence2, score) rows are kept once; the index is
# renumbered afterwards.
df = df.drop_duplicates(ignore_index=True)
df.to_json("./data.json", orient="records")
print(df)

                                               sentence1  \
0                                 A plane is taking off.   
1                        A man is playing a large flute.   
2          A man is spreading shreded cheese on a pizza.   
3                           Three men are playing chess.   
4                            A man is playing the cello.   
...                                                  ...   
20386                a baseball player throws the ball .   
20387           a man is swinging on a rope over water .   
20388  a woman wearing large sunglasses holds newspap...   
20389                             a deer jumps a fence .   
20390  a young girl dressed in a minnie mouse outfit ...   

                                               sentence2  score  
0                            An air plane is taking off.   5.00  
1                              A man is playing a flute.   3.80  
2      A man is spreading shredded cheese on an uncoo...   3.80  
3              

### Sentence extension


In [None]:
llm_name = "gpt-3.5-turbo" # @param ["gpt-3.5-turbo", "gpt-4"]
# The reply must hold two paragraphs of several sentences each, both closed
# with "]". The 256-token default of set_open_params appears too small for
# that (replies get cut off before the second closing bracket), so allow a
# larger completion budget here.
params = set_open_params(model=llm_name, max_tokens=1024)

In [None]:
# Load the sentence pairs written by the raw-data step. Despite the name,
# `df` is a plain list of dicts with the keys sentence1 / sentence2 / score.
with open('data.json', 'r') as file:
    df = json.load(file)

total = len(df)          # real number of pairs, used for the progress line
extended_rows = []       # [paragraph1, paragraph2, score] per successful pair
failed_indices = []      # positions in `df` whose reply could not be parsed

for cnt, data in enumerate(df):
    print(f"data {cnt}/{total}")
    # NOTE: only the two sentences are passed to generate_prompt here; the
    # pair's own score (data['score']) is not part of the prompt.
    prompt = generate_prompt(data['sentence1'], data['sentence2'])
    messages = [
        {
            "role": "user",
            "content": prompt
        }
    ]
    response = get_completion(params, messages)
    # Treat a missing message body as an empty reply so parsing cannot crash.
    query_result = response.choices[0].message.content or ""
    print(query_result)

    # Each paragraph is expected inside its own [...] pair. A reply that was
    # cut off, or that ignored the format, yields fewer than two matches;
    # record and skip it instead of failing with IndexError and losing all
    # results collected so far.
    paragraphs = re.findall(r'\[([^\]]+)\]', query_result)
    if len(paragraphs) < 2:
        failed_indices.append(cnt)
        print(f"skipping data {cnt}: found {len(paragraphs)} bracketed paragraph(s), expected 2")
        continue

    paragraph1 = paragraphs[0]
    paragraph2 = paragraphs[1]
    score = data['score']
    extended_rows.append([paragraph1, paragraph2, score])

# A plain list has no to_json(); build a DataFrame before writing the file.
extended_data = pd.DataFrame(extended_rows, columns=["paragraph1", "paragraph2", "score"])
extended_data.to_json("./extended_data.json", orient="records")
print(extended_data)
print(f"{len(failed_indices)} of {total} pairs skipped (unparseable reply): {failed_indices}")

data 0/20351
paragraph1: [The plane slowly gains altitude as it takes off from the runway, the engines roaring with power. The passengers feel a mix of excitement and nervousness as the aircraft lifts into the sky, leaving the ground below.]

paragraph2: [An air plane, with its sleek design, gracefully ascends into the clouds as it takes off from the airport. The passengers inside watch in awe as the scenery below becomes smaller and smaller, signaling the beginning of their journey.]
data 1/20351
paragraph1: [The man playing the large flute was a skilled musician, effortlessly moving his fingers along the keys to produce beautiful melodies. The sound of the flute echoed through the room, captivating everyone who listened. It was clear that music was not just a hobby for him, but a passion that he poured his heart and soul into. The intricate notes he played filled the air with a sense of tranquility and joy, creating a mesmerizing atmosphere in the room.]

paragraph2: [Meanwhile, the 

IndexError: list index out of range