In [1]:
from dataset_creation import get_sentiment_dataset, get_stance_dataset, SPLIT_TRAIN, SPLIT_VAL, SPLIT_TEST
from tqdm import tqdm 
import json

## Sample Query
```
beam(2)
"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
The input contains a tweet from social media.
First explain the tweet. Then, select the sentiment of the tweet given in the input. Select the sentiment from:
- negative
- neutral
- positive

### Input:
So disappointed in wwe summerslam! I want to see john cena wins his 16th title

### Response:
Explanation:
[EXPLN]

Sentiment:
[RESPONSE]
"""
from 
    lmql.model("/scratch/artahir/llama/hf_weights/llama2-13B-phase2-aug")
where
    len(EXPLN) < 200 and
    len(RESPONSE) < 100 and 
    RESPONSE in ["negative", "neutral", "positive"]
```

In [2]:
# Output files for the instruction-formatted datasets. Both paths are
# relative, so the files are written to the current working directory.
STANCE_SAVE_PATH = "stance_alpaca.json"
SENTIMENT_SAVE_PATH = "sentiment_alpaca.json"

In [3]:
# Building blocks for rendering each example as an
# instruction/input/output record: the instruction text per task and the
# numeric-label -> text-label tables.

# Sentiment task: explain the tweet first, then pick one of three labels.
INSTRUCTION_SENTIMENT_COT = "\n".join([
    "The input contains a tweet from social media.",
    "First explain the tweet. Then, select the sentiment of the tweet given in the input. Select the sentiment from:",
    "- negative",
    "- neutral",
    "- positive",
])

# Stance task: same structure as above, with stance labels.
INSTRUCTION_STANCE_COT = "\n".join([
    "The input contains a tweet from social media.",
    "First explain the tweet. Then, select the stance of the tweet given in the input. Select the stance from:",
    "- none",
    "- against",
    "- favor",
])

# Position in each tuple is the numeric label id (0, 1, 2).
MAPPING_STANCE = dict(enumerate(('none', 'against', 'favor')))

MAPPING_SENTIMENT = dict(enumerate(('negative', 'neutral', 'positive')))

In [4]:
# Pairs of (key used in the saved JSON, split identifier passed to the
# dataset_creation loaders), in the order the splits are processed.
SPLITS = [('train', SPLIT_TRAIN), ('test', SPLIT_TEST), ('val', SPLIT_VAL)]


def build_instruction_dataset(get_dataset, instruction, label_mapping):
    """Build instruction/input/output records for every split of one task.

    Args:
        get_dataset: Callable that takes a split identifier and returns an
            ``(x, y)`` pair whose members both expose ``.to_numpy()``
            (``get_stance_dataset`` / ``get_sentiment_dataset``).
        instruction: Instruction text stored verbatim in every record.
        label_mapping: Dict translating a numeric label into its text label.

    Returns:
        Dict with keys 'train', 'test' and 'val' (in that order); each value
        is a list of dicts with keys 'instruction', 'input' and 'output'.

    Raises:
        KeyError: If a label value is not a key of ``label_mapping``.
    """
    records_by_split = {split_name: [] for split_name, _ in SPLITS}
    for split_name, split in tqdm(SPLITS):
        x, y = get_dataset(split)
        x, y = x.to_numpy(), y.to_numpy()
        # Index by the JSON key (split_name), not by the SPLIT_* constant:
        # indexing by the constant only works if it happens to equal
        # 'train' / 'test' / 'val'.
        records_by_split[split_name] = [
            {
                'instruction': instruction,
                # .item() yields plain Python values that json can serialize;
                # presumably each x_i / y_i holds exactly one element --
                # TODO confirm against dataset_creation.
                'input': x_i.item(),
                'output': label_mapping[y_i.item()],
            }
            for x_i, y_i in zip(x, y)
        ]
    return records_by_split


dataset_stance = build_instruction_dataset(
    get_stance_dataset, INSTRUCTION_STANCE_COT, MAPPING_STANCE
)

dataset_sentiment = build_instruction_dataset(
    get_sentiment_dataset, INSTRUCTION_SENTIMENT_COT, MAPPING_SENTIMENT
)

100%|██████████| 3/3 [00:00<00:00, 87.99it/s]
100%|██████████| 3/3 [00:00<00:00, 24.06it/s]


In [5]:
# Write each dataset to its own JSON file, pretty-printed with a 2-space indent.
for save_path, dataset in (
    (STANCE_SAVE_PATH, dataset_stance),
    (SENTIMENT_SAVE_PATH, dataset_sentiment),
):
    with open(save_path, 'w') as save_file:
        json.dump(dataset, save_file, indent=2)

## Bisect and heapq exercises

In [1]:
import bisect

In [13]:
# Sorted in ascending order, which is what the bisect functions require.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [15]:
# Leftmost index at which 4.5 could be inserted while keeping x sorted.
bisect.bisect_left(x, 4.5)

4

In [6]:
# NOTE: descending order. The bisect functions assume an ascending list, so
# calls on this list do not return meaningful insertion points.
x = [6,5,4,3,2,1]

In [7]:
# On the descending list the binary search keeps moving right and returns
# len(x), which is not a valid sorted position for 3.5.
bisect.bisect_left(x, 3.5)

6

In [11]:
# Inserts 3.5 into x in place, assuming x is sorted ascending; returns None,
# hence no cell output.
bisect.insort(x, 3.5)

In [16]:
# NOTE(review): the output recorded below is the ascending 1..10 list, not the
# result of the insort above. The execution counts (In [16] after In [13])
# suggest this cell was run after x had been redefined.
x

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [17]:
import heapq

In [18]:
# Unsorted sample values to be turned into a heap.
numbers = [3,1,4,1,5,9,2,6,5,3,5]

In [19]:
# Rearranges the list in place into a min-heap (smallest item at index 0);
# returns None, hence no cell output.
heapq.heapify(numbers)

In [20]:
# Heap order, not sorted order: numbers[k] <= numbers[2*k+1] and
# numbers[k] <= numbers[2*k+2] wherever those indices exist.
numbers

[1, 1, 2, 3, 3, 9, 4, 6, 5, 5, 5]

In [21]:
# Adds 7 to the heap in place while keeping the heap invariant.
heapq.heappush(numbers, 7)

In [22]:
# In the output below, 7 sits where 9 was and 9 has moved to the new last slot.
numbers

[1, 1, 2, 3, 3, 7, 4, 6, 5, 5, 5, 9]