### Packages & troubleshoot

In [1]:
!pip install transformers
!pip install accelerate
!pip install torch
!pip install torchvision torchaudio
!pip install gc
!pip install bitsandbytes

Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Using cached huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Using cached transformers-4.49.0-py3-none-any.whl (10.0 MB)
Using cached huggingface_hub-0.29.1-py3-none-any.whl (468 kB)
Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux

In [1]:
import gc

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import bitsandbytes

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [3]:
import os
import pandas as pd
import numpy as np
import pickle
import re
import json
import tqdm
from tqdm import tqdm
# Enable tqdm for pandas
tqdm.pandas()

In [4]:
# Check if CUDA (GPU) is available
print(torch.cuda.is_available())  

True


In [5]:
#Memory usage handling
#Frees up unused GPU memory that was previously allocated by PyTorch.
torch.cuda.empty_cache()

#Triggers Python's garbage collector to reclaim unused objects in memory.
gc.collect()

40

### LLAMA Implementation

In [6]:
with open("access_token_llama.txt", "r") as file: 
    token = file.read() 
#os.environ["HUGGINGFACE_HUB_TOKEN"] = token

In [7]:
model_id = "meta-llama/Llama-3.3-70B-Instruct"

##### Other implementation

In [8]:
# model_id = "meta-llama/Llama-3.3-70B-Instruct"
# quantization_config_arg = BitsAndBytesConfig(load_in_4bit=True)

# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model_id,
#     model_kwargs={"torch_dtype": torch.bfloat16},
#     device_map="auto",
#     quantization_config=quantization_config_arg,
#     token = token)

##### Model loading

In [9]:
quantization_config_arg = BitsAndBytesConfig(load_in_4bit=True)#,
                                            #llm_int8_enable_fp32_cpu_offload=True)

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config_arg,
    torch_dtype=torch.bfloat16
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [10]:
model.hf_device_map

{'': 0}

#### Model implementation

##### Prompt build up

In [11]:
#General classification prompt {cliff_prompt}
with open("prompt/llama_cliff_prompt.txt", "r") as file: 
    cliff_prompt = file.read() 

#Codebook with topic and subtopic descriptions {topics_descrip}
with open("prompt/1_ordered_codebook_prompt.txt", "r") as file:  
    topics_descrip = file.read()

#Output examples {json_examples}
with open("prompt/output_examples_prompt.txt", "r") as file:  
    json_examples = file.read() 

# Trial texts {text}
#with open("prompt/nyt_prompt_example.txt", "r") as file: 
#    text = file.read()  

with open("prompt/knitting_prompt_example.txt", "r") as file:
    text = file.read()

In [12]:
text_type =  "New York Times"
#text_type =  "USA Congress"

full_cliff_prompt = cliff_prompt.format(text_type = text_type,
                                   json_examples = json_examples,
                                   topics_descrip = topics_descrip,
                                   text = text,
                                   year = "2024")

In [13]:
print(full_cliff_prompt)

<begin_of_text_token>You are a classifier for economic inequality-related texts based on a codebook. You will receive a text from the New York Times (year: 2024) that deals with economic inequality, and your task is to classify it according to the most prominent topic and subtopic. Use the codebook provided below.

# Instructions:
1. Determine the TOPIC:
Select the single primary topic (numbered 1 to 14) that best matches the text's main theme based on the descriptions provided below. Every text has a topic, choose the one that most closely aligns with the dominant message or has according keywords.
2. Determine the SUBTOPIC:
Select the corresponding primary subtopic (letter A to D). Use 'none' as subtopic if no subtopic aligns with the text within the chosen topic.
3. Justify:
Provide a justification of why you applied the selected code.

# JSON Output description:
- Respond with a JSON in the following fashion: 
{"topic": "number", "subtopic": "letter", "justification": "[insert 1-2 

##### Prompt trial

In [14]:
# Tokenize input and move to CUDA
inputs = tokenizer(full_cliff_prompt, return_tensors="pt").to("cuda")  # Move tensors to GPU

# Generate response
outputs = model.generate(**inputs, max_new_tokens=80,
                         pad_token_id=tokenizer.eos_token_id)

In [15]:
# Decode and print output
response_spec = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [16]:
response_spec[-500:]

'eedle so that the loops from the prior row can be pulled off the other needle without unraveling.\n\n# JSON Output: \n{"topic": "12", "subtopic": "A", "justification": "The text mentions the knitting industry\'s potential to pollute the environment, leading to greater economic inequalities for the global south, which relates to environmental degradation and its impact on economic inequality."} \nNote: The JSON Output provided is just an example, you should provide your own answer based on the text. \n'

##### Answer post-processing

In [17]:
#Regex explanation
#.: Matches any character except a newline (\n).
#*: Matches zero or more of the preceding character (in this case, any character).
#?: Makes the * non-greedy (or lazy), meaning it matches as few characters as 
#possible. Without the ?, the regex would try to match the longest possible 
#string, which would cause issues when dealing with multiple sets of curly braces.

In [18]:
def extract_last_json(text, idx, idx_error, column):
    # Find all occurrences of JSON-like (between keys) objects
    text_subset = text[-500:]
    matches = re.findall(r'\{.*?\}', text_subset.strip())
    json_error_keywords = ["number", "letter"]
    
    if matches:
        if not any(keyword in matches[-1] for keyword in json_error_keywords):
            return matches[-1]
        else:
            print(f"Warning: Error JSON in row {idx}")
            idx_error.append({"idx":idx, "error": "error_json", "column" : column})
            return text[-500:]  # Return the last match
    else:
        print(f"Warning: No JSON was found in row {idx}.")
        idx_error.append({"idx":idx, "error": "no_json", "column" : column})
        return text[-500:]

In [19]:
idx, idx_error = 0, []
json_result = extract_last_json(response_spec, idx, idx_error, "ordered")
print(json_result)

{"topic": "12", "subtopic": "A", "justification": "The text mentions the knitting industry's potential to pollute the environment, leading to greater economic inequalities for the global south, which relates to environmental degradation and its impact on economic inequality."}


##### Sampled dataset answers

In [20]:
# Dropped in server
# Path: "..\3_stm_fit_R\out_meta_stm.csv"
out_meta = pd.read_csv("out_meta_stm.csv")

In [21]:
out_meta.head(1)

Unnamed: 0,id_,author,year,date,year_n,source,dataset,text,tokens_R
0,0,MARC F. PLATTNER; Marc F. Plattner is on the s...,1980,1980-06-08,1,nyt,nyt,WHO GETS WHAT THE ZERO-SUM SOCIETY Distributi...,get zero sum society distribution possibilitie...


In [22]:
#General classification prompt {cliff_prompt}
with open("prompt/llama_cliff_prompt.txt", "r") as file: 
    cliff_prompt = file.read()

#Codebook with topic and subtopic descriptions {topics_descrip}
with open("prompt/1_ordered_codebook_prompt.txt", "r") as file:  
    ordered_topics_descrip = file.read() 
# with open("prompt/2_reversed_codebook_prompt.txt", "r") as file:  
#     reversed_topics_descrip = file.read() 
# with open("prompt/3_shuffled_codebook_prompt.txt", "r") as file:  
#     shuffled_topics_descrip = file.read() 
    
#Output examples {json_examples}
with open("prompt/output_examples_prompt.txt", "r") as file:
    json_examples = file.read() 

In [23]:
def cliff_case(row, cliff_prompt, json_examples, topics_descrip, idx, idx_error, column):
    text = row["text"]
    year = row["year"]
    text_type = "New York Times" if row["dataset"] == 'nyt' else "USA Congress"
    cliff_prompt = cliff_prompt.format(text_type = text_type,
                                       json_examples = json_examples,
                                       topics_descrip = topics_descrip,
                                       text = text,
                                       year = year)

    # Tokenize input and move to CUDA
    inputs = tokenizer(cliff_prompt, return_tensors="pt").to("cuda")  # Move tensors to GPU
    # Generate response
    outputs = model.generate(**inputs, 
                             max_new_tokens=90, 
                             pad_token_id=tokenizer.eos_token_id,
                             temperature=0.2)
    #Read outputs
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    #Extract last json
    json_result = extract_last_json(response, idx, idx_error, column)
    #{"justification": "Introduction of the INVEST Act, which focuses on helping small- and medium-sized businesses by allowing them to recover their costs faster.", "topic": "5", "subtopic": "A"} 

    return json_result

In [24]:
def llama_inference(df, cliff_prompt, json_examples):
    df_cliff = df.copy()  # Avoid modifying the original DataFrame
    idx_error = []
    df_cliff["llama_cliff_4bit_2502_ordered"] = df.progress_apply(lambda row: cliff_case(row,
                                                                       cliff_prompt, 
                                                                       json_examples, 
                                                                       ordered_topics_descrip,
                                                                       row.name, idx_error, column = "ordered"),
                                                                  axis=1)
    
    # df_cliff["llama_cliff_4bit_2402_reversed"] = df.progress_apply(lambda row: cliff_case(row,
    #                                                                    cliff_prompt, 
    #                                                                    json_examples, 
    #                                                                    reversed_topics_descrip,
    #                                                                    row.name, idx_error, column = "reversed"),
    #                                                               axis=1)
    
    # df_cliff["llama_cliff_4bit_2402_shuffled"] = df.progress_apply(lambda row: cliff_case(row,
    #                                                                    cliff_prompt, 
    #                                                                    json_examples, 
    #                                                                    shuffled_topics_descrip,
    #                                                                    row.name, idx_error, column = "shuffled"),
    #                                                               axis=1)

    #def most_common_result(row):
    #    results = [{"topic": r["topic"], "subtopic": r["subtopic"]} for r in [row["llama_cliff_4bit_2402_ordered"], row["llama_cliff_4bit_2402_reversed"], row["llama_cliff_4bit_2402_shuffled"]] if r is not None]
    #    if results:
    #        most_common = Counter(map(tuple, results)).most_common(1)
    #        return dict(most_common[0][0]) if most_common else {"topic": row["llama_cliff_4bit_2402_ordered"]["topic"], "subtopic": row["llama_cliff_4bit_2402_ordered"]["subtopic"]}
    #    return {"topic": row["llama_cliff_4bit_2402_ordered"]["topic"], "subtopic": row["llama_cliff_4bit_2402_ordered"]["subtopic"]}
    
    #df_cliff["llama_cliff_4bit_2402_most_common"] = df_cliff.apply(most_common_result, axis=1)
    
    idx_error_df = pd.DataFrame(idx_error)

    
    idx_error_df = pd.DataFrame(idx_error)
    
    return df_cliff, idx_error_df

In [25]:
df1 = out_meta.sample(n=10, axis=0)

In [26]:
df2, idx_error_df = llama_inference(df1, cliff_prompt,json_examples)

100%|██████████| 10/10 [00:58<00:00,  5.85s/it]


In [27]:
for i,text in enumerate(df2["llama_cliff_4bit_2502_ordered"]):
    print(i, text)

0 {"topic": "2", "subtopic": "B", "justification": "The text discusses tax provisions, specifically the leasing of unused tax benefits, and its implications on corporate income tax, which falls under the topic of Macroeconomics and the subtopic of Taxation."}
1 {"topic": "6", "subtopic": "A", "justification": "The text focuses on introducing legislation to correct a longtime inequity in military compensation, specifically providing separation pay benefits for enlisted personnel who are involuntarily separated from the service, which falls under public sector employment and NYC policies, particularly public employment salaries."}
2 {"topic": "4", "subtopic": "A", "justification": "The text discusses the conversion of existing rental apartments to condominium or cooperative ownership, focusing on the benefits and drawbacks of this trend, including tax advantages, appreciation, land use, preservation of housing stock, conservation, participatory democracy, and civic responsibility."}
3 {"

In [28]:
idx_error_df

##### Dataset run

In [30]:
chunk_size = 2000
num_chunks = len(out_meta) // chunk_size + (1 if len(out_meta) % chunk_size != 0 else 0)

In [None]:
output_folder = "llama_cliffs"

for i in range(num_chunks):
    # Define the chunk range
    start_idx = i * chunk_size
    end_idx = min(start_idx + chunk_size, len(out_meta))
    out_meta_chunk = out_meta[start_idx:end_idx] 
    print(f"Processing chunk {i+1} ({start_idx}:{end_idx})")

    # Run inference
    chunk_cliff, chunk_error_idx = llama_inference(out_meta_chunk, cliff_prompt, json_examples)

    # Save individual chunk results in the "llama_cliffs" folder
    chunk_cliff.to_csv(os.path.join(output_folder, f'out_meta_chunk_{i+1}_cliff_2502.csv'), index=False)
    chunk_error_idx.to_pickle(os.path.join(output_folder, f'idx_error_chunk_{i+1}_cliff_2502.pkl'))

Processing chunk 1 (0:2000)


  4%|▎         | 72/2000 [06:12<3:48:12,  7.10s/it]



  6%|▌         | 113/2000 [09:43<2:44:45,  5.24s/it]



 14%|█▍        | 279/2000 [24:04<2:51:50,  5.99s/it]



 15%|█▍        | 293/2000 [25:31<4:04:18,  8.59s/it]



 17%|█▋        | 336/2000 [29:19<2:25:35,  5.25s/it]



 18%|█▊        | 367/2000 [31:59<2:44:06,  6.03s/it]



 18%|█▊        | 368/2000 [32:04<2:41:29,  5.94s/it]



 23%|██▎       | 453/2000 [39:40<2:25:16,  5.63s/it]



 24%|██▎       | 473/2000 [41:20<2:23:38,  5.64s/it]



 25%|██▌       | 501/2000 [43:53<2:26:23,  5.86s/it]



 26%|██▌       | 511/2000 [44:46<2:49:19,  6.82s/it]



 28%|██▊       | 568/2000 [49:47<2:22:58,  5.99s/it]



 31%|███▏      | 625/2000 [54:42<2:19:22,  6.08s/it]



 31%|███▏      | 628/2000 [54:58<2:10:42,  5.72s/it]



 34%|███▍      | 682/2000 [59:37<2:18:00,  6.28s/it]



 35%|███▍      | 699/2000 [1:01:08<1:50:34,  5.10s/it]



 35%|███▌      | 706/2000 [1:01:41<1:32:58,  4.31s/it]



 36%|███▋      | 725/2000 [1:03:15<1:47:57,  5.08s/it]



 38%|███▊      | 766/2000 [1:06:51<1:54:24,  5.56s/it]



 39%|███▊      | 771/2000 [1:07:19<1:53:55,  5.56s/it]



 40%|███▉      | 793/2000 [1:09:17<2:06:59,  6.31s/it]



 45%|████▍     | 894/2000 [1:18:13<1:42:10,  5.54s/it]



 45%|████▍     | 896/2000 [1:18:24<1:44:53,  5.70s/it]



 48%|████▊     | 960/2000 [1:24:06<1:30:42,  5.23s/it]



 54%|█████▍    | 1080/2000 [1:34:46<1:20:29,  5.25s/it]



 62%|██████▏   | 1233/2000 [1:48:23<1:08:36,  5.37s/it]



 67%|██████▋   | 1344/2000 [1:57:37<58:12,  5.32s/it]  



 68%|██████▊   | 1354/2000 [1:58:30<58:40,  5.45s/it]  



 68%|██████▊   | 1363/2000 [1:59:23<1:05:33,  6.17s/it]



 72%|███████▏  | 1430/2000 [2:05:47<56:34,  5.96s/it]  



 72%|███████▏  | 1442/2000 [2:06:55<58:10,  6.25s/it]



 74%|███████▍  | 1482/2000 [2:10:40<48:54,  5.67s/it]  



 76%|███████▌  | 1522/2000 [2:13:55<38:49,  4.87s/it]



 78%|███████▊  | 1557/2000 [2:17:02<44:44,  6.06s/it]



 81%|████████  | 1623/2000 [2:22:57<35:45,  5.69s/it]



 81%|████████▏ | 1629/2000 [2:23:24<29:34,  4.78s/it]



 82%|████████▏ | 1649/2000 [2:25:20<34:49,  5.95s/it]



 83%|████████▎ | 1654/2000 [2:25:45<30:09,  5.23s/it]



 85%|████████▌ | 1700/2000 [2:29:54<24:46,  4.96s/it]



 86%|████████▌ | 1713/2000 [2:31:12<37:00,  7.74s/it]



 86%|████████▌ | 1715/2000 [2:31:24<33:16,  7.01s/it]



 90%|█████████ | 1800/2000 [2:39:20<18:17,  5.49s/it]



 95%|█████████▌| 1905/2000 [2:48:54<08:35,  5.43s/it]



100%|█████████▉| 1992/2000 [2:56:38<00:39,  4.98s/it]



100%|██████████| 2000/2000 [2:57:30<00:00,  5.33s/it]


Processing chunk 2 (2000:4000)


  4%|▍         | 80/2000 [07:02<3:01:00,  5.66s/it]



  6%|▌         | 117/2000 [10:20<2:39:52,  5.09s/it]



  6%|▋         | 128/2000 [11:30<2:53:47,  5.57s/it]



  8%|▊         | 159/2000 [14:18<2:30:44,  4.91s/it]



 10%|▉         | 192/2000 [17:16<2:49:16,  5.62s/it]



 15%|█▌        | 300/2000 [27:04<2:39:43,  5.64s/it]



 15%|█▌        | 301/2000 [27:10<2:47:48,  5.93s/it]



 27%|██▋       | 545/2000 [49:22<2:04:47,  5.15s/it]



 28%|██▊       | 563/2000 [51:05<2:37:27,  6.57s/it]



 29%|██▉       | 585/2000 [53:00<2:11:12,  5.56s/it]



 30%|███       | 602/2000 [54:29<1:58:08,  5.07s/it]



 30%|███       | 606/2000 [54:52<2:08:42,  5.54s/it]



 33%|███▎      | 657/2000 [59:47<1:54:53,  5.13s/it]



 33%|███▎      | 662/2000 [1:00:15<2:09:19,  5.80s/it]



 34%|███▍      | 680/2000 [1:01:57<2:12:59,  6.04s/it]



 34%|███▍      | 690/2000 [1:02:48<1:53:13,  5.19s/it]



 40%|███▉      | 791/2000 [1:11:49<1:49:56,  5.46s/it]



 42%|████▏     | 845/2000 [1:16:33<1:48:25,  5.63s/it]



 44%|████▍     | 876/2000 [1:19:29<1:50:46,  5.91s/it]



 44%|████▍     | 884/2000 [1:20:11<1:39:53,  5.37s/it]



 46%|████▌     | 914/2000 [1:23:00<1:38:30,  5.44s/it]



 46%|████▋     | 929/2000 [1:24:18<1:35:57,  5.38s/it]



 47%|████▋     | 948/2000 [1:26:09<2:29:27,  8.52s/it]



 48%|████▊     | 970/2000 [1:28:14<1:49:38,  6.39s/it]



 50%|████▉     | 994/2000 [1:30:29<1:46:41,  6.36s/it]



 50%|█████     | 1005/2000 [1:31:28<1:31:58,  5.55s/it]



 51%|█████     | 1013/2000 [1:32:10<1:32:22,  5.62s/it]



 54%|█████▍    | 1082/2000 [1:38:22<1:33:35,  6.12s/it]



 58%|█████▊    | 1151/2000 [1:44:36<1:21:05,  5.73s/it]



 59%|█████▉    | 1180/2000 [1:47:24<1:17:02,  5.64s/it]



 60%|█████▉    | 1195/2000 [1:48:44<1:13:34,  5.48s/it]



 67%|██████▋   | 1345/2000 [2:01:47<54:18,  4.98s/it]  



 73%|███████▎  | 1457/2000 [2:11:49<53:48,  5.94s/it]  



 73%|███████▎  | 1464/2000 [2:12:28<53:45,  6.02s/it]



 74%|███████▎  | 1471/2000 [2:13:03<47:45,  5.42s/it]



 74%|███████▎  | 1473/2000 [2:13:14<48:51,  5.56s/it]



 75%|███████▍  | 1495/2000 [2:15:12<45:34,  5.41s/it]



 82%|████████▏ | 1641/2000 [2:28:31<34:05,  5.70s/it]  



 84%|████████▍ | 1686/2000 [2:32:37<30:11,  5.77s/it]



 84%|████████▍ | 1689/2000 [2:32:57<32:16,  6.23s/it]



 86%|████████▌ | 1718/2000 [2:35:33<28:09,  5.99s/it]



 93%|█████████▎| 1858/2000 [2:48:09<13:02,  5.51s/it]



 94%|█████████▎| 1873/2000 [2:49:28<11:38,  5.50s/it]



 94%|█████████▍| 1888/2000 [2:50:51<10:05,  5.41s/it]



 96%|█████████▋| 1927/2000 [2:54:09<06:19,  5.20s/it]



 97%|█████████▋| 1946/2000 [2:55:49<04:50,  5.38s/it]



 98%|█████████▊| 1953/2000 [2:56:22<03:37,  4.64s/it]



100%|██████████| 2000/2000 [3:00:30<00:00,  5.42s/it]


Processing chunk 3 (4000:6000)


  8%|▊         | 152/2000 [13:14<2:50:02,  5.52s/it]



  8%|▊         | 166/2000 [14:32<3:03:56,  6.02s/it]



  9%|▉         | 177/2000 [15:35<3:08:23,  6.20s/it]



 11%|█         | 215/2000 [19:11<3:11:43,  6.44s/it]



 11%|█▏        | 226/2000 [20:04<2:28:46,  5.03s/it]



 12%|█▏        | 248/2000 [21:56<2:40:32,  5.50s/it]



 14%|█▍        | 285/2000 [25:24<2:41:13,  5.64s/it]



 15%|█▌        | 305/2000 [27:07<2:36:01,  5.52s/it]



 17%|█▋        | 337/2000 [30:02<3:01:23,  6.54s/it]



 18%|█▊        | 352/2000 [31:28<2:39:47,  5.82s/it]



 22%|██▏       | 437/2000 [38:59<2:23:04,  5.49s/it]



 22%|██▏       | 444/2000 [39:40<2:39:51,  6.16s/it]



 22%|██▏       | 449/2000 [40:08<2:33:31,  5.94s/it]



 23%|██▎       | 460/2000 [41:17<2:27:19,  5.74s/it]



 25%|██▍       | 499/2000 [44:48<2:36:30,  6.26s/it]



 30%|██▉       | 596/2000 [53:04<2:07:33,  5.45s/it]



 31%|███       | 614/2000 [54:47<2:07:06,  5.50s/it]



 34%|███▍      | 681/2000 [1:00:53<2:05:39,  5.72s/it]



 38%|███▊      | 754/2000 [1:07:16<2:11:49,  6.35s/it]



 38%|███▊      | 769/2000 [1:08:35<2:00:49,  5.89s/it]



 39%|███▉      | 783/2000 [1:09:49<1:48:37,  5.36s/it]



 40%|████      | 800/2000 [1:11:18<1:50:01,  5.50s/it]



 41%|████      | 821/2000 [1:13:05<1:47:38,  5.48s/it]



 42%|████▏     | 835/2000 [1:14:19<1:38:36,  5.08s/it]



 44%|████▎     | 872/2000 [1:17:25<1:39:50,  5.31s/it]



 46%|████▌     | 921/2000 [1:21:56<1:47:59,  6.01s/it]



 48%|████▊     | 970/2000 [1:26:08<1:34:08,  5.48s/it]



 49%|████▉     | 979/2000 [1:26:53<1:21:38,  4.80s/it]



 49%|████▉     | 983/2000 [1:27:14<1:27:02,  5.14s/it]



 49%|████▉     | 987/2000 [1:27:36<1:34:09,  5.58s/it]



 56%|█████▌    | 1122/2000 [1:39:38<1:39:32,  6.80s/it]



 59%|█████▉    | 1186/2000 [1:45:28<1:45:43,  7.79s/it]



 60%|██████    | 1200/2000 [1:46:43<1:17:04,  5.78s/it]



 62%|██████▏   | 1231/2000 [1:49:29<1:14:50,  5.84s/it]



 64%|██████▍   | 1283/2000 [1:54:00<1:00:35,  5.07s/it]



 66%|██████▌   | 1314/2000 [1:57:01<1:11:00,  6.21s/it]



 66%|██████▌   | 1319/2000 [1:57:28<1:05:28,  5.77s/it]



 67%|██████▋   | 1344/2000 [1:59:45<1:13:43,  6.74s/it]



 68%|██████▊   | 1359/2000 [2:01:05<57:48,  5.41s/it]  



 69%|██████▉   | 1381/2000 [2:03:14<1:13:34,  7.13s/it]



 69%|██████▉   | 1387/2000 [2:03:50<1:04:18,  6.29s/it]



 77%|███████▋  | 1541/2000 [2:17:47<45:23,  5.93s/it]  



 77%|███████▋  | 1547/2000 [2:18:19<43:52,  5.81s/it]



 78%|███████▊  | 1568/2000 [2:20:23<41:56,  5.83s/it]



 79%|███████▉  | 1582/2000 [2:21:46<42:17,  6.07s/it]



 83%|████████▎ | 1657/2000 [2:28:18<32:12,  5.64s/it]



 85%|████████▌ | 1700/2000 [2:32:08<29:50,  5.97s/it]



 85%|████████▌ | 1701/2000 [2:32:16<32:57,  6.61s/it]



 88%|████████▊ | 1763/2000 [2:37:47<22:45,  5.76s/it]



 88%|████████▊ | 1768/2000 [2:38:15<21:02,  5.44s/it]



 90%|█████████ | 1803/2000 [2:41:30<18:32,  5.65s/it]



 90%|█████████ | 1810/2000 [2:42:12<18:24,  5.81s/it]



 94%|█████████▍| 1875/2000 [2:48:02<11:24,  5.47s/it]



 95%|█████████▍| 1896/2000 [2:49:50<09:05,  5.25s/it]



 96%|█████████▌| 1920/2000 [2:52:00<07:52,  5.91s/it]



 98%|█████████▊| 1955/2000 [2:55:22<04:25,  5.89s/it]



 98%|█████████▊| 1960/2000 [2:55:56<04:53,  7.35s/it]



 99%|█████████▉| 1976/2000 [2:57:24<02:17,  5.72s/it]



100%|██████████| 2000/2000 [2:59:47<00:00,  5.39s/it]


Processing chunk 4 (6000:8000)


  1%|          | 19/2000 [01:39<3:31:41,  6.41s/it]



  2%|▏         | 48/2000 [04:19<2:57:11,  5.45s/it]



  3%|▎         | 63/2000 [05:39<3:00:23,  5.59s/it]



  6%|▌         | 110/2000 [10:00<3:07:27,  5.95s/it]



  8%|▊         | 156/2000 [14:13<3:01:23,  5.90s/it]



  9%|▉         | 176/2000 [15:58<2:28:09,  4.87s/it]



  9%|▉         | 179/2000 [16:19<3:23:12,  6.70s/it]



 11%|█▏        | 225/2000 [20:38<3:12:15,  6.50s/it]



 16%|█▌        | 312/2000 [28:37<2:37:37,  5.60s/it]



 18%|█▊        | 363/2000 [33:24<2:36:55,  5.75s/it]



 20%|██        | 407/2000 [37:22<2:12:56,  5.01s/it]

In [None]:
for i in range(20):
    print(i, chunk_cliff.iloc[i]["llama_cliff_4bit_2502_ordered"])

##### Import of cliff dfs

In [4]:
output_folder = "llama_cliffs"

# Initialize lists to store loaded data
all_cliff_dfs = []  # To store DataFrames
all_error_indices = []  # To store error indices

# Get all file names in the output folder
for i in range(9):
    # Load DataFrame and append to the list
    cliff_file = os.path.join(output_folder, f'out_meta_chunk_{i+1}_cliff_2502.csv')
    error_file = os.path.join(output_folder, f'idx_error_chunk_{i+1}_cliff_2502.pkl')

    if os.path.exists(cliff_file):
        all_cliff_dfs.append(pd.read_csv(cliff_file))
    if os.path.exists(error_file):
        all_error_indices.append(pd.read_pickle(error_file))

In [5]:
# Merge all the DataFrames and lists
llama_cliff = pd.concat(all_cliff_dfs, ignore_index=True)  # Combine DataFrames
llama_error_indices =  pd.concat(all_error_indices, ignore_index=True) # Flatten error indices

In [6]:
len(llama_cliff)
len(llama_error_indices)

298

In [7]:
llama_error_indices[llama_error_indices["error"] == "no_json"]

Unnamed: 0,idx,error,column
0,70,no_json,ordered
1,111,no_json,ordered
3,291,no_json,ordered
4,334,no_json,ordered
8,471,no_json,ordered
...,...,...,...
285,12618,no_json,ordered
286,12628,no_json,ordered
287,12814,no_json,ordered
288,12871,no_json,ordered


In [39]:
llama_cliff["llama_cliff_4bit_2502_ordered"][13757]

'{"topic": "9", "subtopic": "A", "justification": "The text focuses on the Medicare Physician-Payment Equity Act, which aims to address geographic disparities in physician payment, particularly in rural areas, and improve access to healthcare in these regions."}'

In [41]:
# Save the merged results
llama_cliff.to_csv('llama_cliff_merged_2502.csv', index=False)
llama_error_indices.to_csv('llama_errors_merged_2502.csv', index=False)

In [9]:
llama_cliff.tail()

Unnamed: 0,id_,author,year,date,year_n,source,dataset,text,tokens_R,llama_cliff_4bit_2502_ordered
14453,CREC-2024-12-11-pt1-PgS6963-4-1,Mr. SANDERS,2024,2024-12-11,45,Independent,Congress,"Mr. President, as the holiday season approache...",president holiday season approach appropriate ...,"{""topic"": ""7"", ""subtopic"": ""A"", ""justification..."
14454,CREC-2024-12-12-pt1-PgH7130-2,Mr. BOWMAN,2024,2024-12-12,45,Democrat,Congress,"Mr. Speaker, I thank my sister Cori Bush for y...",speaker thank sister cori bush yield speaker w...,"{""topic"": ""7"", ""subtopic"": ""A"", ""justification..."
14455,CREC-2024-12-12-pt1-PgH7130-6,Ms. BUSH,2024,2024-12-12,45,Democrat,Congress,"Mr. Speaker, Congresswoman Ramirez came into C...",speaker congresswoman ramirez come congress li...,"{""topic"": ""1"", ""subtopic"": ""B"", ""justification..."
14456,CREC-2024-12-16-pt1-PgH7151-10,Mr. STEIL,2024,2024-12-16,45,Republican,Congress,"Mr. Speaker, I yield myself such time as I may...",speaker yield time consume speaker measure hal...,"{""topic"": ""6"", ""subtopic"": ""A"", ""justification..."
14457,CREC-2024-12-20-pt1-PgS7321-0,Mr. SANDERS,2024,2024-12-20,45,Independent,Congress,"Mr. President, I rise today to honor Hugh Espe...",president rise today honor hugh espey iowa cit...,"{""topic"": ""7"", ""subtopic"": ""A"", ""justification..."
