In [None]:
#Installation
!CT_CUBLAS=1 pip install ctransformers --no-binary ctransformers

Collecting ctransformers
  Downloading ctransformers-0.2.27.tar.gz (376 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.1/376.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: ctransformers
  Building wheel for ctransformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for ctransformers: filename=ctransformers-0.2.27-cp310-cp310-linux_x86_64.whl size=2336849 sha256=e2f4328b841d8f8b40099dfae38bf6ae2d74a3a675b5721caafbad876151c680
  Stored in directory: /root/.cache/pip/wheels/dd/54/e9/32364da8eee84a2b0b412394983c15add18816c507e90f02d8
Successfully built ctransformers
Installing collected packages: ctransformers
Successfully installed ctransformers-0.2.27


In [None]:
#Mount Google Drive at /content/drive; the dataset paths configured below live under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Paths setup
repopath = '/content/drive/MyDrive/Colab Notebooks/FM_Final_Proj_Code_Repo/'
datapath = repopath + 'Generated_Datasets/'


#Dataset params
runSST2 = False #Set to false to run CARER (for our results here, we only use CARER)

#Model params
repetition_penalty = 4
temp = 0.5
top_p = 0.5

#Per-dataset settings. Both branches define the same set of names:
#  filename                - CSV (inside datapath) holding the unfiltered generated data
#  label_to_label_name     - label id -> human readable name (used when reporting counts)
#  label_to_prompt_label   - label id -> wording substituted for [label] in the prompt
#  labels_to_filter        - only entries with these labels are checked by the LLM
#  label_to_similar_labels - label id -> labels it is easily confused with
#  label_to_examples       - label id -> example texts (the first one is used in the prompt)
#  prompt_setup            - few-shot yes/no prompt with [positive text], [negative text],
#                            [target text] and [label] placeholders
if runSST2:
  filename = "sst2GenUnfiltered.csv"

  #Data params
  n_samples = 2000 #Split evenly among classes
  label_to_label_name = {'0': 'positive', '1': 'negative'}
  label_to_prompt_label = {'0': 'positive sentiment', '1': 'negative sentiment'}
  #Set literal: set('0','1') raises TypeError because set() accepts at most one argument
  labels_to_filter = {'0','1'}
  label_to_similar_labels = {'0': ['1'], '1': ['0']}
  label_to_examples = {'0': ['This is a good movie with stunning visuals.', 'Decently engaging plot that puts a smile on your face.'], '1': ['This was a sub-par movie with so-so acting.', 'Slow pacing and dry.']}
  prompt_setup = "Movie Review: [positive text] \nDoes this movie review have [label]? The answer is yes. \n\nMovie Review: [negative text]. \nDoes this movie review have [label]? The answer is no. \n\nMovie Review: [target text]. \nDoes this movie review have [label]? The answer is "
  n_samples_per_batch = 10 #Generate n samples from the same prompt
else:
  filename = "carerGenUnfiltered.csv"

  #Data params
  n_samples = 6000
  label_to_label_name = {'0': 'sadness', '1': 'joy', '2': 'love', '3': 'anger'}
  label_to_prompt_label = {'0': 'sadness', '1': 'joy', '2': 'love', '3': 'anger'}
  labels_to_filter = {'0','3'}
  label_to_similar_labels = {'0': ['3'], '3': ['0']}
  label_to_examples = {'0': ['i feel very numb at the moment', 'i am nauseous and dizzy and feel all gloomy or at least not attached to my body anymore'],
                       '1': ['i feel contented small old rich tired and happy', 'i feel like it here are ten of the many sites that keep me entertained on a daily basis'],
                       '2': ['i feel on the verge of tears from weariness i look at your sweet face and cant help but tenderly kiss your cheeks', 'i ate i could feel a gentle tingle throughout almost as if i was feeling the healing taking place at a cellular level'],
                       '3': ['im feeling bitter today my mood has been strange the entire day so i guess its that', 'i know the pain parents feel when an enraged child becomes violent']}
  #A "Write a short emotional tweet expressing [label]: " prompt used to be assigned here and
  #was immediately overwritten; only the yes/no validation prompt below is ever used.
  prompt_setup = "Tweet: [positive text] \nDoes this emotional tweet have [label]? The answer is yes. \n\nTweet: [negative text] \nDoes this emotional tweet have [label]? The answer is no. \n\nTweet: [target text] \nDoes this emotional tweet have [label]? The answer is "
  n_samples_per_batch = 5 #Generate n samples from the same prompt

In [None]:
#Model Setup

from ctransformers import AutoModelForCausalLM

#Model selection
model_id = "TheBloke/Llama-2-7B-chat-GGML" #Was using 13B before

# Config options: https://github.com/marella/ctransformers
#The ACL paper used a 100 token generation length to simulate the short dataset texts;
#this notebook caps generation at 10 new tokens.
#'stream' is True here, but test_text_with_label passes stream=False on every call.
config = dict(
    max_new_tokens=10,
    repetition_penalty=repetition_penalty,
    temperature=temp,
    stream=True,
    top_p=top_p,
    last_n_tokens=1000,
    seed=20,
)

#gpu_layers: 110 for 7b, 130 for 13b. For cpu use, pass lib='avx2' instead.
llm = AutoModelForCausalLM.from_pretrained(model_id, model_type="llama", gpu_layers=110, **config)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

llama-2-7b-chat.ggmlv3.q2_K.bin:   0%|          | 0.00/2.87G [00:00<?, ?B/s]

In [None]:
#Run Validation

import csv
import os.path
import pandas as pd

def test_text_with_label(curr_text, test_label):
  #Returns true if LLM thinks curr_text has test_label
  prompt = prompt_setup.replace('[positive text]',label_to_examples[test_label][0])
  prompt = prompt.replace('[negative text]',label_to_examples[label_to_similar_labels[test_label][0]][0])
  prompt = prompt.replace('[target text]',curr_text)
  prompt = prompt.replace('[label]',label_to_prompt_label[test_label])
  response = llm(prompt, stream=False)
  if response.lower().find("yes") != -1:
    return True
  else:
    return False

#Load the unfiltered generated dataset and let the LLM flag entries whose label is ambiguous
df = pd.read_csv(datapath + filename)
if 'LabelName' in df.columns:
  df = df.drop(columns=['LabelName'])
df['RemoveLabel'] = False #Set to true if label should be removed (if label can belong to 2+ classes based on LLM predictions)
df['LabelReplaced'] = False #Set to true if label replacement occurred for this entry

checkpoint_path = datapath + filename.replace('Unfiltered','FilteredCheckpoint')

#NOTE: an unconditional `break` used to be the first statement of this loop. It skipped the
#filtering entirely and the final to_csv then overwrote the checkpoint with all-False flags.
for i in range(len(df)):
  if i % 100 == 0:
    #Periodic checkpoint so partial progress is on disk if the runtime disconnects
    df.to_csv(checkpoint_path, index=False)
  raw_label, curr_text = df.loc[i, "Label"], df.loc[i, "Text"]
  curr_label = str(raw_label)
  if curr_label not in labels_to_filter:
    continue
  #Check whether it matches another (similar) label
  other_label_matches = [other_label for other_label in label_to_similar_labels[curr_label]
                         if test_text_with_label(curr_text, other_label)]
  if len(other_label_matches) == 0:
    #Doesn't conflict with another similar label
    continue
  if len(other_label_matches) > 1:
    #Conflicts with >1 label --> remove this entry
    df.loc[i, "RemoveLabel"] = True
  elif test_text_with_label(curr_text, curr_label):
    #Text matches own label and another label --> remove this entry
    df.loc[i, "RemoveLabel"] = True
  else:
    #Text matches another label and doesn't match own entry --> label replacement
    df.loc[i, "LabelReplaced"] = True
    #Label ids are strings in the config dicts; convert back to the type of the value
    #being replaced so the Label column does not end up with mixed types
    df.loc[i, "Label"] = type(raw_label)(other_label_matches[0])
df.to_csv(checkpoint_path, index=False)

In [None]:
#Post-processing + save info

import csv
import os.path
import pandas as pd

#Create our filtered dataframe from the checkpoint
checkpoint_path = datapath + filename.replace('Unfiltered','FilteredCheckpoint')
df = pd.read_csv(checkpoint_path)
if 'LabelName' in df.columns:
  df = df.drop(columns=['LabelName'])
df.to_csv(checkpoint_path, index=False)

#Split into removed / replaced / kept entries
removed_labels_df = df[df["RemoveLabel"] == True]
replaced_labels_df = df[df["LabelReplaced"] == True]
df = df[df["RemoveLabel"] == False]

#Report per-class counts (value_counts is computed once and reused below)
removed_counts = removed_labels_df['Label'].value_counts()
replaced_counts = replaced_labels_df['Label'].value_counts()
for label in label_to_label_name.keys():
  if int(label) in removed_counts:
    print(f"removed {removed_counts[int(label)].item()} entries from {label_to_label_name[label]}")
  if int(label) in replaced_counts:
    print(f"replaced {replaced_counts[int(label)].item()} entries as {label_to_label_name[label]}")

#Save the removed entries and the filtered dataset without the bookkeeping columns
df = df.drop(columns=['RemoveLabel','LabelReplaced'])
removed_labels_df = removed_labels_df.drop(columns=['RemoveLabel','LabelReplaced'])
removed_labels_df.to_csv(datapath + filename.replace('Unfiltered','Removed'), index=False)
df.to_csv(datapath + filename.replace('Unfiltered','Filtered'), index=False)

#Balance classes by repopulating removed datapoints with a random sample from the existing data points
resampled_frames = []
for label in label_to_label_name.keys():
  if int(label) not in removed_counts:
    continue
  n_to_add = removed_counts[int(label)].item()
  pool = df[df["Label"] == int(label)]
  if pool.empty:
    #Every entry of this class was removed, so there is nothing left to sample from
    print(f"cannot rebalance {label_to_label_name[label]}: no entries left after filtering")
    continue
  resampled_frames.append(pool.sample(n = n_to_add, replace=True))
#DataFrame.append was removed in pandas 2.0; concatenate all resampled rows in one call instead
df = pd.concat([df] + resampled_frames, ignore_index=True)
df = df.sort_values(by=['Label'])
df.to_csv(datapath + filename.replace('Unfiltered','FilteredBalanced'), index=False)

removed 560 entries from sadness
replaced 30 entries as sadness
removed 574 entries from anger
replaced 18 entries as anger


  df = df.append(df_to_add, ignore_index=True)
  df = df.append(df_to_add, ignore_index=True)
