In [None]:
#Installation
#--no-binary makes pip build ctransformers from source instead of using a prebuilt wheel.
#CT_CUBLAS=1 is set in the environment of that build; presumably it turns on the
#cuBLAS (GPU) backend needed for gpu_layers below - TODO confirm against the ctransformers README.
!CT_CUBLAS=1 pip install ctransformers --no-binary ctransformers



In [None]:
#Mount Google Drive at /content/drive; the dataset paths configured below live under this mount point
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Paths setup
repopath = '/content/drive/MyDrive/Colab Notebooks/FM_Final_Proj_Code_Repo/'
datapath = f"{repopath}Generated_Datasets/"


#Dataset params
runSST2 = False #True generates the SST2 dataset, False generates the CARER dataset

#Model params shared by both datasets
repetition_penalty = 4
top_p = 0.9

if runSST2:
  filename = "sst2Gen.csv"

  #Model params
  temp = 1.6

  #Data params
  n_samples = 2000 #Split evenly among classes
  #NOTE(review): GLUE SST-2 encodes 0 as negative and 1 as positive; the ids here are the
  #reverse of that - confirm whatever consumes this file expects 0 = positive.
  label_to_label_name = {'0': 'positive', '1': 'negative'}
  label_to_prompt_label = {key: name + ' sentiment' for key, name in label_to_label_name.items()}
  #Hand-written seed reviews, two per class
  label_to_examples = {
      '0': ['This is a good movie with stunning visuals.',
            'Decently engaging plot that puts a smile on your face.'],
      '1': ['This was a sub-par movie with so-so acting.',
            'Slow pacing and dry.'],
  }
  prompt_setup = "Write a short movie review with [label]. \n Movie Review: "
  n_samples_per_batch = 10 #Generate n samples from the same prompt
else:
  filename = "carerGen.csv"

  #Model params
  temp = 1 #2 and 1.3 appear to be earlier trial values

  #Data params
  n_samples = 4000 #Split evenly among classes
  label_to_label_name = {str(idx): name for idx, name in enumerate(('sadness', 'joy', 'love', 'anger'))}
  #The prompt wording is the class name itself (kept as a separate dict object)
  label_to_prompt_label = dict(label_to_label_name)
  #Seed tweets, two per class
  label_to_examples = {
      '0': ['i feel very numb at the moment',
            'i am nauseous and dizzy and feel all gloomy or at least not attached to my body anymore'],
      '1': ['i feel contented small old rich tired and happy',
            'i feel like it here are ten of the many sites that keep me entertained on a daily basis'],
      '2': ['i feel on the verge of tears from weariness i look at your sweet face and cant help but tenderly kiss your cheeks',
            'i ate i could feel a gentle tingle throughout almost as if i was feeling the healing taking place at a cellular level'],
      '3': ['i know the pain parents feel when an enraged child becomes violent',
            'im feeling bitter today my mood has been strange the entire day so i guess its that'],
  }

  prompt_setup = "Write a short emotional tweet expressing [label]: "
  n_samples_per_batch = 5 #Generate n samples from the same prompt

In [None]:
#Model Setup

from ctransformers import AutoModelForCausalLM

#Model selection
model_id = "TheBloke/Llama-2-7B-chat-GGML" #Was using 13B before

# Config options: https://github.com/marella/ctransformers
#The ACL paper used a 100 token generation length to simulate the short dataset texts
config = dict(
    max_new_tokens=100,
    repetition_penalty=repetition_penalty,
    temperature=temp,
    stream=True,
    top_p=top_p,
    last_n_tokens=1000,
    seed=20,
)

#Load the model with the sampling settings above as its defaults.
#gpu_layers: 110 for 7b, 130 for 13b. For cpu use, pass lib='avx2' instead.
llm = AutoModelForCausalLM.from_pretrained(model_id, model_type="llama", gpu_layers=110, **config)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#CSV Setup

import csv
import os.path

#Only set up once: an existing file is left untouched so earlier generations are kept
if not os.path.isfile(datapath + filename):
  print("Creating new opt file")
  fields = ['Label', 'LabelName', 'Text']
  #Seed the file with the hand-written examples, one row per example, in label order
  data = [[label, label_to_label_name[label], example]
          for label, examples in label_to_examples.items()
          for example in examples]
  #The csv module requires newline='' so that it alone controls the line endings
  with open(datapath + filename, 'w', newline='') as csvfile:
      csvwriter = csv.writer(csvfile)
      csvwriter.writerow(fields)
      csvwriter.writerows(data)

In [None]:
#Generate new samples until our capacity is hit

import random
import pandas as pd
import nltk
from nltk.translate.bleu_score import SmoothingFunction

#Fix the RNG so the few-shot example sampling is repeatable
random.seed(20)

#How many prior examples are placed in front of each prompt
n_few_shot_examples = 2
#Number of classes in the active dataset
n_labels = len(label_to_label_name)

#Characters kept when cleaning generated text: punctuation and space, then the digits 0-9
allowed_chars = ['.', '/', '(', ')', '!', '?', ':', ' ', '#'] + [str(digit) for digit in range(10)]
print(allowed_chars)
#Letters are appended afterwards, interleaved as a, A, b, B, ...
for lower in map(chr, range(ord('a'), ord('z') + 1)):
  allowed_chars.extend((lower, lower.upper()))

def longestCommonSubstr(ipt_strs, target_str):
  """Return the length of the longest contiguous run shared by target_str and any input string.

  ipt_strs: iterable of strings to compare against.
  target_str: the string being checked.
  Returns 0 when ipt_strs is empty or nothing is shared.
  """
  def _common_run(left, right):
    #Dynamic programme over common suffixes, after
    #https://www.geeksforgeeks.org/longest-common-substring-dp-29/
    #prev_row[j] is the length of the common suffix of the previous prefix of
    #left and right[:j]; only that one row is needed to compute the next one.
    best = 0
    prev_row = [0] * (len(right) + 1)
    for left_ch in left:
      row = [0]
      for j, right_ch in enumerate(right, start=1):
        run = prev_row[j - 1] + 1 if left_ch == right_ch else 0
        row.append(run)
        if run > best:
          best = run
      prev_row = row
    return best

  return max((_common_run(candidate, target_str) for candidate in ipt_strs), default=0)


#One smoothing function is enough for every BLEU comparison
bleu_smoothing = SmoothingFunction().method1

for label in label_to_label_name.keys():
  while True:
    #Pull historically generated prompts. The file is re-read on every pass, so a
    #restarted run continues from whatever is already saved.
    #dtype/keep_default_na keep every Text value a string: by default pandas would turn
    #texts such as "NA", "null" or an empty field into NaN, which cannot be put in a prompt.
    curr_df = pd.read_csv(datapath + filename, dtype={'Text': str}, keep_default_na=False)
    curr_label_df = curr_df[curr_df['Label'] == int(label)]
    if len(curr_label_df) >= n_samples / n_labels:
      #We've generated enough data for this class
      break

    #For prompt setup, pull random prior examples for few shot learning.
    #Never request more examples than exist: random.sample raises ValueError otherwise.
    n_shots = min(n_few_shot_examples, len(curr_label_df))
    few_shot_examples = [curr_label_df['Text'].iloc[idx]
                         for idx in random.sample(range(len(curr_label_df)), n_shots)]

    #Setup Prompt: each example is shown as a completed request, then the open request follows
    base_prompt = prompt_setup.replace('[label]', label_to_prompt_label[label])
    prompt = "".join(base_prompt + "\"" + example + "\"\n-----\n" for example in few_shot_examples)
    prompt += base_prompt
    print("PROMPT:")
    print(prompt)

    #Run LLM to get new data
    opt = []
    for _ in range(n_samples_per_batch):
      new_example_uncleaned = llm(prompt, stream=False)
      #Remove special tokens
      new_example = "".join(ch for ch in new_example_uncleaned if ch in allowed_chars)
      if len(new_example.strip()) == 0:
        #Generation failed, or nothing usable survived the cleaning.
        #The check has to come after cleaning: an empty text would be saved as an empty field.
        continue
      #Skip entries that are too similar to prior results of this batch.
      #BLEU is computed on word tokens (a raw string would be scored character by character),
      #and a HIGH score means high overlap, so scores above the threshold are rejected.
      if len(opt) > 0 and (
          longestCommonSubstr(opt, new_example) > 0.25 * len(new_example)
          or nltk.translate.bleu_score.sentence_bleu(
              [prior.split() for prior in opt], new_example.split(), [1/2, 1/2], bleu_smoothing) > 0.3):
        continue
      #Cap stored texts at 200 characters
      opt.append(new_example[:200])
      print("EXAMPLE:")
      print(opt[-1])

    data = [[label, label_to_label_name[label], text] for text in opt]

    #Save new results; newline='' lets the csv module control the line endings
    with open(datapath + filename, 'a', newline='') as csvfile:
      csvwriter = csv.writer(csvfile)
      csvwriter.writerows(data)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-----
Write a short emotional tweet expressing anger: 
EXAMPLE:
Attn Teachers! Whats up education system for neglect of artsmusic subjects.They think u are teaching note worthy topics in this generation rather than these children need attention from schools as it 
EXAMPLE:
 When my ex cheated it was like every betrayal inflicted agony on me! Now he thinks its ok for him to smile in our presence while secretly knowing that their lifes over #NoSadFaceJusForevaInHistory Bo
PROMPT:
Write a short emotional tweet expressing anger: " The only person to talk smack about someone elses accomplishment should have never achieved something substantial. Smak!!!!! Have I ever mentioned they be nothing more than noise."
-----
Write a short emotional tweet expressing anger: " Tears were cried in Apples#NewHampshire .Why could she simply walk on down but then yher boyfriodg came from NOWHERE.. just hugged her soo tightno word and them kept

In [21]:
#Remove seeded entries from dataset.
#The seeds are the first rows of the file (written when the file was created), one row per
#hand-written example. They are dropped only while they are still there, so running this
#cell again does not delete generated rows.
seed_texts = [example for examples in label_to_examples.values() for example in examples]
curr_df = pd.read_csv(datapath + filename)
if curr_df['Text'].head(len(seed_texts)).tolist() == seed_texts:
  #Positional slice starting after the last seed row (row 0 is the first seed, not the header)
  curr_df = curr_df.iloc[len(seed_texts):]
  curr_df.to_csv(datapath + filename, index=False)
else:
  print("Seed entries not found at the top of the file; nothing removed")