In [1]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot
from transformers import AutoTokenizer
import scipy

In [4]:
def read_text_file(filename):
    with open(filename, 'r', encoding='utf-8-sig') as file:
        return file.read()

def split_into_chunks(text, chunk_size=8000):
    words = text.split()
    chunks = []
    current_chunk = []
    word_count = 0
    
    i = 0
    while i < len(words):
        current_chunk.append(words[i])
        word_count += 1
        i += 1
        
        if word_count >= chunk_size:
            # Backtrack to the end of the last sentence
            while i < len(words) and not any(punct in words[i] for punct in ('.', '!', '?')):
                current_chunk.append(words[i])
                word_count += 1
                i += 1

            # Find the beginning of the next sentence
            next_chunk_start = i
            while next_chunk_start < len(words) and any(punct in words[next_chunk_start] for punct in ('.', '!', '?')):
                next_chunk_start += 1
            while next_chunk_start < len(words) and not any(punct in words[next_chunk_start] for punct in ('.', '!', '?')):
                next_chunk_start += 1
            i = next_chunk_start + 1

            # Add the current chunk
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            word_count = 0

    # Add the last chunk if there's any left
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

def write_chunks_to_files(chunks, filename, tokenizer=None, base_filename='chunk'):
    for i, chunk in enumerate(chunks):
        if tokenizer is not None:
            print("n_token", len(tokenizer.encode(chunk)))
        chunk_filename = f"{filename}_{base_filename}_{i+1}.txt"
        with open(chunk_filename, 'w', encoding='utf-8-sig') as file:
            file.write(chunk)

In [6]:
# choose model to be fine-tuned (important for tokenizer and chat template)
model = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "llama3-8b"
tokenizer = AutoTokenizer.from_pretrained(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
all_chunks = []
chunksize=5000
filenames = ["69087.txt", "72578.txt", "72600.txt", "72869.txt", "72958.txt", "72972.txt", "73017.txt", "73042.txt"]
for filename in filenames:
    text = read_text_file(filename)
    chunks = split_into_chunks(text, chunk_size=chunksize)
    all_chunks.extend(chunks)
    write_chunks_to_files(chunks, "chunks/"+filename.split(".")[0], tokenizer=tokenizer, base_filename=f"chunk_{chunksize}")

n_token 6540
n_token 6626
n_token 6763
n_token 6752
n_token 6874
n_token 7068
n_token 6914
n_token 6797
n_token 6924
n_token 6864
n_token 6850
n_token 6729
n_token 6844
n_token 5721
n_token 6412
n_token 6392
n_token 6501
n_token 6396
n_token 6422
n_token 6344
n_token 6211
n_token 6368
n_token 4596
n_token 6143
n_token 6056
n_token 6038
n_token 6083
n_token 5997
n_token 6084
n_token 6169
n_token 5959
n_token 6071
n_token 6205
n_token 6043
n_token 4869
n_token 6615
n_token 6617
n_token 6436
n_token 6648
n_token 6500
n_token 6584
n_token 6414
n_token 6074
n_token 6390
n_token 6211
n_token 6548
n_token 6381
n_token 6151
n_token 6422
n_token 6630
n_token 5907
n_token 6327
n_token 6294
n_token 6345
n_token 6214
n_token 6132
n_token 6352
n_token 6099
n_token 6218
n_token 2784
n_token 6481
n_token 6655
n_token 6503
n_token 6532
n_token 6663
n_token 6719
n_token 6624
n_token 6584
n_token 6559
n_token 6547
n_token 6653
n_token 6688
n_token 6639
n_token 6775
n_token 6578
n_token 6546
n_token 2580

In [8]:
# create a csv file with strings formatted for a particular model.

In [10]:
book_titles = {"69087":"The Murder of Roger Ackroyd",
               "72578":"Tom Swift and his talking pictures : or, The greatest invention on record",
               "72600":"The trumpeter of Krakow : A tale of the fifteenth century",
               "72869":"Meet the Tiger",
               "72958": "Hunting for hidden gold",
               "73042": "The well of loneliness",
               "72972": "Money for nothing",
              "73017": "Pomona; or, the future of English" }

use_system_prompt=True
system_prompt = "You are a helpful, respectful and honest assistant."
user_prompt = "I need you to thoroughly read and comprehend this extract from the book <booktitle>. The passage is as follows: <excerpt>"

def create_sample(book_title, excerpt, user_prompt, system_prompt, use_system_prompt, tokenizer, max_model_length):
    user_prompt = user_prompt.replace("<booktitle>",book_title).replace("<excerpt>", excerpt)
    if use_system_prompt:
        messages = [{"role": "system", "content":system_prompt},
                    {"role": "user", "content": user_prompt}]
    else:
        messages = [{"role": "user", "content": user_prompt}]
    full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    assert len(tokenizer.encode(full_prompt)) < max_model_length, "full prompt exceeds requested context length!"
    return full_prompt

all_prompts = []

for book_idx in book_titles.keys():
    # load chunk text
    filepaths = ["chunks/"+ f for f in os.listdir(path="chunks")]
    for fp in filepaths:
        if book_idx in fp:
            with open(fp, "r",encoding="utf-8-sig") as file:
                chunk = file.read()
            book_title = book_titles[book_idx]
            full_prompt = create_sample(book_title, chunk, user_prompt, system_prompt, use_system_prompt, tokenizer, max_model_length=8192)
            all_prompts.append(full_prompt)

In [11]:
df = pd.DataFrame(all_prompts, columns = ["text"])
df.to_csv(f"all_books_finetuning_{model_name}.csv")

In [71]:
with open("openhermes_15500.json","r") as f:
    examples = json.loads(f.read())

In [72]:
examples

[[{'role': 'user',
   'content': 'What is the real message behind the song "American Pie" by Don McLean?'},
  {'role': 'assistant',
   'content': '"American Pie" by Don McLean is a complex song with many interpretations, but the overarching theme is a commentary on the changes in American society and music during the 1960s and 1970s. \n\nThe "day the music died" refers to the tragic plane crash in 1959 that killed rock and roll musicians Buddy Holly, Ritchie Valens, and The Big Bopper. McLean was a paperboy at the time and learned of their deaths when he was delivering newspapers. This event had a profound impact on him and is considered a turning point in the history of rock and roll.\n\nThe song also references other significant events and figures from that era, including the Vietnam War, the Civil Rights Movement, the Beatles, Bob Dylan, and the Rolling Stones. \n\nOverall, "American Pie" is seen as a lament for the loss of innocence and the turbulent times that followed the idealis

# Make summaries csv dataset

Assuming you followed the steps in summaries.txt and created txt files containing the user prompt that contextualize the summaries/reviews. Summaries should be in a subfolder "summaries/"

In [12]:
book_titles = {"69087":"The Murder of Roger Ackroyd",
               "72578":"Tom Swift and his talking pictures : or, The greatest invention on record",
               "72600":"The trumpeter of Krakow : A tale of the fifteenth century",
               "72869":"Meet the Tiger",
               "72958": "Hunting for hidden gold",
               "73042": "The well of loneliness",
               "72972": "Money for nothing",
              "73017": "Pomona; or, the future of English" }

use_system_prompt=True
system_prompt = "You are a helpful, respectful and honest assistant."
#user_prompt = "I need you to thoroughly read and comprehend this extract from the book <booktitle>. The passage is as follows: <excerpt>"

def create_sample(book_title, excerpt, user_prompt, system_prompt, use_system_prompt, tokenizer, max_model_length):
    user_prompt = excerpt #user_prompt.replace("<booktitle>",book_title).replace("<excerpt>", excerpt)
    if use_system_prompt:
        messages = [{"role": "system", "content":system_prompt},
                    {"role": "user", "content": user_prompt}]
    else:
        messages = [{"role": "user", "content": user_prompt}]
    full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    assert len(tokenizer.encode(full_prompt)) < max_model_length, "full prompt exceeds requested context length!"
    return full_prompt

all_prompts = []

for book_idx in book_titles.keys():
    # load chunk text
    filepaths = ["summaries/"+ f for f in os.listdir(path="summaries")]
    for fp in filepaths:
        if book_idx in fp:
            with open(fp, "r",encoding="utf-8-sig") as file:
                chunk = file.read()
            book_title = book_titles[book_idx]
            full_prompt = create_sample(book_title, chunk, user_prompt, system_prompt, use_system_prompt, tokenizer, max_model_length=8192)
            all_prompts.append(full_prompt)

df = pd.DataFrame(all_prompts, columns = ["text"])
df.to_csv(f"all_summaries_finetuning_{model_name}.csv")