In [1]:
def load_files_into_string(file_list, encoding="utf-8", separator=""):
    """
    Load text files from a list into a single string.

    Files are read in the order given. A file that cannot be opened or read
    is reported on stdout and skipped, so one missing file does not abort
    the whole load.

    Args:
        file_list (list): List of file paths.
        encoding (str): Text encoding used to decode every file. Defaults to
            UTF-8 so the result does not depend on the platform locale.
        separator (str): String inserted between the contents of consecutive
            readable files. Defaults to the empty string (plain
            concatenation). Pass a newline for line-oriented data such as
            JSONL when a file may not end with a trailing newline; otherwise
            the last record of one file is glued to the first of the next.

    Returns:
        str: Contents of all readable files joined by ``separator``.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    chunks = []
    for file_path in file_list:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                chunks.append(file.read())
        except OSError:
            # Best effort: report the unreadable file and keep going.
            print("Error: Unable to read file -", file_path)
    return separator.join(chunks)


In [3]:
# Raw training-question files, concatenated in the order listed here.
file_list = [
    "train_questions.txt",
    "train_questions_1.txt",
    "train_questions_2.txt",
    "train_questions_3.txt",
]
all_text = load_files_into_string(file_list)

In [10]:
# Drop leading/trailing whitespace, then break the text into individual lines.
trimmed_text = all_text.strip()
data = trimmed_text.split('\n')

In [66]:
import json

def clean_and_load_to_list(string_data):
    """
    Parse newline-delimited JSON objects into input/output training pairs.

    Every non-blank line of ``string_data`` is parsed as one JSON document.
    Lines that are not valid JSON, or that parse to something other than a
    JSON object, are reported on stdout and skipped so a few bad records do
    not abort the whole conversion.

    Args:
        string_data (str): String containing one JSON object per line. The
            keys read from each object are 'question', 'table' and 'answer';
            a missing key is replaced by the placeholder 'N/A'.

    Returns:
        list: One dictionary per valid record, with the keys "input_text"
            (question followed by the table) and "output_text" (the answer).
    """
    cleaned_list = []
    for raw_line in string_data.split('\n'):
        # Strip per line so stray '\r' or padding never reaches the parser,
        # and blank lines are ignored instead of reported as decode errors.
        json_str = raw_line.strip()
        if not json_str:
            continue

        try:
            json_data = json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}. Skipping the current entry.")
            continue

        # A line such as `42` or `[1, 2]` is valid JSON but has no `.get`;
        # skip it rather than crashing with AttributeError.
        if not isinstance(json_data, dict):
            print(
                f"Error: expected a JSON object but got "
                f"{type(json_data).__name__}. Skipping the current entry."
            )
            continue

        question = json_data.get('question', 'N/A')
        table = json_data.get('table', 'N/A')
        answer = json_data.get('answer', 'N/A')

        cleaned_list.append({
            "input_text": f'Question: {question} Here is the Data : {table}',
            "output_text": f'{answer}',
        })
    return cleaned_list

# Convert the concatenated raw text into the list of training records.
cleaned_data = clean_and_load_to_list(string_data=all_text)


Error decoding JSON: Expecting property name enclosed in double quotes: line 1 column 134 (char 133). Skipping the current entry.
Error decoding JSON: Extra data: line 1 column 8 (char 7). Skipping the current entry.
Error decoding JSON: Extra data: line 1 column 9 (char 8). Skipping the current entry.
Error decoding JSON: Extra data: line 1 column 9 (char 8). Skipping the current entry.


In [68]:
# Inspect the first cleaned record (keys: input_text / output_text).
cleaned_data[0]

{'input_text': 'Question: How are long-lived assets assessed for recoverability according to the provided financial data? Here is the Data : | Fiscal Year | Research and Development Costs | Advertising Costs | Provision for Doubtful Accounts |\n|---|---|---|---|\n| 2021 | $1,118,320 | $171,883 | $6,199 |\n| 2020 | $870,611 | $57,658 | $147 |\n| 2019 | $799,734 | $85,521 | $974 |',
 'output_text': 'Long-lived assets or groups of assets are assessed based on a comparison of the carrying amount to the estimated future net cash flows. If estimated future undiscounted net cash flows are less than the carrying amount, the asset is considered impaired and a loss is recorded. Intangible assets with finite lives are generally amortized using the straight-line method over their estimated economic useful lives.'}

In [48]:
# Preview the first two cleaned records.
# NOTE(review): the saved output of this cell shows a single 'text' key, and its
# execution count (48) precedes the cell that builds `cleaned_data` (66), so it
# looks stale -- re-run after rebuilding `cleaned_data` to confirm.
cleaned_data[:2]

[{'text': '### Human: How are long-lived assets assessed for recoverability according to the provided financial data? Here is the Data : | Fiscal Year | Research and Development Costs | Advertising Costs | Provision for Doubtful Accounts |\n|---|---|---|---|\n| 2021 | $1,118,320 | $171,883 | $6,199 |\n| 2020 | $870,611 | $57,658 | $147 |\n| 2019 | $799,734 | $85,521 | $974 | ### Assistant: Long-lived assets or groups of assets are assessed based on a comparison of the carrying amount to the estimated future net cash flows. If estimated future undiscounted net cash flows are less than the carrying amount, the asset is considered impaired and a loss is recorded. Intangible assets with finite lives are generally amortized using the straight-line method over their estimated economic useful lives.'},
 {'text': '### Human: What is the question you would ask regarding the provided financial data? Here is the Data : | Three Months Ended | North America Revenue (USD) | Europe Revenue (USD) | Gr

In [50]:
# Serialize the records as JSON Lines: one JSON document per line.
file_path = "data.jsonl"
with open(file_path, 'w') as jsonl_out:
    jsonl_out.writelines(json.dumps(record) + '\n' for record in cleaned_data)

In [51]:
# Copy the local data.jsonl into the training_data/ prefix of the GCS bucket.
!gsutil cp data.jsonl gs://19865_finetuned_models/training_data/

Copying file://data.jsonl [Content-Type=application/octet-stream]...
/ [1 files][ 95.1 KiB/ 95.1 KiB]                                                
Operation completed over 1 objects/95.1 KiB.                                     


In [52]:
from datasets import Dataset
# Build a Hugging Face `Dataset` from the in-memory list of record dicts.
dataset = Dataset.from_list(cleaned_data)

In [57]:
# Publish the dataset to the Hugging Face Hub.
# NOTE(review): the statement was duplicated on one line (a SyntaxError); a
# single call is kept. Later cells load "ariji1/acn-finetuning" -- confirm that
# "stevhliu/processed_demo" is really the intended target repository.
dataset.push_to_hub("stevhliu/processed_demo")


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/opt/conda/bin/huggingface-cl

In [61]:
from datasets import load_dataset

# Load the 'ariji1/acn-finetuning' dataset; no `split=` is given here.
# NOTE: this rebinds `data`, which an earlier cell assigned to a list of text lines.
data = load_dataset('ariji1/acn-finetuning')

In [62]:
# Display the loaded object to check its splits, features and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 154
    })
})

In [60]:
# Load 'timdettmers/openassistant-guanaco' and display it; the result is not assigned to a variable.
load_dataset('timdettmers/openassistant-guanaco')

Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9846
    })
    test: Dataset({
        features: ['text'],
        num_rows: 518
    })
})

In [44]:
# Uploaded training data location: gs://19865_finetuned_models/training_data/data.jsonl

In [63]:
import json

def read_jsonl(file_path):
    """
    Read a JSONL file and return its records as a list.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising a decode error.

    Args:
        file_path (str): Path to the JSONL file.

    Returns:
        list: Parsed JSON values in file order (dictionaries when every
            line holds a JSON object).

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            if not record:
                continue
            data.append(json.loads(record))
    return data

# Read the JSONL file back and print its first two records as a round-trip check.
file_path = "data.jsonl"
jsonl_data = read_jsonl(file_path=file_path)
first_two_records = jsonl_data[:2]
print(first_two_records)


[{'text': '### Human: How are long-lived assets assessed for recoverability according to the provided financial data? Here is the Data : | Fiscal Year | Research and Development Costs | Advertising Costs | Provision for Doubtful Accounts |\n|---|---|---|---|\n| 2021 | $1,118,320 | $171,883 | $6,199 |\n| 2020 | $870,611 | $57,658 | $147 |\n| 2019 | $799,734 | $85,521 | $974 | ### Assistant: Long-lived assets or groups of assets are assessed based on a comparison of the carrying amount to the estimated future net cash flows. If estimated future undiscounted net cash flows are less than the carrying amount, the asset is considered impaired and a loss is recorded. Intangible assets with finite lives are generally amortized using the straight-line method over their estimated economic useful lives.'}, {'text': '### Human: What is the question you would ask regarding the provided financial data? Here is the Data : | Three Months Ended | North America Revenue (USD) | Europe Revenue (USD) | Gro

In [64]:
# Load only the "train" split of 'ariji1/acn-finetuning'.
# NOTE: this rebinds `dataset`, which an earlier cell built with Dataset.from_list.
dataset = load_dataset("ariji1/acn-finetuning", split="train")

In [65]:
# Display the loaded split to check its features and row count.
dataset

Dataset({
    features: ['text'],
    num_rows: 154
})