In [1]:
import os
from transformers import AutoTokenizer, LlamaForCausalLM
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import glob

load_dotenv()

# MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=os.getenv("HF_TOKEN"))
# model = LlamaForCausalLM.from_pretrained(MODEL_NAME, token=os.getenv("HF_TOKEN"))


True

In [2]:
"""
df: dataframe
path: str, path where df is stored
"""
def to_csv(df, path):
    """Serialize ``df`` as comma-separated text to ``path``.

    Returns whatever ``DataFrame.to_csv`` returns for the given target.
    """
    result = df.to_csv(path_or_buf=path)
    return result

def to_html(df, path):
    """Render ``df`` as an HTML table written to ``path``.

    Returns whatever ``DataFrame.to_html`` returns for the given target.
    """
    result = df.to_html(buf=path)
    return result

def to_tsv(df, path):
    """Serialize ``df`` as tab-separated text to ``path``.

    Returns whatever ``DataFrame.to_csv`` returns for the given target.
    """
    result = df.to_csv(path_or_buf=path, sep='\t')
    return result
    
"""
df_type: data frame type to be returned any of (csv, html, tsv)
task: QA task to be performed any of (arithmetic, item)
row_size: row size of the dataset
col_size: col size of the dataset
file_name: name of file
"""
def generate_dataset(df_type, task, row_size, col_size, file_name):
    """Generate a random table and write it under ``../datasets/tables/``.

    Parameters
    ----------
    df_type : str
        Output format, one of ``'csv'``, ``'html'`` or ``'tsv'``. It is also
        used as the file extension.
    task : str
        ``'arithmetic'`` fills the table with random integers from 1 to 10;
        ``'item'`` fills it with random uppercase ASCII letters.
    row_size, col_size : int
        Number of rows / columns. Rows are labelled ``Row 1..`` and columns
        ``Col 1..``.
    file_name : str
        File name without extension.

    Returns
    -------
    Whatever the selected writer returns.

    Raises
    ------
    ValueError
        If ``task`` is not ``'arithmetic'`` or ``'item'``.
    KeyError
        If ``df_type`` is not a supported format.
    """
    # Reject an unknown task explicitly; previously it fell through both
    # branches and surfaced as an UnboundLocalError on ``df``.
    if task not in ('arithmetic', 'item'):
        raise ValueError(
            "task must be 'arithmetic' or 'item', got {!r}".format(task)
        )

    df_type_dict = {'csv': to_csv, 'html': to_html, 'tsv': to_tsv}
    # Resolve the writer before drawing random data so an unsupported format
    # fails immediately (still a KeyError, as before).
    write = df_type_dict[df_type]

    columns = ['Col ' + str(i + 1) for i in range(col_size)]
    rows = ['Row ' + str(i + 1) for i in range(row_size)]
    shape = (row_size, col_size)
    if task == 'arithmetic':
        values = np.random.randint(1, 11, size=shape)
    else:
        values = np.random.choice(list(string.ascii_uppercase), size=shape)
    df = pd.DataFrame(values, columns=columns, index=rows)

    path = '../datasets/tables/' + file_name + '.' + df_type
    # Create the output directory on first use so a fresh checkout can run
    # the notebook top to bottom.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return write(df, path)

In [3]:
N = [4, 6, 8, 10, 12]

# For every square table size, write one table per task in each file format.
# Files are named "<task>_<n>.<format>".
# NOTE(review): generate_dataset draws fresh random values on every call, so
# the .tsv/.csv/.html files sharing a name do not contain the same table.
# Confirm that this is intended.
for n in N:
    for task in ('arithmetic', 'item'):
        for df_type in ('tsv', 'csv', 'html'):
            generate_dataset(df_type, task, n, n, f'{task}_{n}')

In [4]:
path = '../datasets/tables/*'
# glob returns matches in arbitrary order. Rows and ids are assigned by
# position in this list, so sort it to keep the output reproducible.
file_paths = sorted(glob.glob(path))

"""
A qa.csv file contains all the questions and their corresponding context and answer
question: str
answer: str
context: str (only the name of the table)
id: unique str
task: optional enum ("arithmetic" or "item")
direction: optional enum ("row" or "col")
size: optional tuple[int] 
"""
def get_question():
    """Return the question text for a QA row (placeholder: always empty)."""
    # TODO: produce the actual question text.
    question = ""
    return question

def get_answer():
    """Return the answer text for a QA row (placeholder: always empty)."""
    # TODO: produce the actual answer text.
    answer = ""
    return answer

def get_context(file_path):
    """Return ``file_path`` starting at its first ``'tables'`` occurrence.

    For example ``'../datasets/tables/item_4.csv'`` becomes
    ``'tables/item_4.csv'``. A path that does not contain ``'tables'`` is
    returned unchanged.
    """
    idx = file_path.find('tables')
    if idx == -1:
        # str.find signals "not found" with -1; slicing from -1 would return
        # only the last character of the path.
        return file_path
    return file_path[idx:]

def get_id(i):
    """Build the row id ``"nt-<i>"`` from ``i``."""
    return f"nt-{i!s}"

def get_task(df):
    """Classify a table as ``'arithmetic'`` or ``'item'``.

    Returns ``'arithmetic'`` when every column after the first converts to
    numbers without producing missing values, otherwise ``'item'``. The first
    column is left out of the decision (tables_to_dataset reads the files
    without an index column, so it holds the row labels).
    """
    # TODO
    def _all_numeric(column):
        # Values that cannot be parsed become NaN under errors='coerce'.
        return pd.to_numeric(column, errors='coerce').notnull().all()

    numeric_flags = df.apply(_all_numeric)
    data_flags = numeric_flags.iloc[1:]
    return 'arithmetic' if data_flags.all() else 'item'

def get_direction():
    """Return the direction for a QA row (placeholder: always empty)."""
    # TODO: produce the actual direction value.
    direction = ""
    return direction

def get_size(df):
    """Return the table size as the string of ``(rows, columns - 1)``.

    One column is subtracted from the frame's width before formatting.
    """
    n_rows, n_cols = df.shape
    return str((n_rows, n_cols - 1))

def read_html():
    """Placeholder for reading an HTML table; currently returns ``None``."""
    # TODO: implement HTML table loading.
    return None
    
def tables_to_dataset():
    """Build ``../datasets/self_generated_qa.csv`` from the generated tables.

    One row is written per readable table in ``file_paths`` with the columns
    question, answer, context, id, task, direction and size. The id is derived
    from the file's position in ``file_paths``.

    Only ``.csv`` and ``.tsv`` files can be read. Any other path (HTML parsing
    is still TODO) is skipped with a warning. Previously such a path either
    raised ``UnboundLocalError`` or silently reused the previous file's frame,
    producing a row with the wrong task and size.

    Returns
    -------
    The result of ``DataFrame.to_csv`` for the output path.
    """
    import warnings

    columns = ['question', 'answer', 'context', 'id', 'task', 'direction', 'size']
    records = []
    for i, file_path in enumerate(file_paths):
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith('.tsv'):
            df = pd.read_csv(file_path, sep='\t')
        else:
            # TODO: load .html tables once read_html is implemented.
            warnings.warn(
                f"Skipping {file_path!r}: only .csv and .tsv tables can be "
                "read (HTML parsing is not implemented yet)",
                stacklevel=2,
            )
            continue
        records.append([
            get_question(),
            get_answer(),
            get_context(file_path),
            get_id(i),
            get_task(df),
            get_direction(),
            get_size(df),
        ])

    # Build the frame once instead of growing it row by row.
    dataset_df = pd.DataFrame(records, columns=columns)
    dataset_path = '../datasets/self_generated_qa.csv'
    return dataset_df.to_csv(dataset_path, index=False)

In [5]:
# WIKI table example
# TODO: .examples?
wiki_path = '../datasets/data/*.tsv'
# glob returns matches in arbitrary order; sort so the files are always
# processed in the same sequence.
wiki_file_paths = sorted(glob.glob(wiki_path))

def wiki_tables_to_dataset():
    """Build ``../datasets/wiki_qa.csv`` from the TSV files in ``wiki_file_paths``.

    Each input file must provide the columns ``utterance``, ``targetValue``,
    ``context`` and ``id``. They are copied to ``question``, ``answer``,
    ``context`` and ``id``. The ``task``, ``direction`` and ``size`` columns
    are left empty.

    Rows from all files are concatenated. Previously the columns were
    reassigned on every iteration, so the files were never combined and later
    files were index-aligned against the first one.

    Returns
    -------
    The result of ``DataFrame.to_csv`` for the output path.
    """
    columns = ['question', 'answer', 'context', 'id', 'task', 'direction', 'size']
    source_to_target = {
        'utterance': 'question',
        'targetValue': 'answer',
        'context': 'context',
        'id': 'id',
    }

    frames = []
    for file_path in wiki_file_paths:
        wiki_df = pd.read_csv(file_path, sep='\t', on_bad_lines='skip')
        frames.append(
            wiki_df[list(source_to_target)].rename(columns=source_to_target)
        )

    if frames:
        # reindex adds the columns with no wiki counterpart as empty columns.
        dataset_df = pd.concat(frames, ignore_index=True).reindex(columns=columns)
    else:
        # No input files: still emit a header-only CSV, as before.
        dataset_df = pd.DataFrame(columns=columns)

    dataset_path = '../datasets/wiki_qa.csv'
    return dataset_df.to_csv(dataset_path, index=False)


def merge_tables_and_wiki_tables():
    """Concatenate the self-generated and wiki QA CSVs into ``../datasets/qa.csv``.

    The self-generated rows come first, followed by the wiki rows. The
    combined frame gets a fresh index and is written without it.

    Returns
    -------
    The result of ``DataFrame.to_csv`` for the output path.
    """
    source_paths = ('../datasets/self_generated_qa.csv', "../datasets/wiki_qa.csv")
    frames = [pd.read_csv(source) for source in source_paths]
    combined = pd.concat(frames, ignore_index=True)
    return combined.to_csv('../datasets/qa.csv', index=False)

In [6]:
# Write ../datasets/self_generated_qa.csv from the generated tables.
tables_to_dataset()

In [7]:
# Write ../datasets/wiki_qa.csv from the wiki TSV files.
wiki_tables_to_dataset()

In [8]:
# Combine the two CSVs written by the previous two cells into
# ../datasets/qa.csv; both must have run first.
merge_tables_and_wiki_tables()