In [3]:
import os
from transformers import AutoTokenizer, LlamaForCausalLM
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

# Load environment variables (including HF_TOKEN) through python-dotenv.
load_dotenv()

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

# Look the access token up once and hand the same value to both loaders.
hf_token = os.getenv("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = LlamaForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)


In [4]:
# Bare expression: the cell output is the loaded model's repr (its module tree).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [6]:
"""
Generate the dataset -> CSV / TSV / … etc formats 
Size of the table 
Across various sizes / dims of the table (N x N, N x M etc)
Questions / Answers per tables?
Column operations / Row operations (1 : 1) 
Tasks
Arithmetic
Sum / Min / Max for a row / col
Listing
Just list Row / Col 
Etc 
"""

'\nGenerate the dataset -> CSV / TSV / … etc formats \nSize of the table \nAcross various sizes / dims of the table (N x N, N x M etc)\nQuestions / Answers per tables?\nColumn operations / Row operations (1 : 1) \nTasks\nArithmetic\nSum / Min / Max for a row / col\nListing\nJust list Row / Col \nEtc \n'

In [7]:
"""
df: dataframe
path: str path the store df
"""
def to_csv(df, path):
    """Serialize ``df`` as comma-separated text to ``path``.

    Delegates to ``DataFrame.to_csv`` and returns whatever that call returns.
    """
    return df.to_csv(path_or_buf=path)

def to_html(df, path):
    """Render ``df`` as an HTML table written to ``path``.

    Delegates to ``DataFrame.to_html`` and returns whatever that call returns.
    """
    return df.to_html(buf=path)

def to_tsv(df, path):
    """Serialize ``df`` as tab-separated text to ``path``.

    Uses ``DataFrame.to_csv`` with a tab separator and returns whatever that
    call returns.
    """
    return df.to_csv(path_or_buf=path, sep='\t')
    
"""
df_type: data frame type to be returned any of (csv, html, table, tsv)
task: QA task to be performed any of (arithmetic, item)
row_size: row size of the dataset
col_size: col size of the dataset
file_name: name of file
"""
def generate_dataset(df_type, task, row_size, col_size, file_name):
    columns = ['Col ' + str(i+1) for i in range(col_size)]
    rows = ['Row ' + str(i+1) for i in range(row_size)]
    if task == 'arithmetic': 
        df = pd.DataFrame(np.random.randint(0, 100, size=(row_size, col_size)), columns=columns, index=rows)
    elif task == 'item':
        df = pd.DataFrame(np.random.choice(list(string.ascii_uppercase), size=(row_size, col_size)), columns=columns, index=rows)
    df_type_dict = {'csv': to_csv, 'html': to_html, 'tsv': to_tsv}
    path = './generated_data/' + file_name + '.' + df_type 
    return df_type_dict[df_type](df, path)

In [8]:
# Write a 100 x 100 arithmetic table once per output format.
for fmt in ('tsv', 'csv', 'html'):
    generate_dataset(fmt, 'arithmetic', 100, 100, 'out_arithmetic')

In [9]:
# Write a 100 x 100 item (letter) table once per output format.
for fmt in ('tsv', 'csv', 'html'):
    generate_dataset(fmt, 'item', 100, 100, 'out_item')

In [10]:
# Additional CSV item tables at smaller, non-square sizes.
for n_rows, n_cols, stem in ((50, 100, 'out_item_2'), (10, 5, 'out_item_3')):
    generate_dataset('csv', 'item', n_rows, n_cols, stem)