Step 1: Install and load model

In [None]:
!pip install -q transformers accelerate bitsandbytes einops pandas

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# Hugging Face model identifier; the same id is used for the tokenizer and
# the model weights below.
model_id = "openchat/openchat-3.5-0106"

# Quantization settings handed to from_pretrained: load weights in 4-bit
# using the "nf4" quantization type with double quantization enabled, and
# run computations in float16.
# NOTE(review): presumably chosen so the model fits in Colab GPU memory — confirm.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" leaves device placement to the library rather than
# pinning the model to one device here (relies on `accelerate`, installed above).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto"
)

# Text-generation pipeline built from the already-loaded model and tokenizer;
# Step 5 calls it as `llm(prompt, ...)`.
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)


Step 2: Upload data

In [None]:
# Colab-only helper module; this cell will not run outside Google Colab.
from google.colab import files
# Ask the user for the input file. Step 3 opens "headless_dataset.csv" by
# name, so the uploaded file presumably has to be called exactly that.
# The return value is stored in `uploaded` but is not used by the later steps.
uploaded = files.upload()

Step 3: Load data

In [None]:
import pandas as pd

# Read the CSV with header=None so the first line is treated as data rather
# than as column names; the columns get default integer labels instead.
df = pd.read_csv("headless_dataset.csv", header=None)
# Keep only the first 20 rows: this is the sample passed to
# format_sample_for_prompt() in Step 5, while `df` keeps the full dataset.
sample = df.head(20)


Step 4: Define the prompt-building function

In [None]:
def format_sample_for_prompt(df_sample):
    """Build the LLM prompt asking for column names of a headerless sample.

    Every row of ``df_sample`` is rendered as one line of comma-separated
    ``str()`` values and framed by instructions. The instructions state how
    many names are expected and ask for a numbered list with one name per
    line, because the answer is later parsed line by line for items of the
    form ``"1. Name"`` and compared against the number of columns.

    Args:
        df_sample: pandas DataFrame holding the rows to show to the model.

    Returns:
        The complete prompt as a single string.
    """
    column_count = df_sample.shape[1]

    # Render the sample as table-like text, one comma-separated line per row.
    rows_text = "".join(
        ", ".join(str(val) for val in row) + "\n"
        for _, row in df_sample.iterrows()
    )

    # NOTE: no line of the instructions may itself look like a numbered item
    # ("1. ..."); a parser that scans prompt + answer together would pick it
    # up as a column name. The format is therefore described inline.
    return (
        "You are a data analyst. Infer column names for the dataset below. "
        "The dataset has no headers.\n\n"
        "Here are the first few rows:\n\n"
        f"{rows_text}"
        "\nPlease guess what each column represents and return a list of "
        "column names that corresponds to each column of data.\n\n"
        "Please only provide 1 column name per column "
        f"({column_count} names in total).\n"
        "Format the answer as a numbered list with one name per line, "
        'written as "<number>. <column name>", and output nothing else.'
    )

Step 5: Run the LLM and see if it gets the column names

In [None]:
import re

# One item of a numbered list: optional indentation, a number, "." or ")",
# whitespace, then the name — e.g. "1. Name" or "  2) Email".
_NUMBERED_ITEM = re.compile(r"^\s*\d+[.)]\s+(.*)$")

prompt = format_sample_for_prompt(sample)

generation = llm(
    prompt,
    max_new_tokens=256,
    # temperature only takes effect when sampling is enabled; without
    # do_sample=True the value below is ignored.
    do_sample=True,
    temperature=0.3,
    # Return only the newly generated text. By default the pipeline echoes
    # the prompt in "generated_text", which would feed the instructions and
    # the raw data rows into the parser below.
    return_full_text=False,
)
response = generation[0]["generated_text"]

print("🔎 Guessed column names:\n")
print(response)

# Extract numbered column names like: "1. Name", "2. Email".
# Items whose name is empty are dropped so a blank header is never applied.
column_name_list = []
for line in response.splitlines():
    item = _NUMBERED_ITEM.match(line)
    if item is None:
        continue
    name = item.group(1).strip()
    if name:
        column_name_list.append(name)

print("\n✅ Converted to Python list:")
print(column_name_list)

Step 6: Save the results

In [None]:
# Apply the guessed headers only when there is exactly one name per column.
if len(column_name_list) == df.shape[1]:
    df.columns = column_name_list
    print("✅ Column headers applied successfully.")

    # Export and download only a dataset that actually received headers.
    # Writing the file after a mismatch would produce a "labeled" CSV whose
    # header row is just the placeholder labels left over from reading the
    # file with header=None.
    df.to_csv("labeled_dataset.csv", index=False)

    # Re-imported here so this cell also works when run on its own.
    from google.colab import files
    files.download("labeled_dataset.csv")
else:
    print("⚠️ Column count mismatch. Skipping header assignment.")
    print(
        f"Expected {df.shape[1]} names but got {len(column_name_list)}; "
        "nothing was exported."
    )
