Step 1: Install and load model

In [None]:
!pip install -q transformers accelerate bitsandbytes einops pandas

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

model_id = "openchat/openchat-3.5-0106"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto"
)

llm = pipeline("text-generation", model=model, tokenizer=tokenizer)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m117.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Device set to use cuda:0


Step 2: Upload data

In [None]:
# Colab helper module that provides browser upload/download for this runtime.
from google.colab import files
# Prompts for a file upload; the captured output shows the chosen file
# (headless_dataset.csv) being saved under the same name.
uploaded = files.upload()

Saving headless_dataset.csv to headless_dataset.csv


Step 3: Load data

In [None]:
import pandas as pd

# The uploaded CSV has no header row, so pandas must not consume row 0 as names;
# header=None gives the columns default integer labels instead.
df = pd.read_csv("headless_dataset.csv", header=None)

# Leading 20 rows (or fewer if the file is shorter) are what the LLM gets to see.
sample = df.iloc[:20]


Step 4: Define the prompt-building function

In [None]:
def format_sample_for_prompt(df_sample):
    """Build the LLM prompt asking for column names of a headerless sample.

    The sample rows are rendered as CSV with standard quoting, so a value that
    itself contains a comma (e.g. "Engineer, civil") is wrapped in double
    quotes and cannot be mistaken for two columns. The prompt also states the
    number of columns and asks for a numbered list, one name per line.

    Args:
        df_sample: pandas DataFrame holding the rows to show to the model.

    Returns:
        str: the complete prompt text.
    """
    import csv
    import io

    n_cols = df_sample.shape[1]

    # Render the rows with csv.writer so embedded commas/quotes are escaped.
    # itertuples keeps each column's own type (iterrows would turn ints into
    # floats in an all-numeric frame, e.g. "2" -> "2.0").
    rows_buf = io.StringIO()
    writer = csv.writer(rows_buf, lineterminator="\n")
    for row in df_sample.itertuples(index=False, name=None):
        writer.writerow([str(val) for val in row])

    prompt = "You are a data analyst. Infer column names for the dataset below. The dataset has no headers.\n\n"
    prompt += (
        f"Here are the first few rows in CSV format ({n_cols} columns; "
        "values that contain commas are wrapped in double quotes):\n\n"
    )
    prompt += rows_buf.getvalue()

    prompt += "\nPlease guess what each column represents and return a list of column names that corresponds to each column of data.\n\n"
    # Ask explicitly for a numbered list with one name per line so the
    # answer can be parsed line by line.
    prompt += (
        f"\nPlease provide exactly 1 column name per column ({n_cols} names in total), "
        "as a numbered list (1., 2., ...) with one name per line."
    )
    return prompt

Step 5: Run the LLM and see if it gets the column names

In [None]:
import re

prompt = format_sample_for_prompt(sample)

# return_full_text=False: by default the text-generation pipeline returns the
# prompt followed by the completion, which echoes the whole sample back into
# `response`; only the newly generated text is wanted here.
# do_sample=False: a temperature is only honoured when sampling is enabled, so
# the previous temperature=0.3 had no effect. Greedy decoding is made explicit,
# which also keeps the result reproducible.
response = llm(
    prompt,
    max_new_tokens=256,
    do_sample=False,
    return_full_text=False,
)[0]["generated_text"]

print("🔎 Guessed column names:\n")
print(response)

# Extract numbered column names like: "1. Name", "2. Email".
# Also accepts indented items and "1)" numbering, and drops markdown
# emphasis/quotes wrapped around a name (e.g. "1. **Name**").
numbered_item = re.compile(r"^\s*\d+[.)]\s+(.*)$")
column_name_list = []
for line in response.split("\n"):
    item = numbered_item.match(line)
    if item is None:
        continue
    name = item.group(1).strip().strip("*`\"'").strip()
    if name:
        column_name_list.append(name)

print("\n✅ Converted to Python list:")
print(column_name_list)

🔎 Guessed column names:

You are a data analyst. Infer column names for the dataset below. The dataset has no headers.

Here are the first few rows:

Ryan Kennedy, angela87@simpson.com, 551-411-6317, 2/13/1962, Paramedic, 132825.02, 2
Julian Torres, donna35@joseph-duncan.org, 408.002.8737, 2/24/1959, Ship broker, 32707.39, 5
Debra Baker, garrettjames@medina-meyer.biz, 067.631.7464, 8/22/1984, Office manager, 80992.85, 4
Timothy Keith, maryfrench@gmail.com, (712)250-5215, 6/5/1991, Engineer, civil (contracting), 102549.37, 3
Sara Johnson, pcaldwell@yahoo.com, 281-370-7097, 6/18/1977, Engineer, water, 64802.3, 2
Noah Hogan, vberry@yahoo.com, 503.105.1313x897, 6/25/1989, Naval architect, 56881.13, 2
Rick Collins, zacharysimmons@jackson.com, +1-426-230-6584x23684, 2/21/1990, Editor, commissioning, 103526.13, 4
Christopher Rogers, armstrongmario@gmail.com, 704-417-2925, 9/14/1947, Designer, multimedia, 96027.05, 3
Kimberly Rodriguez, mmurray@hotmail.com, (240)341-0672x015, 8/21/1974, Food t

Step 6: Save the results

In [None]:
# Only apply if the number of guessed headers matches number of columns
headers_applied = len(column_name_list) == df.shape[1]
if headers_applied:
    df.columns = column_name_list
    print("✅ Column headers applied successfully.")
else:
    print("⚠️ Column count mismatch. Skipping header assignment.")

# Export. When no headers were applied, the frame still carries pandas'
# default integer labels (0, 1, 2, ...); writing those as a header row would
# put a bogus first line into the file, so the header row is omitted instead.
df.to_csv("labeled_dataset.csv", index=False, header=headers_applied)

# Trigger a browser download of the exported file from the Colab runtime.
from google.colab import files
files.download("labeled_dataset.csv")


✅ Column headers applied successfully.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>