In [8]:
import os
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
import gradio as gr
from unsloth import FastLanguageModel  # For efficient fine-tuning

ModuleNotFoundError: No module named 'gradio'

In [11]:
# Pick the compute device: CUDA when PyTorch can see a GPU, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# --- Step 1: Load and Process Datasets ---
# Define a function to load and process datasets
def load_and_process_dataset(dataset_path):
    """
    Build prompt/output pairs for fine-tuning from one dataset folder.

    The folder must contain a `column_info.csv` file with a `Column_Name`
    column, plus at least one other `.csv` file holding the data rows. Every
    data row yields one pair: the prompt lists the column names and the output
    is that row serialized as a single CSV record.

    Args:
        dataset_path: Path of the dataset folder.

    Returns:
        A tuple `(inputs, outputs)` of two equally long lists of strings.
        Both lists are empty when the folder cannot be used: missing
        `column_info.csv`, no `Column_Name` column, no data file, or none of
        the listed columns present in the data file.
    """
    # Imported locally so the function does not depend on another cell's imports.
    import csv
    import io

    def _csv_record(values):
        # csv.writer quotes fields that contain commas, quotes or line breaks,
        # so each record can be parsed back unambiguously.
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator="\r\n").writerow([str(value) for value in values])
        return buffer.getvalue()[:-2]  # drop the trailing "\r\n"

    # Check if column_info.csv exists
    column_info_path = os.path.join(dataset_path, "column_info.csv")
    if not os.path.exists(column_info_path):
        print(f"Skipping dataset {dataset_path} because column_info.csv is missing.")
        return [], []

    # Load column information and make sure it has the column we rely on
    column_info = pd.read_csv(column_info_path)
    if "Column_Name" not in column_info.columns:
        print(f"Skipping dataset {dataset_path} because column_info.csv has no 'Column_Name' column.")
        return [], []

    # Find the main data file; sorting makes the choice deterministic because
    # os.listdir returns entries in arbitrary order.
    data_files = sorted(
        f for f in os.listdir(dataset_path)
        if f.endswith(".csv") and f != "column_info.csv"
    )
    if not data_files:
        print(f"No data file found in {dataset_path}.")
        return [], []

    # Load the main data file
    data = pd.read_csv(os.path.join(dataset_path, data_files[0]))

    # Keep only the listed columns that really exist, instead of raising KeyError
    listed_columns = column_info["Column_Name"].tolist()
    missing_columns = [col for col in listed_columns if col not in data.columns]
    if missing_columns:
        print(f"Warning: The following columns are missing in the main data file for {dataset_path}: {missing_columns}.")
    column_names = [col for col in listed_columns if col in data.columns]
    if not column_names:
        print(f"Skipping dataset {dataset_path} because none of the listed columns exist in the main data file.")
        return [], []

    # The prompt is identical for every row, so build it once
    prompt = f"Generate a dataset with columns: {', '.join(map(str, column_names))}."

    # itertuples keeps each column's own dtype; iterrows would upcast ints to
    # floats in all-numeric frames and is also much slower.
    outputs = [
        _csv_record(values)
        for values in data[column_names].itertuples(index=False, name=None)
    ]
    inputs = [prompt] * len(outputs)

    return inputs, outputs


Using device: cpu


In [23]:
import os
import pandas as pd
from datasets import Dataset

def load_and_process_dataset(dataset_path):
    """
    Build prompt/output pairs for fine-tuning from one dataset folder.

    The folder must contain a `column_info.csv` file describing the columns
    and at least one other `.csv` file holding the data rows.
    `column_info.csv` must provide these columns:
    - Column Name: The name of the column.
    - Column Description: A description of the column.
    - Column Type: The data type of the column.

    Every data row yields one pair: the prompt lists name, type and
    description of each usable column, and the output is that row serialized
    as a single CSV record (columns in `column_info.csv` order).

    Args:
        dataset_path: Path of the dataset folder.

    Returns:
        A tuple `(inputs, outputs)` of two equally long lists of strings.
        Both lists are empty when the folder cannot be used: missing or
        incomplete `column_info.csv`, no data file, or none of the described
        columns present in the data file.
    """
    # Imported locally so the function does not depend on another cell's imports.
    import csv
    import io

    def _csv_record(values):
        # csv.writer quotes fields that contain commas, quotes or line breaks,
        # so each record can be parsed back unambiguously.
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator="\r\n").writerow([str(value) for value in values])
        return buffer.getvalue()[:-2]  # drop the trailing "\r\n"

    # Check if column_info.csv exists
    column_info_path = os.path.join(dataset_path, "column_info.csv")
    if not os.path.exists(column_info_path):
        print(f"Skipping dataset {dataset_path} because column_info.csv is missing.")
        return [], []

    # Load column information
    column_info = pd.read_csv(column_info_path)

    # Check if the required columns exist in column_info.csv
    required_columns = ["Column Name", "Column Description", "Column Type"]
    if not all(col in column_info.columns for col in required_columns):
        print(f"Skipping dataset {dataset_path} because column_info.csv is missing required columns: {required_columns}.")
        return [], []

    # Find the main data file; sorting makes the choice deterministic because
    # os.listdir returns entries in arbitrary order.
    data_files = sorted(
        f for f in os.listdir(dataset_path)
        if f.endswith(".csv") and f != "column_info.csv"
    )
    if not data_files:
        print(f"No data file found in {dataset_path}.")
        return [], []

    # Load the main data file
    data = pd.read_csv(os.path.join(dataset_path, data_files[0]))

    # Drop described columns that the data file does not actually contain
    missing_columns = [col for col in column_info["Column Name"] if col not in data.columns]
    if missing_columns:
        print(f"Warning: The following columns are missing in the main data file for {dataset_path}: {missing_columns}.")
        column_info = column_info[~column_info["Column Name"].isin(missing_columns)]

    column_names = column_info["Column Name"].tolist()
    if not column_names:
        # Without any usable column every output would be an empty string.
        print(f"Skipping dataset {dataset_path} because none of the described columns exist in the main data file.")
        return [], []

    # The prompt is identical for every row, so build it once
    column_descriptions = [
        f"{col_name} ({col_type}): {col_desc}"
        for col_name, col_desc, col_type in zip(
            column_info["Column Name"],
            column_info["Column Description"],
            column_info["Column Type"],
        )
    ]
    prompt = "Generate a dataset with the following columns:\n" + "\n".join(column_descriptions)

    # itertuples keeps each column's own dtype; iterrows would upcast ints to
    # floats in all-numeric frames and is also much slower.
    outputs = [
        _csv_record(values)
        for values in data[column_names].itertuples(index=False, name=None)
    ]
    inputs = [prompt] * len(outputs)

    return inputs, outputs

# Dataset folders that are converted into fine-tuning examples
dataset_paths = [
    "LLM_data/50K_Songs_Dataset_-_Generated_by_AI",
    "LLM_data/Bank_Transaction_Dataset_for_Fraud_Detection",
    "LLM_data/Customer_Feedback_and_Satisfaction",
    "LLM_data/Data_Science_Job",
    "LLM_data/Gym_Members_Exercise_Dataset",
    "LLM_data/IMDB_Movie_Dataset",
    "LLM_data/Loan_Approval_Classification_Dataset",
    "LLM_data/Mobile_Device_Usage_and_User_Behavior_Dataset",
    "LLM_data/New_York_Airbnb_Open_Data",
    "LLM_data/US_Election_Dataset_",
]

# Collect the prompt/output pairs of every folder, in listing order
all_inputs, all_outputs = [], []
for path in dataset_paths:
    inputs, outputs = load_and_process_dataset(path)
    all_inputs += inputs
    all_outputs += outputs

# Wrap the collected pairs in a Hugging Face Dataset
paired_columns = {"input": all_inputs, "output": all_outputs}
dataset = Dataset.from_dict(paired_columns)

print(f"Total prompt-output pairs: {len(all_inputs)}")

Total prompt-output pairs: 302262


In [15]:
import os

from huggingface_hub import login

# Never store an access token in the notebook: it ends up in version control
# and in every shared copy. The token is read from the HF_TOKEN environment
# variable instead; when the variable is unset, None is passed and
# huggingface_hub is expected to ask for the token interactively.
# NOTE(review): the token that used to be hardcoded here must be treated as
# leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [24]:
import os
import pandas as pd

def debug_dataset(dataset_path):
    """
    Print the column names of a dataset folder's main data file next to the
    column names listed in its column_info.csv, and report listed columns
    that the data file does not contain.

    Args:
        dataset_path: Path of the dataset folder.

    Returns:
        None. All findings are printed.
    """
    # Check if column_info.csv exists
    column_info_path = os.path.join(dataset_path, "column_info.csv")
    if not os.path.exists(column_info_path):
        print(f"Skipping dataset {dataset_path} because column_info.csv is missing.")
        return

    # Load column information
    column_info = pd.read_csv(column_info_path)

    # Find the main data file; sorting makes the choice deterministic because
    # os.listdir returns entries in arbitrary order.
    data_files = sorted(
        f for f in os.listdir(dataset_path)
        if f.endswith(".csv") and f != "column_info.csv"
    )
    if not data_files:
        print(f"No data file found in {dataset_path}.")
        return

    # Only the header is needed here, so skip parsing the data rows (nrows=0)
    data_file_path = os.path.join(dataset_path, data_files[0])
    data = pd.read_csv(data_file_path, nrows=0)

    # Print column names in the main data file
    print(f"Columns in {data_files[0]}: {data.columns.tolist()}")

    # A column_info.csv with different headers is reported instead of raising KeyError
    if "Column Name" not in column_info.columns:
        print(f"column_info.csv has no 'Column Name' column; found: {column_info.columns.tolist()}")
        return

    # Print column names in column_info.csv
    print(f"Columns in column_info.csv: {column_info['Column Name'].tolist()}")

    # Check for missing columns
    missing_columns = [col for col in column_info["Column Name"] if col not in data.columns]
    if missing_columns:
        print(f"Warning: The following columns are missing in the main data file: {missing_columns}.")

# Debug the 50K_Songs_Dataset_-_Generated_by_AI folder first,
# then the US_Election_Dataset_ folder
for debug_target in (
    "LLM_data/50K_Songs_Dataset_-_Generated_by_AI",
    "LLM_data/US_Election_Dataset_",
):
    debug_dataset(debug_target)

Columns in spotify_songs_dataset.csv: ['song_id', 'song_title', 'artist', 'album', 'genre', 'release_date', 'duration', 'popularity', 'stream', 'language', 'explicit_content', 'label', 'composer', 'producer', 'collaboration']
Columns in column_info.csv: ['track_name', 'artist_name', 'duration_ms', 'popularity', 'genre', 'key', 'mode', 'tempo']
Columns in US_Election_dataset_v1.csv: ['Unnamed: 0', 'county', 'state', '2020 Democrat vote raw', '2020 Democrat vote %', '2020 Republican vote raw', '2020 Republican vote %', '2020 other vote raw', '2020 other vote %', 'Population with less than 9th grade education', 'Population with 9th to 12th grade education, no diploma', 'High School graduate and equivalent', 'Some College,No Degree', 'Associates Degree', 'Bachelors Degree', 'Graduate or professional degree', 'Gini Index', 'Median income (dollars)', 'Mean income (dollars)', 'Area in square Km', 'Density per square km', 'Total Population', 'Hispanic or Latino percentage', 'NH-White percentag

In [26]:
import os
import pandas as pd
from datasets import Dataset

def load_and_process_dataset(dataset_path):
    """
    Build prompt/output pairs for fine-tuning from one dataset folder.

    The first `.csv` file (alphabetically) other than `column_info.csv` is
    used as the data file, and every one of its columns is included. When
    `column_info.csv` exists and has `Column Name` and `Column Description`
    columns, its descriptions are used in the prompt. Columns without a
    usable description get a synthetic one.

    Every data row yields one pair: the prompt describes the columns and the
    output is that row serialized as a single CSV record.

    Args:
        dataset_path: Path of the dataset folder.

    Returns:
        A tuple `(inputs, outputs)` of two equally long lists of strings.
        Both lists are empty when the folder does not exist or holds no data
        file.
    """
    # Imported locally so the function does not depend on another cell's imports.
    import csv
    import io

    def _csv_record(values):
        # csv.writer quotes fields that contain commas, quotes or line breaks,
        # so each record can be parsed back unambiguously.
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator="\r\n").writerow([str(value) for value in values])
        return buffer.getvalue()[:-2]  # drop the trailing "\r\n"

    # A missing folder is skipped instead of letting os.listdir abort the whole run
    if not os.path.isdir(dataset_path):
        print(f"Skipping dataset {dataset_path} because the folder does not exist.")
        return [], []

    # Find the main data file; sorting makes the choice deterministic because
    # os.listdir returns entries in arbitrary order.
    data_files = sorted(
        f for f in os.listdir(dataset_path)
        if f.endswith(".csv") and f != "column_info.csv"
    )
    if not data_files:
        print(f"No data file found in {dataset_path}.")
        return [], []

    # Load the main data file
    data = pd.read_csv(os.path.join(dataset_path, data_files[0]))

    # Map column names to their descriptions when column_info.csv provides them
    column_descriptions = {}
    column_info_path = os.path.join(dataset_path, "column_info.csv")
    if os.path.exists(column_info_path):
        column_info = pd.read_csv(column_info_path)
        if "Column Name" in column_info.columns and "Column Description" in column_info.columns:
            # Blank descriptions are read as NaN; leave them out so the
            # synthetic description is used instead of the text "nan".
            column_descriptions = {
                name: description
                for name, description in zip(
                    column_info["Column Name"], column_info["Column Description"]
                )
                if pd.notna(description)
            }

    # The prompt is identical for every row, so build it once
    prompt_columns = [
        f"{col}: {column_descriptions[col]}"
        if col in column_descriptions
        else f"{col}: The {col} of the record."
        for col in data.columns
    ]
    prompt = "Generate a dataset with the following columns:\n" + "\n".join(prompt_columns)

    # itertuples keeps each column's own dtype; iterrows would upcast ints to
    # floats in all-numeric frames and is also much slower.
    outputs = [_csv_record(values) for values in data.itertuples(index=False, name=None)]
    inputs = [prompt] * len(outputs)

    return inputs, outputs

# Folders whose rows are turned into prompt/output training pairs
dataset_paths = [
    "LLM_data/50K_Songs_Dataset_-_Generated_by_AI",
    "LLM_data/Bank_Transaction_Dataset_for_Fraud_Detection",
    "LLM_data/Customer_Feedback_and_Satisfaction",
    "LLM_data/Data_Science_Job",
    "LLM_data/Gym_Members_Exercise_Dataset",
    "LLM_data/IMDB_Movie_Dataset",
    "LLM_data/Loan_Approval_Classification_Dataset",
    "LLM_data/Mobile_Device_Usage_and_User_Behavior_Dataset",
    "LLM_data/New_York_Airbnb_Open_Data",
    "LLM_data/US_Election_Dataset_",
]

# Process the folders one by one and append their pairs to the running lists
all_inputs, all_outputs = [], []
for path in dataset_paths:
    inputs, outputs = load_and_process_dataset(path)
    all_inputs += inputs
    all_outputs += outputs

# Convert the two parallel lists to a Hugging Face Dataset
dataset = Dataset.from_dict(
    {
        "input": all_inputs,
        "output": all_outputs,
    }
)

print(f"Total prompt-output pairs: {len(all_inputs)}")

Total prompt-output pairs: 302262


In [None]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer
import torch

# Load the model with Unsloth
import os

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-2-7b-hf",  # Replace with your model
    max_seq_length=2048,  # Adjust based on your dataset
    dtype=torch.float16,  # Use mixed precision for GPU
    load_in_4bit=True,  # Use 4-bit quantization for memory efficiency
    # Never hardcode the access token in the notebook; read it from the
    # environment. NOTE(review): the token that used to be written here must
    # be treated as leaked and revoked.
    token=os.environ.get("HF_TOKEN"),
)

# Padding to a fixed length needs a pad token; fall back to EOS if none is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# A 4-bit quantized base model has no trainable weights of its own, so LoRA
# adapters are attached and only those are trained.
# NOTE(review): rank/alpha/target modules are common starting values, not
# tuned for this data set.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    random_state=3407,
)

# Tokenize the dataset
def tokenize_function(examples):
    """
    Tokenize a batch of prompt/output pairs for causal-LM fine-tuning.

    Each training text is the prompt, a newline, the expected output and the
    tokenizer's EOS token, so the model sees the rows it should learn to
    generate. `labels` mirror `input_ids`, with padding positions set to -100
    (the conventional ignore index for the loss).

    Args:
        examples: A batch mapping with parallel lists under "input" and "output".

    Returns:
        The tokenizer's encoding for the batch with an added "labels" entry.
    """
    eos_token = tokenizer.eos_token or ""
    texts = [
        f"{prompt}\n{target}{eos_token}"
        for prompt, target in zip(examples["input"], examples["output"])
    ]
    # NOTE(review): texts longer than 512 tokens are truncated, which can cut
    # off the output row for datasets with very long column listings.
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    # Use the attention mask (not the pad token id) to find padding, so a real
    # EOS token still counts even when the pad token equals the EOS token.
    encoded["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

from transformers import DataCollatorForLanguageModeling

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# A causal LM only returns a loss when the batch contains `labels`. With
# mlm=False this collator derives them from `input_ids` and ignores padding
# positions, so training does not depend on what the tokenized columns hold.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Fine-tune the model with Unsloth
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",  # Directory to save the fine-tuned model
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    num_train_epochs=3,  # Number of training epochs
    save_steps=10_000,  # Save checkpoint every 10,000 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
    fp16=True,  # Enable mixed precision for GPU
    logging_dir="./logs",  # Directory for logs
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("Fine-tuning complete and model saved to './fine_tuned_model'.")