In [22]:
# alpaca 
# OpenAssistant 
# kaggle 
# Synthetic 

#### Loading dataset From alpaca -->


In [6]:
from datasets import load_dataset

dataset = load_dataset("tatsu-lab/alpaca")

print(dataset)


Found cached dataset parquet (C:/Users/arun4/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


In [7]:
print(dataset["train"][0])



{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


In [8]:
# Lower-case phrases used to select data-science related instructions.
# filter_ds_related tests each entry as a plain substring of the lower-cased
# instruction, so short or generic entries match broadly.
# NOTE(review): "model" also matches non-ML uses such as "Render a 3D model
# of a house", which shows up in the filtered output further down.
keywords = [
    "data science",
    "machine learning",
    "deep learning",
    "artificial intelligence",
    "neural network",
    "statistics",
    "probability",
    "python",
    "pandas",
    "numpy",
    "sql",
    "nlp",
    "computer vision",
    "regression",
    "classification",
    "clustering",
    "model",
    "overfitting",
    "underfitting",
    "gradient descent",
    "interview",
    "data analysis",
    "feature engineering",
    "data preprocessing",
    "data visualization",
    "decision tree",
    "random forest",
    "xgboost",
    "transformer",
    "llm",
    "mlops",
    "hyperparameter"
]



In [9]:
def filter_ds_related(example):
    """Tell whether a record's instruction mentions a data-science keyword.

    The instruction is lower-cased and every entry of the module-level
    ``keywords`` list is tried as a plain substring.

    Args:
        example: mapping with an ``"instruction"`` string.

    Returns:
        True as soon as one keyword is found, otherwise False.
    """
    lowered = example["instruction"].lower()

    for keyword in keywords:
        if keyword in lowered:
            return True
    return False


In [10]:
train_data = dataset["train"]

filtered_data = train_data.filter(filter_ds_related)


Loading cached processed dataset at C:\Users\arun4\.cache\huggingface\datasets\tatsu-lab___parquet\tatsu-lab--alpaca-2b32f0433506ef5f\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-0e70526648b7e046.arrow


In [11]:
print("Original size:", len(train_data))
print("Filtered size:", len(filtered_data))


Original size: 52002
Filtered size: 2226


In [12]:
filtered_data

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 2226
})

##### Cleaning the filtered dataset

- Remove noisy records
- Fix formatting issues
- Remove very short / very long samples
- Remove low-quality answers
- Standardize text

In [13]:
import re

def basic_clean(example):
    """Normalise whitespace in the instruction, input and output fields.

    Each field is trimmed and every run of whitespace is folded into a
    single space. A falsy ``input`` (empty string or None) becomes "".

    NOTE(review): ``\\s+`` also matches newlines and tabs, so multi-line
    answers (lists, code snippets) are flattened onto one line.

    Args:
        example: mapping with ``"instruction"``, ``"input"`` and ``"output"``.

    Returns:
        dict with the three cleaned fields.
    """
    def squeeze(raw):
        # Trim the ends first, then fold whitespace runs into one space.
        return re.sub(r"\s+", " ", raw.strip())

    cleaned_input = squeeze(example["input"]) if example["input"] else ""

    return {
        "instruction": squeeze(example["instruction"]),
        "input": cleaned_input,
        "output": squeeze(example["output"]),
    }


In [14]:
cleaned = filtered_data.map(basic_clean)


Loading cached processed dataset at C:\Users\arun4\.cache\huggingface\datasets\tatsu-lab___parquet\tatsu-lab--alpaca-2b32f0433506ef5f\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-e130327abbd68cd9.arrow


In [15]:
## Remove Low Quality Records

def remove_empty(example):
    """Keep a record only when instruction and output each exceed 10 characters."""
    if len(example["instruction"]) <= 10:
        return False
    return len(example["output"]) > 10

cleaned = cleaned.filter(remove_empty)


Loading cached processed dataset at C:\Users\arun4\.cache\huggingface\datasets\tatsu-lab___parquet\tatsu-lab--alpaca-2b32f0433506ef5f\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-f31d3a7510bbaca9.arrow


In [16]:
## Remove Very Short or Very Long Samples

def length_filter(example):
    """Keep records whose instruction + output length lies strictly between 30 and 2000 characters."""
    combined = sum(len(example[field]) for field in ("instruction", "output"))
    return combined > 30 and combined < 2000


In [17]:
cleaned = cleaned.filter(length_filter)


Loading cached processed dataset at C:\Users\arun4\.cache\huggingface\datasets\tatsu-lab___parquet\tatsu-lab--alpaca-2b32f0433506ef5f\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-fed9665589cf2b00.arrow


In [18]:
## Remove Duplicates
import pandas as pd
from datasets import Dataset

# Convert to pandas so duplicates can be dropped with DataFrame.drop_duplicates
df = cleaned.to_pandas()

# Two records count as duplicates only when instruction, input AND output all
# match. Leaving "input" out of the key would also discard tasks that share an
# instruction and answer but operate on different inputs.
# reset_index gives the surviving rows a fresh 0..n-1 index.
df = df.drop_duplicates(subset=["instruction", "input", "output"]).reset_index(drop=True)

# Convert back to a Hugging Face dataset. preserve_index=False keeps the
# pandas index from being written out as an extra "__index_level_0__" column,
# which would otherwise make this dataset's schema differ from the others
# when they are concatenated later.
cleaned_dataset = Dataset.from_pandas(df, preserve_index=False)

print("After deduplication:", len(cleaned_dataset))


After deduplication: 2190


In [19]:
## Create Final Prompt Format

def create_prompt(example):
    """Assemble the training prompt for one Alpaca-style record.

    The prompt is a sequence of "### <Section>:" blocks separated by a blank
    line. The "### Input:" block is included only when ``example["input"]``
    is truthy.

    Args:
        example: mapping with ``"instruction"``, ``"input"`` and ``"output"``.

    Returns:
        dict with a single ``"text"`` key holding the prompt.
    """
    sections = [f"### Instruction:\n{example['instruction']}"]

    if example["input"]:
        sections.append(f"### Input:\n{example['input']}")

    sections.append(f"### Response:\n{example['output']}")

    return {"text": "\n\n".join(sections)}

final_cleaned = cleaned_dataset.map(create_prompt)


Map:   0%|          | 0/2190 [00:00<?, ? examples/s]

In [20]:
print("Final cleaned size:", len(final_cleaned))
print(final_cleaned[0])


Final cleaned size: 2190
{'instruction': 'Render a 3D model of a house', 'input': '', 'output': '<nooutput> This type of instruction cannot be fulfilled by a GPT model.', 'text': '### Instruction:\nRender a 3D model of a house\n\n### Response:\n<nooutput> This type of instruction cannot be fulfilled by a GPT model.'}


In [21]:
## Remove Non-Textual / Non-Answerable Tasks

# Phrases that mark an instruction as asking for something a text-only model
# cannot produce (images, audio, physical actions). Matched as lower-case
# substrings of the instruction.
bad_keywords = [
    "render",
    "draw",
    "paint",
    "image of",
    "picture of",
    "3d model",
    "generate an image",
    "create a video",
    "audio file",
    "physical",
    "real world action"
]

# Placeholder found in answers of tasks that could not be fulfilled,
# e.g. "<nooutput> This type of instruction cannot be fulfilled by a GPT model."
NO_OUTPUT_MARKER = "<nooutput>"

def remove_bad_tasks(example):
    """Decide whether a record is a usable, text-answerable task.

    A record is rejected when its instruction contains any ``bad_keywords``
    phrase, or when its output carries the ``<nooutput>`` placeholder. The
    placeholder check catches non-answerable tasks whose instruction wording
    is not covered by the keyword list.

    Args:
        example: mapping with an ``"instruction"`` string and, optionally, an
            ``"output"`` string.

    Returns:
        True to keep the record, False to drop it.
    """
    text = example["instruction"].lower()
    if any(k in text for k in bad_keywords):
        return False

    # A missing or None output is treated as empty text.
    answer = (example.get("output") or "").lower()
    return NO_OUTPUT_MARKER not in answer


In [22]:
cleaned = final_cleaned.filter(remove_bad_tasks)

print("After removing non-text tasks:", len(cleaned))


Filter:   0%|          | 0/2190 [00:00<?, ? examples/s]

After removing non-text tasks: 2179


In [23]:
cleaned.save_to_disk("alpaca_ds_cleaned")


Saving the dataset (0/1 shards):   0%|          | 0/2179 [00:00<?, ? examples/s]

Loading open-assistant dataset -->


In [4]:
from datasets import load_dataset
import pandas as pd

# Load OpenAssistant dataset
dataset = load_dataset("OpenAssistant/oasst1")

print(dataset)


Found cached dataset parquet (C:/Users/arun4/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})


In [5]:
print(dataset["train"][0])
print(dataset["train"][1])
print(dataset["train"][2])


{'message_id': '6ab24d72-0181-4594-a9cd-deaf170242fb', 'parent_id': None, 'user_id': 'c3fe8c76-fc30-4fa7-b7f8-c492f5967d18', 'created_date': '2023-02-05T14:23:50.983374+00:00', 'text': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.', 'role': 'prompter', 'lang': 'en', 'review_count': 3, 'review_result': True, 'deleted': False, 'rank': None, 'synthetic': False, 'model_name': None, 'detoxify': {'toxicity': 0.00044308538781479, 'severe_toxicity': 3.252684837207198e-05, 'obscene': 0.00023475120542570949, 'identity_attack': 0.0001416115992469713, 'insult': 0.00039489680784754455, 'threat': 4.075629112776369e-05, 'sexual_explicit': 2.712695459194947e-05}, 'message_tree_id': '6ab24d72-0181-4594-a9cd-deaf170242fb', 'tree_state': 'ready_for_export', 'emojis': {'name': ['+1', '_skip_reply', '_skip_ranking'], 'count': [10, 1, 4]}, 'labels': {'name': [

In [6]:
roles = set(dataset["train"]["role"])
print(roles)


{'prompter', 'assistant'}


In [7]:
for i in range(10):
    print(i, dataset["train"][i]["role"])


0 prompter
1 assistant
2 prompter
3 assistant
4 prompter
5 assistant
6 prompter
7 assistant
8 assistant
9 assistant


In [10]:
data = dataset["train"]
alpaca_style = []

# The split is a flat list of messages, and a prompter row can be followed by
# several assistant rows. Neighbouring rows are therefore only treated as a
# question/answer pair when the assistant message is a direct reply to the
# prompter message, i.e. its parent_id equals the prompter's message_id.
# NOTE(review): messages in every language are kept; the dataset has a "lang"
# column if only English pairs are wanted.
rows = iter(data)
previous = next(rows, None)  # None when the split is empty; the loop is then skipped

# Walking the rows once and remembering the previous one decodes each row a
# single time instead of indexing data[i] and data[i + 1] on every step.
for message in rows:
    # Only take valid prompter -> assistant pairs
    if (
        previous["role"] == "prompter"
        and message["role"] == "assistant"
        and message["parent_id"] == previous["message_id"]
    ):
        alpaca_style.append({
            "instruction": previous["text"].strip(),
            "input": "",
            "output": message["text"].strip()
        })
    previous = message


In [11]:
# convert to hugging face dataset 
from datasets import Dataset

oa_dataset = Dataset.from_list(alpaca_style)

print(oa_dataset)


Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 27904
})


In [12]:
# Lower-case phrases used to select data-science related prompts from the
# OpenAssistant pairs. This list has the same entries as the one used for the
# Alpaca filter earlier in the notebook; filter_ds_related tests each entry
# as a plain substring of the lower-cased instruction.
keywords = [
    "data science",
    "machine learning",
    "deep learning",
    "artificial intelligence",
    "neural network",
    "statistics",
    "probability",
    "python",
    "pandas",
    "numpy",
    "sql",
    "nlp",
    "computer vision",
    "regression",
    "classification",
    "clustering",
    "model",
    "overfitting",
    "underfitting",
    "gradient descent",
    "interview",
    "data analysis",
    "feature engineering",
    "data preprocessing",
    "data visualization",
    "decision tree",
    "random forest",
    "xgboost",
    "transformer",
    "llm",
    "mlops",
    "hyperparameter"
]

Filtering rows for the data science domain


In [14]:
def filter_ds_related(example):
    """Return True when the lower-cased instruction contains any entry of ``keywords``.

    Matching is a plain substring test against the module-level list.
    """
    haystack = example["instruction"].lower()
    matches = (phrase in haystack for phrase in keywords)
    # next() stops at the first hit, just like any() would.
    return next((True for hit in matches if hit), False)

oa_filtered = oa_dataset.filter(filter_ds_related)

print("Rows after DS filtering:", len(oa_filtered))


Filter:   0%|          | 0/27904 [00:00<?, ? examples/s]

Rows after DS filtering: 1165


In [15]:
## basic cleaning - remove extra space and normalize text 

import re

def basic_clean(example):
    """Trim and whitespace-normalise the instruction and output of one pair.

    Every run of whitespace is replaced by a single space; ``input`` is
    always set to the empty string.

    NOTE(review): ``\\s+`` matches newlines as well, so multi-line answers
    (including code) end up on a single line.

    Args:
        example: mapping with ``"instruction"`` and ``"output"`` strings.

    Returns:
        dict with ``"instruction"``, ``"input"`` and ``"output"``.
    """
    cleaned = {
        field: re.sub(r"\s+", " ", example[field].strip())
        for field in ("instruction", "output")
    }
    return {
        "instruction": cleaned["instruction"],
        "input": "",
        "output": cleaned["output"],
    }

oa_cleaned = oa_filtered.map(basic_clean)


Map:   0%|          | 0/1165 [00:00<?, ? examples/s]

In [16]:
## remove very short or use less rows

def length_filter(example):
    """Keep pairs whose instruction is longer than 15 and whose output is longer than 30 characters."""
    minimums = {"instruction": 15, "output": 30}
    # Checked in order; stops at the first field that is too short.
    return all(len(example[field]) > floor for field, floor in minimums.items())

oa_cleaned = oa_cleaned.filter(length_filter)

print("After length filtering:", len(oa_cleaned))


Filter:   0%|          | 0/1165 [00:00<?, ? examples/s]

After length filtering: 1139


In [17]:
## remove Non- Interview Style Content 

bad_patterns = ["joke", "story", "poem", "translate", "email", "lyrics"]

# Match a pattern only where it starts a word. A bare substring test would
# treat "history" as containing "story" and throw away legitimate questions,
# while word-start matching still catches plurals and inflections such as
# "jokes", "poems" or "translated".
_bad_pattern_re = re.compile(
    r"\b(?:" + "|".join(re.escape(p) for p in bad_patterns) + r")"
)

def remove_noise(example):
    """Decide whether a pair should be kept as interview-style content.

    Args:
        example: mapping with an ``"instruction"`` string.

    Returns:
        False when a word in the lower-cased instruction begins with one of
        ``bad_patterns`` (creative-writing / translation / e-mail requests),
        True otherwise.
    """
    text = example["instruction"].lower()
    return _bad_pattern_re.search(text) is None

oa_cleaned = oa_cleaned.filter(remove_noise)

print("After noise removal:", len(oa_cleaned))


Filter:   0%|          | 0/1139 [00:00<?, ? examples/s]

After noise removal: 1108


In [18]:
## Create Final training prompt feild 

def create_prompt(example):
    """Build the "### Instruction / ### Response" training prompt for one pair.

    Args:
        example: mapping with ``"instruction"`` and ``"output"``.

    Returns:
        dict with a single ``"text"`` key holding the prompt.
    """
    template = "### Instruction:\n{instruction}\n\n### Response:\n{output}"
    prompt = template.format(
        instruction=example["instruction"],
        output=example["output"],
    )
    return {"text": prompt}

oa_final = oa_cleaned.map(create_prompt)


Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

In [19]:
print("Final cleaned OpenAssistant rows:", len(oa_final))
print(oa_final[0])


Final cleaned OpenAssistant rows: 1108
{'instruction': 'Can you explain contrastive learning in machine learning in simple terms for someone new to the field of ML?', 'input': '', 'output': 'Sure! Let\'s say you want to build a model which can distinguish between images of cats and dogs. You gather your dataset, consisting of many cat and dog pictures. Then you put them through a neural net of your choice, which produces some representation for each image, a sequence of numbers like [0.123, 0.045, 0.334, ...]. The problem is, if your model is unfamiliar with cat and dog images, these representations will be quite random. At one time a cat and a dog picture could have very similar representations (their numbers would be close to each other), while at others two cat images may be represented far apart. In simple terms, the model wouldn\'t be able to tell cats and dogs apart. This is where contrastive learning comes in. The point of contrastive learning is to take pairs of samples (in thi

In [20]:
## saving the cleaned dataset 
oa_final.save_to_disk("openassistant_ds_cleaned")


Saving the dataset (0/1 shards):   0%|          | 0/1108 [00:00<?, ? examples/s]

Loading Kaggle ml_interview_questions -->


In [1]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="ml_interview_questions.csv")

data = dataset["train"]

print(data)


Downloading and preparing dataset csv/default to C:/Users/arun4/.cache/huggingface/datasets/csv/default-3bc42a4a125c2dfc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/arun4/.cache/huggingface/datasets/csv/default-3bc42a4a125c2dfc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'question', 'answer', 'category', 'difficulty', 'company_tags', 'topic_tags', 'answer_length'],
    num_rows: 502
})


In [None]:
## Conversion function - converts a Kaggle row into the Alpaca format

def convert_to_alpaca(example):
    """Map one question/answer row onto the instruction/input/output schema.

    Args:
        example: mapping with ``"question"`` and ``"answer"`` values.

    Returns:
        dict with the stripped question as ``"instruction"``, an empty
        ``"input"`` and the stripped answer as ``"output"``.
    """
    # A missing cell is treated as empty text instead of raising on .strip().
    question = example["question"] or ""
    answer = example["answer"] or ""

    return {
        "instruction": question.strip(),
        "input": "",
        "output": answer.strip()
    }


In [5]:
kaggle_ds = data.map(
    convert_to_alpaca,
    remove_columns=data.column_names
)



Loading cached processed dataset at C:\Users\arun4\.cache\huggingface\datasets\csv\default-3bc42a4a125c2dfc\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-5594d89436c359b2.arrow


In [6]:
# Create the Text Prompt Field
def create_prompt(example):
    """Render one record as an "### Instruction / ### Response" prompt.

    Args:
        example: mapping with ``"instruction"`` and ``"output"``.

    Returns:
        dict with a single ``"text"`` key holding the prompt.
    """
    prompt_lines = [
        "### Instruction:",
        f"{example['instruction']}",
        "",  # blank line between the two sections
        "### Response:",
        f"{example['output']}",
    ]
    return {"text": "\n".join(prompt_lines)}

kaggle_ds = kaggle_ds.map(create_prompt)


Map:   0%|          | 0/502 [00:00<?, ? examples/s]

Basic cleaning 


In [7]:
## Remove very small rows

def quality_filter(example):
    """Keep rows whose instruction exceeds 10 and whose output exceeds 20 characters."""
    if not len(example["instruction"]) > 10:
        return False
    return len(example["output"]) > 20

kaggle_ds = kaggle_ds.filter(quality_filter)

Filter:   0%|          | 0/502 [00:00<?, ? examples/s]

In [8]:
print(kaggle_ds[0])
print(kaggle_ds[1])


{'instruction': 'Describe the ConvNeXt architecture and its key innovations.', 'input': '', 'output': 'ConvNeXt is a notable deep learning architecture with important innovations. It addresses limitations of prior approaches and has been widely adopted. Key choices include its feature extraction approach, computational efficiency, and scalability. Understanding it is valuable for state-of-the-art systems.', 'text': '### Instruction:\nDescribe the ConvNeXt architecture and its key innovations.\n\n### Response:\nConvNeXt is a notable deep learning architecture with important innovations. It addresses limitations of prior approaches and has been widely adopted. Key choices include its feature extraction approach, computational efficiency, and scalability. Understanding it is valuable for state-of-the-art systems.'}
{'instruction': 'What is data augmentation and why is it important for computer vision?', 'input': '', 'output': 'Data augmentation applies random transformations to training i

In [14]:
print(len(kaggle_ds))


502


In [9]:
kaggle_ds.save_to_disk("kaggle_interview_cleaned")


Saving the dataset (0/1 shards):   0%|          | 0/502 [00:00<?, ? examples/s]

Loading some synthetic data -->

In [23]:
import pandas as pd

# Load synthetic datasets
ds1 = pd.read_csv("ds_instruction_dataset_1.csv")
ds2 = pd.read_csv("ds_instruction_dataset_2.csv")

print("Dataset 1 shape:", ds1.shape)
print("Dataset 2 shape:", ds2.shape)


Dataset 1 shape: (500, 3)
Dataset 2 shape: (500, 3)


In [24]:
print(ds1.columns)
print(ds2.columns)


Index(['instruction', 'input', 'output'], dtype='object')
Index(['instruction', 'input', 'output'], dtype='object')


In [25]:
## combining both datasets 
synthetic = pd.concat([ds1, ds2], ignore_index=True)

print("Total synthetic rows:", synthetic.shape)


Total synthetic rows: (1000, 3)


In [None]:
## creating Prompt Template

def create_prompt(row):
    """Build the training prompt for one synthetic row.

    Uses the same "### Instruction / ### Input / ### Response" layout as the
    prompts built for the other datasets in this notebook, so the merged
    corpus follows a single template. The "### Input" section is emitted only
    when the row carries a real input value.

    Args:
        row: mapping or pandas Series with ``"instruction"``, ``"input"`` and
            ``"output"``.

    Returns:
        The prompt as a string.
    """
    value = row["input"]
    # A missing input (None or NaN) must not be rendered into the prompt as
    # the literal text "nan" / "None".
    has_input = not pd.isna(value) and str(value).strip() != ""

    sections = [f"### Instruction:\n{row['instruction']}"]
    if has_input:
        sections.append(f"### Input:\n{value}")
    sections.append(f"### Response:\n{row['output']}")

    return "\n\n".join(sections)

synthetic["text"] = synthetic.apply(create_prompt, axis=1)


In [30]:
synthetic.head()

Unnamed: 0,instruction,input,output,text
0,How does neural networks help in data science ...,,"neural networks helps by improving accuracy, e...",### Instruction:\nHow does neural networks hel...
1,Explain the concept of MLOps.,,The concept of MLOps refers to fundamental pri...,### Instruction:\nExplain the concept of MLOps...
2,What is algorithms and why is it important?,,algorithms is a key area in data science. It h...,### Instruction:\nWhat is algorithms and why i...
3,What mistakes should be avoided in MLOps?,,Common mistakes in MLOps include poor understa...,### Instruction:\nWhat mistakes should be avoi...
4,Explain python in beginner friendly language.,,python can be understood as a way to analyze a...,### Instruction:\nExplain python in beginner f...


In [31]:
## `synthetic` is currently a pandas DataFrame, not a Hugging Face Dataset.
## Convert the DataFrame into a Hugging Face Dataset and write it to disk.

from datasets import Dataset

# Build the Dataset from the combined synthetic DataFrame
synthetic_ds = Dataset.from_pandas(synthetic)

# Persist it under the "synthetic_ds" directory
synthetic_ds.save_to_disk("synthetic_ds")



Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Loading all datasets -->

In [24]:
from datasets import load_from_disk, concatenate_datasets

alpaca = load_from_disk("alpaca_ds_cleaned")
oasst = load_from_disk("openassistant_ds_cleaned")
kaggle = load_from_disk("kaggle_interview_cleaned")
synthetic = load_from_disk("synthetic_ds")

print("Alpaca:", alpaca)
print("OpenAssistant:", oasst)
print("Kaggle:", kaggle)
print("Synthetic:", synthetic)


Alpaca: Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 2179
})
OpenAssistant: Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 1108
})
Kaggle: Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 502
})
Synthetic: Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 1000
})


Before concatenating, we must standardize all datasets.

In [None]:
## Helper function 
def fix_types(dataset):
    """Return ``dataset`` with the four prompt columns coerced to strings.

    Missing values (None, or float NaN) become the empty string rather than
    the literal text "None" / "nan"; every other value goes through ``str()``.

    Args:
        dataset: object exposing ``map(fn)`` whose rows provide
            ``"instruction"``, ``"input"``, ``"output"`` and ``"text"``.

    Returns:
        Whatever ``dataset.map`` returns for the coercing function.
    """
    columns = ("instruction", "input", "output", "text")

    def as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    return dataset.map(lambda x: {name: as_text(x[name]) for name in columns})


In [28]:
alpaca = fix_types(alpaca)
oasst = fix_types(oasst)
kaggle = fix_types(kaggle)
synthetic = fix_types(synthetic)


Map:   0%|          | 0/2179 [00:00<?, ? examples/s]

Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

Map:   0%|          | 0/502 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Concatenating the datasets -->

In [29]:
from datasets import concatenate_datasets

final_dataset = concatenate_datasets([alpaca, oasst, kaggle, synthetic])

print("Final dataset size:", final_dataset)


Final dataset size: Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 4789
})


In [None]:
## Shuffle the dataset 

final_dataset = final_dataset.shuffle(seed=42)


In [None]:
## Create Train / Test Split


# Hold out 10% of the rows as the test split; the fixed seed makes the
# partition repeatable.
split = final_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = split["train"]
test_dataset = split["test"]

# These print the Dataset objects (features and row counts), not bare sizes.
print("Train size:", train_dataset)
print("Test size:", test_dataset)


Train size: Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 4310
})
Test size: Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 479
})


In [33]:
print(train_dataset.column_names)
print(test_dataset.column_names)


['instruction', 'input', 'output', 'text']
['instruction', 'input', 'output', 'text']


In [None]:
## Removing the Unnecessary columns 

train_dataset = train_dataset.remove_columns(["instruction", "input", "output"])
test_dataset = test_dataset.remove_columns(["instruction", "input", "output"])


In [35]:
print(train_dataset.column_names)
print(test_dataset.column_names)


['text']
['text']


The final train and test datasets are ready; save them to disk.

In [36]:
train_dataset.save_to_disk("train_dataset")
test_dataset.save_to_disk("test_dataset")


Saving the dataset (0/1 shards):   0%|          | 0/4310 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]

Fine tuning preparation 

In [2]:
## Loading final dataset 
from datasets import load_from_disk

train_data = load_from_disk("train_dataset")
val_data = load_from_disk("test_dataset")


In [None]:
## LLama model -- meta-llama/Llama-3.2-3B

In [4]:
pip install torch torchvision torchaudio


Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/6f/3d/c87b33c5f260a2a8ad68da7147e105f05868c281c63d65ed85aa4da98c66/torch-2.10.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.10.0-cp311-cp311-win_amd64.whl.metadata (31 kB)
Collecting torchvision
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/23/19/55b28aecdc7f38df57b8eb55eb0b14a62b470ed8efeb22cdc74224df1d6a/torchvision-0.25.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torchvision-0.25.0-cp311-cp311-win_amd64.whl.metadata (5.4 kB)
Collecting torchaudio
  Obtaining dependency information for torchaudio from https://files.pythonhosted.org/packages/69/26/cd2aec609b4f8918e4e85e5c6a3f569bc7b5f72a7ecba3f784077102749c/torchaudio-2.10.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torchaudio-2.10.0-cp311-cp311-win_amd64.whl.metadata (6.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Obtaining dependency infor

In [2]:
import transformers
print(transformers.__version__)


  from .autonotebook import tqdm as notebook_tqdm


4.46.3


In [1]:
import sys
print(sys.executable)


c:\Users\arun4\OneDrive\Desktop\LLM\venv\Scripts\python.exe


In [1]:
import torch
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
# Only query the device name once CUDA is confirmed; asking for device 0 on
# a machine without a usable GPU raises instead of reporting the problem.
print("GPU Name:", torch.cuda.get_device_name(0) if cuda_available else "none (CUDA not available)")


CUDA Available: True
GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [1]:
import bitsandbytes
print(bitsandbytes.__version__)



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: c:\Users\arun4\OneDrive\Desktop\LLM\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary c:\Users\arun4\OneDrive\Desktop\LLM\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...
Could not find module 'c:\Users\arun4\OneDrive\Desktop\LLM\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll' (or one of its dependencies). Try using the full path with constructor syntax.
CUDA SETUP: Loading binary c:\Users\arun4\OneDrive\Desktop\LLM\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...
Could not find module 'c:\Users\arun4\OneDrive\Desktop\LLM\venv\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll' (or one of its dependencies). Try using the full path with constructor syntax.
CUDA SETUP: Loading binary c:\Users\a

RuntimeError: 
        CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs above to fix your environment!
        If you cannot find any issues and suspect a bug, please open an issue with detals about your environment:
        https://github.com/TimDettmers/bitsandbytes/issues

In [2]:
import torch
print("Torch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)


Torch Version: 2.5.1+cu121
CUDA Available: True
CUDA Version: 12.1


Loading the model --->

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Checkpoint identifier passed to both from_pretrained calls below
model_name = "meta-llama/Llama-3.2-3B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization config (ideal for RTX 4050 6GB); compute runs in float16
# NOTE(review): this path needs the bitsandbytes package to be installed;
# the recorded run of this cell stopped with PackageNotFoundError for it.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load model on GPU; device_map="auto" leaves device placement to the library
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

print("Model successfully loaded on RTX 4050!")


  from .autonotebook import tqdm as notebook_tqdm


PackageNotFoundError: No package metadata was found for bitsandbytes