In [1]:
# alpaca 
# OpenAssistant 

#### Loading dataset


In [3]:
from datasets import load_dataset

dataset = load_dataset("tatsu-lab/alpaca")

print(dataset)


Found cached dataset parquet (C:/Users/arun4/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


In [None]:
print(dataset["train"][0])



{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


In [None]:
keywords = [
    "data science",
    "machine learning",
    "deep learning",
    "artificial intelligence",
    "neural network",
    "statistics",
    "probability",
    "python",
    "pandas",
    "numpy",
    "sql",
    "nlp",
    "computer vision",
    "regression",
    "classification",
    "clustering",
    "model",
    "overfitting",
    "underfitting",
    "gradient descent",
    "interview",
    "data analysis",
    "feature engineering",
    "data preprocessing",
    "data visualization",
    "decision tree",
    "random forest",
    "xgboost",
    "transformer",
    "llm",
    "mlops",
    "hyperparameter"
]



In [None]:
def filter_ds_related(example):
    instruction = example["instruction"].lower()
    
    return any(keyword in instruction for keyword in keywords)


In [None]:
train_data = dataset["train"]

filtered_data = train_data.filter(filter_ds_related)


Filter:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
print("Original size:", len(train_data))
print("Filtered size:", len(filtered_data))


Original size: 52002
Filtered size: 2226


In [None]:
filtered_data

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 2226
})

##### cleaning the filtered data set  -->
Remove noisy records

Fix formatting issues

Remove very short/very long samples

Remove bad quality answers

Standardize text

In [None]:
import re

def basic_clean(example):
    def clean_text(text):
        text = text.strip()
        text = re.sub(r"\s+", " ", text)   # remove extra spaces
        return text

    return {
        "instruction": clean_text(example["instruction"]),
        "input": clean_text(example["input"]) if example["input"] else "",
        "output": clean_text(example["output"])
    }


In [None]:
cleaned = filtered_data.map(basic_clean)


Map:   0%|          | 0/2226 [00:00<?, ? examples/s]

In [None]:
## Remove Low Quality Records

def remove_empty(example):
    return len(example["instruction"]) > 10 and len(example["output"]) > 10

cleaned = cleaned.filter(remove_empty)


Filter:   0%|          | 0/2190 [00:00<?, ? examples/s]

In [None]:
## Remove Very Short or Very Long Samples

def length_filter(example):
    total_len = len(example["instruction"]) + len(example["output"])
    return 30 < total_len < 2000


In [None]:
cleaned = cleaned.filter(length_filter)


Filter:   0%|          | 0/2198 [00:00<?, ? examples/s]

In [None]:
## Remove Duplicates
import pandas as pd
from datasets import Dataset

# Convert to pandas
df = cleaned.to_pandas()

# Remove duplicate rows based on instruction + output
df = df.drop_duplicates(subset=["instruction", "output"])

# Convert back to HuggingFace dataset
cleaned_dataset = Dataset.from_pandas(df)

print("After deduplication:", len(cleaned_dataset))


After deduplication: 2190


In [None]:
## Create Final Prompt Format

def create_prompt(example):
    if example["input"]:
        text = f"""### Instruction:
{example["instruction"]}

### Input:
{example["input"]}

### Response:
{example["output"]}"""
    else:
        text = f"""### Instruction:
{example["instruction"]}

### Response:
{example["output"]}"""

    return {"text": text}

final_cleaned = cleaned.map(create_prompt)


Map:   0%|          | 0/2190 [00:00<?, ? examples/s]

In [None]:
print("Final cleaned size:", len(final_cleaned))
print(final_cleaned[0])


Final cleaned size: 2190
{'instruction': 'Render a 3D model of a house', 'input': '', 'output': '<nooutput> This type of instruction cannot be fulfilled by a GPT model.', 'text': '### Instruction:\nRender a 3D model of a house\n\n### Response:\n<nooutput> This type of instruction cannot be fulfilled by a GPT model.'}


In [None]:
## Remove Non-Textual / Non-Answerable Tasks

bad_keywords = [
    "render",
    "draw",
    "paint",
    "image of",
    "picture of",
    "3d model",
    "generate an image",
    "create a video",
    "audio file",
    "physical",
    "real world action"
]

def remove_bad_tasks(example):
    text = example["instruction"].lower()
    return not any(k in text for k in bad_keywords)


In [None]:
cleaned = cleaned.filter(remove_bad_tasks)

print("After removing non-text tasks:", len(cleaned))


Filter:   0%|          | 0/2190 [00:00<?, ? examples/s]

After removing non-text tasks: 2179


In [None]:
dataset.save_to_disk("alpaca_ds_fully_cleaned")


Saving the dataset (0/1 shards):   0%|          | 0/52002 [00:00<?, ? examples/s]

Loading open-assistant dataset -->


In [4]:
from datasets import load_dataset
import pandas as pd

# Load OpenAssistant dataset
dataset = load_dataset("OpenAssistant/oasst1")

print(dataset)


Found cached dataset parquet (C:/Users/arun4/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})


In [5]:
print(dataset["train"][0])
print(dataset["train"][1])
print(dataset["train"][2])


{'message_id': '6ab24d72-0181-4594-a9cd-deaf170242fb', 'parent_id': None, 'user_id': 'c3fe8c76-fc30-4fa7-b7f8-c492f5967d18', 'created_date': '2023-02-05T14:23:50.983374+00:00', 'text': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.', 'role': 'prompter', 'lang': 'en', 'review_count': 3, 'review_result': True, 'deleted': False, 'rank': None, 'synthetic': False, 'model_name': None, 'detoxify': {'toxicity': 0.00044308538781479, 'severe_toxicity': 3.252684837207198e-05, 'obscene': 0.00023475120542570949, 'identity_attack': 0.0001416115992469713, 'insult': 0.00039489680784754455, 'threat': 4.075629112776369e-05, 'sexual_explicit': 2.712695459194947e-05}, 'message_tree_id': '6ab24d72-0181-4594-a9cd-deaf170242fb', 'tree_state': 'ready_for_export', 'emojis': {'name': ['+1', '_skip_reply', '_skip_ranking'], 'count': [10, 1, 4]}, 'labels': {'name': [

In [6]:
roles = set(dataset["train"]["role"])
print(roles)


{'prompter', 'assistant'}


In [7]:
for i in range(10):
    print(i, dataset["train"][i]["role"])


0 prompter
1 assistant
2 prompter
3 assistant
4 prompter
5 assistant
6 prompter
7 assistant
8 assistant
9 assistant


In [10]:
data = dataset["train"]  
alpaca_style = []

for i in range(len(data) - 1):
    current = data[i]
    next_msg = data[i + 1]

    # Only take valid prompter â†’ assistant pairs
    if current["role"] == "prompter" and next_msg["role"] == "assistant":
        alpaca_style.append({
            "instruction": current["text"].strip(),
            "input": "",
            "output": next_msg["text"].strip()
        })


In [11]:
# convert to hugging face dataset 
from datasets import Dataset

oa_dataset = Dataset.from_list(alpaca_style)

print(oa_dataset)


Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 27904
})


In [12]:
keywords = [
    "data science",
    "machine learning",
    "deep learning",
    "artificial intelligence",
    "neural network",
    "statistics",
    "probability",
    "python",
    "pandas",
    "numpy",
    "sql",
    "nlp",
    "computer vision",
    "regression",
    "classification",
    "clustering",
    "model",
    "overfitting",
    "underfitting",
    "gradient descent",
    "interview",
    "data analysis",
    "feature engineering",
    "data preprocessing",
    "data visualization",
    "decision tree",
    "random forest",
    "xgboost",
    "transformer",
    "llm",
    "mlops",
    "hyperparameter"
]

filtering rows for data science domain 


In [14]:
def filter_ds_related(example):
    text = example["instruction"].lower()
    return any(keyword in text for keyword in keywords)

oa_filtered = oa_dataset.filter(filter_ds_related)

print("Rows after DS filtering:", len(oa_filtered))


Filter:   0%|          | 0/27904 [00:00<?, ? examples/s]

Rows after DS filtering: 1165


In [15]:
## basic cleaning - remove extra space and normalize text 

import re

def basic_clean(example):
    def normalize(text):
        text = text.strip()
        text = re.sub(r"\s+", " ", text)   # remove multiple spaces
        return text

    return {
        "instruction": normalize(example["instruction"]),
        "input": "",
        "output": normalize(example["output"])
    }

oa_cleaned = oa_filtered.map(basic_clean)


Map:   0%|          | 0/1165 [00:00<?, ? examples/s]

In [16]:
## remove very short or use less rows

def length_filter(example):
    return len(example["instruction"]) > 15 and len(example["output"]) > 30

oa_cleaned = oa_cleaned.filter(length_filter)

print("After length filtering:", len(oa_cleaned))


Filter:   0%|          | 0/1165 [00:00<?, ? examples/s]

After length filtering: 1139


In [17]:
## remove Non- Interview Style Content 

bad_patterns = ["joke", "story", "poem", "translate", "email", "lyrics"]

def remove_noise(example):
    text = example["instruction"].lower()
    return not any(p in text for p in bad_patterns)

oa_cleaned = oa_cleaned.filter(remove_noise)

print("After noise removal:", len(oa_cleaned))


Filter:   0%|          | 0/1139 [00:00<?, ? examples/s]

After noise removal: 1108


In [18]:
## Create Final training prompt feild 

def create_prompt(example):
    text = f"""### Instruction:
{example['instruction']}

### Response:
{example['output']}"""
    return {"text": text}

oa_final = oa_cleaned.map(create_prompt)


Map:   0%|          | 0/1108 [00:00<?, ? examples/s]

In [19]:
print("Final cleaned OpenAssistant rows:", len(oa_final))
print(oa_final[0])


Final cleaned OpenAssistant rows: 1108
{'instruction': 'Can you explain contrastive learning in machine learning in simple terms for someone new to the field of ML?', 'input': '', 'output': 'Sure! Let\'s say you want to build a model which can distinguish between images of cats and dogs. You gather your dataset, consisting of many cat and dog pictures. Then you put them through a neural net of your choice, which produces some representation for each image, a sequence of numbers like [0.123, 0.045, 0.334, ...]. The problem is, if your model is unfamiliar with cat and dog images, these representations will be quite random. At one time a cat and a dog picture could have very similar representations (their numbers would be close to each other), while at others two cat images may be represented far apart. In simple terms, the model wouldn\'t be able to tell cats and dogs apart. This is where contrastive learning comes in. The point of contrastive learning is to take pairs of samples (in thi

In [20]:
## saving the cleaned dataset 
oa_final.save_to_disk("openassistant_ds_cleaned")


Saving the dataset (0/1 shards):   0%|          | 0/1108 [00:00<?, ? examples/s]