## Combining Multiple Datasets

In [None]:
from datasets import load_dataset
import pandas as pd

# Hub repository ids of the four instruction corpora that get merged below.
ALPACA_REPO = "tatsu-lab/alpaca"
CHAT_INSTRUCTION_PROMPT_REPO = "alespalla/chatbot_instruction_prompts"
DOLLY_DATABRICKS_REPO = "databricks/databricks-dolly-15k"
LONG_FORM_REPO = "akoksal/LongForm"

# Fetch each corpus; the individual splits are picked out in a later cell.
alpaca_dataset = load_dataset(ALPACA_REPO)
chat_instruction_prompt_dataset = load_dataset(CHAT_INSTRUCTION_PROMPT_REPO)
dolly_databricks_dataset = load_dataset(DOLLY_DATABRICKS_REPO)
long_form_questions_dataset = load_dataset(LONG_FORM_REPO)

In [None]:
def _split_to_frame(dataset, split_name):
    """Return the named split of *dataset* as a pandas DataFrame."""
    return dataset[split_name].to_pandas()


# One DataFrame per split that is used from each corpus.
df_alpaca = _split_to_frame(alpaca_dataset, "train")
df_chat_instruction_prompt_train = _split_to_frame(chat_instruction_prompt_dataset, "train")
df_chat_instruction_prompt_test = _split_to_frame(chat_instruction_prompt_dataset, "test")
df_dolly_databricks = _split_to_frame(dolly_databricks_dataset, "train")
df_long_form_train = _split_to_frame(long_form_questions_dataset, "train")
df_long_form_test = _split_to_frame(long_form_questions_dataset, "test")
df_long_form_validation = _split_to_frame(long_form_questions_dataset, "validation")

# Normalise every frame to the shared "instruction" / "output" column names.
#
# Only columns whose name actually changes are listed. Identity entries such
# as {"output": "output"} leave the labels as they are, so they only add noise
# (and, for a map made up solely of them, a pointless copy of the frame).
#   - Alpaca is treated as already using "instruction" and "output", so it
#     needs no rename at all and df_alpaca is left as it is.
#   - Dolly keeps "instruction" and "context"; only "response" changes.
#   - LongForm keeps "output"; only "input" changes.
chat_prompt_columns = {"prompt": "instruction", "response": "output"}
dolly_columns = {"response": "output"}
long_form_columns = {"input": "instruction"}

df_chat_instruction_prompt_train = df_chat_instruction_prompt_train.rename(columns=chat_prompt_columns)
df_chat_instruction_prompt_test = df_chat_instruction_prompt_test.rename(columns=chat_prompt_columns)
df_dolly_databricks = df_dolly_databricks.rename(columns=dolly_columns)
df_long_form_train = df_long_form_train.rename(columns=long_form_columns)
df_long_form_test = df_long_form_test.rename(columns=long_form_columns)
df_long_form_validation = df_long_form_validation.rename(columns=long_form_columns)



In [None]:
# Add input or context columns to the end of each instruction
def add_input_context(row):
    """Return the row's instruction with any supplementary text appended.

    The "input" field is tried first and "context" second. A field is used
    only if it exists on the row, is not missing (NaN/None) and is not the
    empty string. The first usable field is appended to the instruction
    after the fixed separator; if neither qualifies, the instruction is
    returned unchanged.
    """
    for field in ('input', 'context'):
        if field not in row:
            continue
        extra_text = row[field]
        if pd.isna(extra_text) or extra_text == "":
            continue
        return row['instruction'] + "    " + "here is the input " + extra_text
    return row['instruction']

# Fold the supplementary text into the instruction column, row by row, for
# the Alpaca and Dolly frames.
for frame_with_extra_text in (df_alpaca, df_dolly_databricks):
    frame_with_extra_text['instruction'] = frame_with_extra_text.apply(add_input_context, axis=1)



In [None]:
# Write every frame to its own CSV (index column omitted) and report the
# columns each file ended up with.
csv_exports = (
    ("alpaca_dataset.csv", df_alpaca),
    ("chat_instruction_prompt_train.csv", df_chat_instruction_prompt_train),
    ("chat_instruction_prompt_test.csv", df_chat_instruction_prompt_test),
    ("dolly_databricks_dataset.csv", df_dolly_databricks),
    ("long_form_train_dataset.csv", df_long_form_train),
    ("long_form_test_dataset.csv", df_long_form_test),
    ("long_form_validation_dataset.csv", df_long_form_validation),
)

for csv_name, exported_frame in csv_exports:
    exported_frame.to_csv(csv_name, index=False)
    print(f"Saved {csv_name}. Features:", exported_frame.columns)

In [None]:
# Replace each frame's index in place with a fresh default one, discarding
# the old index, before the frames are concatenated.
for frame_to_reindex in (
    df_alpaca,
    df_chat_instruction_prompt_train,
    df_chat_instruction_prompt_test,
    df_dolly_databricks,
    df_long_form_train,
    df_long_form_test,
    df_long_form_validation,
):
    frame_to_reindex.reset_index(drop=True, inplace=True)



In [None]:
# Stack the seven frames row-wise in this fixed order; ignore_index=True
# gives the result one continuous index instead of the per-frame ones.
frames_in_order = [
    df_alpaca,
    df_chat_instruction_prompt_train,
    df_chat_instruction_prompt_test,
    df_dolly_databricks,
    df_long_form_train,
    df_long_form_test,
    df_long_form_validation,
]
df_combined = pd.concat(frames_in_order, ignore_index=True)

In [None]:
# Persist the full merged frame (every column, no index column), then
# confirm on stdout.
combined_csv_path = "combined_dataset.csv"
df_combined.to_csv(combined_csv_path, index=False)
print("Combined CSV file saved successfully.")

In [None]:
# Show the column labels of the merged frame as the cell's output.
df_combined.columns

In [None]:
# Reduce the merged frame to the two columns kept for the final dataset.
kept_columns = ['instruction', 'output']
df_combined_filtered = df_combined[kept_columns]

# Write the reduced frame (no index column), then confirm on stdout.
filtered_csv_path = "combined_dataset_filtered.csv"
df_combined_filtered.to_csv(filtered_csv_path, index=False)
print("Filtered CSV file saved successfully.")

In [None]:
# Print the column labels of the filtered frame as a quick check.
print(df_combined_filtered.columns)

In [None]:
# Read the filtered CSV back in through the `datasets` CSV loader, asking for
# the 'train' split directly.
filtered_csv_file = './combined_dataset_filtered.csv'
final_dataset = load_dataset(
    'csv',
    data_files=filtered_csv_file,
    split='train',
)

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; presumably needed
# so the push_to_hub call further down is authorised to write.
from huggingface_hub import notebook_login
# NOTE(review): load_dataset is already imported in the first cell; this
# repeat import is redundant but harmless.
from datasets import load_dataset
notebook_login()

In [None]:
# Upload the reloaded dataset to the Hub repository named below.
final_dataset.push_to_hub("CognitiveLab/English_Instruction_Combined")

In [None]:
# Show the dataset object's representation as the cell's output.
final_dataset