# DATASET PREPARATION

In [17]:
from datasets import load_dataset,DatasetDict
from dotenv import load_dotenv, find_dotenv
import os

# Locate a .env file and load its entries into the process environment.
load_dotenv(dotenv_path=find_dotenv())

# Credentials come from the environment, never from the notebook itself.
# Each value is None when the corresponding variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")

### Evaluations and Datasets
List of datasets:






### 1. **[LexGLUE SCOTUS](https://huggingface.co/datasets/coastalcph/lex_glue/viewer/scotus)**

In [1]:
# LexGLUE benchmark, SCOTUS configuration; no split is requested, so every split is loaded.
dataset = load_dataset(path="coastalcph/lex_glue", name="scotus")

In [2]:
# Show the loaded object's repr: split names, feature columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
})

### 2. **[LexGLUE Ledgar](https://huggingface.co/datasets/coastalcph/lex_glue/viewer/ledgar)**

In [149]:
# LexGLUE benchmark, LEDGAR configuration; no split is requested, so every split is loaded.
dataset = load_dataset(path="coastalcph/lex_glue", name="ledgar")

Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [150]:
# Show the loaded object's repr: split names, feature columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})

### 3. **[Patent Classification](https://huggingface.co/datasets/ccdv/patent-classification/viewer/abstract)**

In [6]:
# Patent classification corpus, "abstract" configuration; every available split is loaded.
dataset = load_dataset(path="ccdv/patent-classification", name="abstract")

In [7]:
# Show the loaded object's repr: split names, feature columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

### 4. **[Twitter Disaster Tweets](https://huggingface.co/datasets/mehdiiraqui/twitter_disaster)**

In [18]:
# Concatenate the published train and test splits into one Dataset so it can be
# re-partitioned into train/validation/test below.
dataset = load_dataset(path="mehdiiraqui/twitter_disaster", split="train+test")

In [19]:
# Normalise the schema to the ('text', 'label') layout used by the other datasets.
# Both steps are guarded on the current column names so that re-running this cell
# is a no-op instead of raising because 'target' was already renamed or the
# extra columns were already dropped.
if "target" in dataset.column_names:
    dataset = dataset.rename_column("target", "label")

unused_columns = [col for col in ('id', 'keyword', 'location') if col in dataset.column_names]
if unused_columns:
    dataset = dataset.remove_columns(unused_columns)

dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 10876
})

In [20]:
# 80/10/10 partition: hold out 20% of the rows, then halve the held-out part
# into validation and test.
# train_test_split shuffles before splitting, so a fixed seed is required for the
# published splits to be reproducible on a fresh kernel.
temp = dataset.train_test_split(test_size=0.2, seed=42)
val_test = temp['test'].train_test_split(test_size=0.5, seed=42)
dataset_dict = DatasetDict({
    'train': temp['train'],
    'validation': val_test['train'],
    'test': val_test['test']
})

# Sanity check: the three splits together must account for every input row.
assert sum(len(split) for split in dataset_dict.values()) == len(dataset)

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8700
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1088
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1088
    })
})

In [21]:
# Upload all splits to the Hub repository. This is a remote side effect:
# each execution creates a new commit in the target dataset repo.
dataset_dict.push_to_hub(repo_id="MAdAiLab/twitter_disaster")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/669 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MAdAiLab/twitter_disaster/commit/b83c0f112d9a75490d9e21f99ac63fe301ecd228', commit_message='Upload dataset', commit_description='', oid='b83c0f112d9a75490d9e21f99ac63fe301ecd228', pr_url=None, pr_revision=None, pr_num=None)

### 5. **[Amazon Product Reviews (AttrPrompt)](https://huggingface.co/datasets/yyu/amazon-attrprompt)**

In [22]:
# Merge the published train, validation and test splits into one Dataset so it
# can be re-partitioned below.
# NOTE: a previously commented-out alternative source here was
# AmazonScience/massive with the "en-US" configuration.
dataset = load_dataset(path="yyu/amazon-attrprompt", split="train+validation+test")

In [23]:
# Expose the '_id' column under the name 'label' to match the ('text', 'label')
# schema of the other datasets.
# Guarded so that re-running this cell is a no-op instead of raising because
# '_id' no longer exists after the first rename.
if "_id" in dataset.column_names:
    dataset = dataset.rename_column("_id", "label")

dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 15180
})

In [24]:
# Hold out 20% of the merged data; the held-out part is divided into validation
# and test in the next step.
# train_test_split shuffles before splitting, so a fixed seed keeps the partition
# reproducible across kernel restarts.
temp = dataset.train_test_split(test_size=0.2, seed=42)
temp

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 12144
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 3036
    })
})

In [25]:
# Halve the held-out 20% into two equal parts (used as validation and test).
# A fixed seed keeps this shuffled split reproducible across kernel restarts.
val_test = temp['test'].train_test_split(test_size=0.5, seed=42)
val_test

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1518
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1518
    })
})

In [26]:
# Assemble the final train/validation/test layout from the two-stage split:
# the first split's 'train' part is kept as-is, and the two halves of the
# held-out part become validation and test respectively.
dataset_dict = DatasetDict(
    dict(
        train=temp["train"],
        validation=val_test["train"],
        test=val_test["test"],
    )
)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 12144
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1518
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1518
    })
})

In [27]:
# Upload all splits to the Hub repository. This is a remote side effect:
# each execution creates a new commit in the target dataset repo.
dataset_dict.push_to_hub(repo_id="MAdAiLab/amazon-attrprompt")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/539 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MAdAiLab/amazon-attrprompt/commit/1eb259ad9eaae624dd67fc93879220d6e6223b12', commit_message='Upload dataset', commit_description='', oid='1eb259ad9eaae624dd67fc93879220d6e6223b12', pr_url=None, pr_revision=None, pr_num=None)