In [1]:
!pip install datasets
!pip install num2words
!pip install beautifulsoup4
!pip install contractions

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Installing collected p

In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import string
import re
from num2words import num2words
from bs4 import BeautifulSoup
import contractions


# SST2

In [3]:
# Load SST2 dataset and sample
sst2_dataset = load_dataset('glue', 'sst2')


# Data Fields
# idx: Monotonically increasing index ID.
# sentence: Complete sentence expressing an opinion about a film.
# label: Sentiment of the opinion, either "negative" (0) or positive (1). The test set labels are hidden (-1).

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [4]:
sst2_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # Handle contractions using the contractions library
    text = contractions.fix(text)

    # Convert numbers to words
    text = re.sub(r'\d+', lambda x: num2words(x.group()), text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text


# Function to create, preprocess, and save a dataset of a specific size
def create_and_save_dataset(dataset, size, file_name, set_type='train'):
    sample = dataset[set_type].shuffle(seed=42).select(range(size))
    df = pd.DataFrame(sample, columns=['sentence', 'label'])

    # Rename 'sentence' column to 'text' and preprocess text
    df = df.rename(columns={'sentence': 'text'})
    df['text'] = df['text'].apply(preprocess_text)

    df.to_csv(file_name, index=False)

# Create and save preprocessed datasets of different sizes
create_and_save_dataset(sst2_dataset, 50, 'sst2_50.csv')
create_and_save_dataset(sst2_dataset, 100, 'sst2_100.csv')
create_and_save_dataset(sst2_dataset, 500, 'sst2_500.csv')
create_and_save_dataset(sst2_dataset, 1000, 'sst2_1000.csv')

# Preprocess and save the entire test or validation set
# Check which one is available (test or validation)
# set_type = 'validation' if 'validation' in sst2_dataset else 'test'
set_type = 'validation'
full_size = len(sst2_dataset[set_type])
create_and_save_dataset(sst2_dataset, full_size, f'sst2_full_test.csv', set_type=set_type)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [6]:
df_sst2_test = pd.read_csv('sst2_full_test.csv')

In [7]:
df_sst2_test

Unnamed: 0,text,label
0,it gets onto the screen just about as much of ...,1
1,my big fat greek wedding uses stereotypes in a...,1
2,for the most part director annesophie birot s...,1
3,cq s reflection of artists and the love of cin...,1
4,charles entertaining film chronicles seinfeld...,1
...,...,...
867,the character of zigzag is not sufficiently de...,0
868,how do you spell cliché,0
869,i ve always dreamed of attending cannes but a...,0
870,nine queens is not only than a frighteningly c...,1


# IMDB

In [13]:
# Load IMDB dataset and sample
imdb_dataset = load_dataset('imdb')


# data fields
# text: a string feature.
# label: a classification label, with possible values including neg (0), pos (1).

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [14]:
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [15]:
imdb_dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [16]:
# Function to create, preprocess, and save a dataset of a specific size
def create_and_save_dataset(dataset, size, file_name, set_type='train'):
    sample = dataset[set_type].shuffle(seed=42).select(range(size))
    df = pd.DataFrame(sample, columns=['text', 'label'])

    df['text'] = df['text'].apply(preprocess_text)
    df.to_csv(file_name, index=False)

# Create and save preprocessed datasets of different sizes
create_and_save_dataset(imdb_dataset, 50, 'imdb_50.csv')
create_and_save_dataset(imdb_dataset, 100, 'imdb_100.csv')
create_and_save_dataset(imdb_dataset, 500, 'imdb_500.csv')
create_and_save_dataset(imdb_dataset, 1000, 'imdb_1000.csv')

# Preprocess and save the entire test or validation set
# Check which one is available (test or validation)
# set_type = 'validation' if 'validation' in sst2_dataset else 'test'
set_type = 'test'
full_size = len(imdb_dataset[set_type])
create_and_save_dataset(imdb_dataset, full_size, f'imdb_full_test.csv', set_type=set_type)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [17]:
create_and_save_dataset(imdb_dataset, 200, f'imdb_200_test.csv', set_type=set_type)

# Yahoo

In [18]:
yahoo_dataset = load_dataset('yahoo_answers_topics')


# classes:
# Society & Culture
# Science & Mathematics
# Health
# Education & Reference
# Computers & Internet
# Sports
# Business & Finance
# Entertainment & Music
# Family & Relationships
# Politics & Government

Downloading data:   0%|          | 0.00/336M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [19]:
yahoo_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 1400000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 60000
    })
})

In [20]:
yahoo_dataset["train"][0]

{'id': 0,
 'topic': 4,
 'question_title': "why doesn't an optical mouse work on a glass table?",
 'question_content': 'or even on some surfaces?',
 'best_answer': 'Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or other very shiny, uniform surfaces interfere with the ability of the DSP to accurately analyze the surface beneath the mouse.  \\nSince glass is transparent and very uniform, the mouse is unable to pick up enough imperfections in the underlying surface to determine motion.  Mirrored surfaces are also a problem, since they constantly reflect back the same image, causing the DSP not to recognize motion properly. When the system is unable to see surface changes associated with movement, the mouse will not work properly.'}

In [21]:
# Function to create, preprocess, and save a dataset of a specific size
def create_and_save_dataset(dataset, size, file_name, set_type='train'):
    sample = dataset[set_type].shuffle(seed=42).select(range(size))
    df = pd.DataFrame(sample, columns=['topic', 'best_answer'])

    # Rename 'sentence' column to 'text' and preprocess text
    df = df.rename(columns={'topic': 'label'})

    # Rename 'sentence' column to 'text' and preprocess text
    df = df.rename(columns={'best_answer': 'text'})

    df['text'] = df['text'].apply(preprocess_text)
    df.to_csv(file_name, index=False)

# Create and save preprocessed datasets of different sizes
create_and_save_dataset(yahoo_dataset, 50, 'yahoo_50.csv')
create_and_save_dataset(yahoo_dataset, 100, 'yahoo_100.csv')
create_and_save_dataset(yahoo_dataset, 500, 'yahoo_500.csv')
create_and_save_dataset(yahoo_dataset, 1000, 'yahoo_1000.csv')

# Preprocess and save the entire test or validation set
# Check which one is available (test or validation)
# set_type = 'validation' if 'validation' in sst2_dataset else 'test'
set_type = 'test'
full_size = len(yahoo_dataset[set_type])
create_and_save_dataset(yahoo_dataset, 500, f'yahoo_500_test.csv', set_type=set_type)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [22]:
create_and_save_dataset(yahoo_dataset, 200, f'yahoo_200_test.csv', set_type=set_type)

  soup = BeautifulSoup(text, "html.parser")


# 20 NewsGroups

In [30]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')


# classes:
#  'alt.atheism',
#  'comp.graphics',
#  'comp.os.ms-windows.misc',
#  'comp.sys.ibm.pc.hardware',
#  'comp.sys.mac.hardware',
#  'comp.windows.x',
#  'misc.forsale',
#  'rec.autos',
#  'rec.motorcycles',
#  'rec.sport.baseball',
#  'rec.sport.hockey',
#  'sci.crypt',
#  'sci.electronics',
#  'sci.med',
#  'sci.space',
#  'soc.religion.christian',
#  'talk.politics.guns',
#  'talk.politics.mideast',
#  'talk.politics.misc',
#  'talk.religion.misc'

In [31]:
# Print the available keys
print(list(newsgroups_train.keys()))

# Examples of accessing different parts:
print(newsgroups_train.data[0])       # First text post
print(newsgroups_train.target_names)  # Names of newsgroups
print(newsgroups_train.target[0])     # Target label of the first post

['data', 'filenames', 'target_names', 'target', 'DESCR']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 

In [32]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [33]:
def create_and_save_dataset(size, file_name, subset='train'):
    # Fetch the dataset
    dataset = fetch_20newsgroups(subset=subset)

    # Convert to DataFrame
    df = pd.DataFrame({'text': dataset.data, 'label': dataset.target})

    # Sample the dataset
    df_sampled = df.sample(n=size, random_state=42)

    # Map numerical labels to actual topic names
    df_sampled['label'] = df_sampled['label'].apply(lambda x: dataset.target_names[x])

    # Preprocess text (you need to define the preprocess_text function)
    df_sampled['text'] = df_sampled['text'].apply(preprocess_text)

    # Save to CSV
    df_sampled.to_csv(file_name, index=False)


# Create and save preprocessed datasets of different sizes
create_and_save_dataset(50, 'newsgroups_50.csv')
create_and_save_dataset(100, 'newsgroups_100.csv')
create_and_save_dataset(500, 'newsgroups_500.csv')
create_and_save_dataset(1000, 'newsgroups_1000.csv')

set_type = 'test'
test_dataset = fetch_20newsgroups(subset=set_type)
create_and_save_dataset(500, f'newsgroups_500_test.csv', subset=set_type)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [34]:
create_and_save_dataset(200, f'newsgroups_200_test.csv', subset=set_type)