In [1]:
# Install the `datasets` package and `transformers` (with its `sentencepiece` extra)
# into the current runtime via a shell escape. Versions are not pinned here.
! pip install datasets transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [2]:
import torch, os, pickle
import numpy                    as np
import pandas                   as pd

from google.colab               import drive
from sklearn.preprocessing      import MultiLabelBinarizer
from datasets                   import load_from_disk

In [3]:
# Project location on Google Drive; every later path is derived from `folder_path`.
Folder_name = 'NLP_class'
folder_path = f"/content/drive/MyDrive/{Folder_name}"

# Show where the runtime starts before anything is changed.
print("Working Directory:", os.getcwd())

# Attach Google Drive to the runtime's filesystem.
drive.mount('/content/drive')

# Work from inside the project folder from here on
# (presumably so local modules and data files there can be found - confirm).
os.chdir(folder_path)
print("Updated Working Directory:", os.getcwd())

Working Directory: /content
Mounted at /content/drive
Updated Working Directory: /content/drive/MyDrive/NLP_class


In [7]:
import utils
from   utils  import df_to_DatasetDict

# Dataset

## First Dataset: Research Paper Subjects Dataset

Download `train.csv` from the following link and save it directly inside `folder_path` (the Google Drive project folder defined above): https://www.kaggle.com/datasets/shivanandmn/multilabel-classification-dataset/data?select=train.csv

In [8]:
# Locations for the Research Paper experiment, both under the project folder:
#   dataset_dir - where the dataset is stored
#   out_dir     - where result artifacts are written
dataset_dir = os.path.join(folder_path, 'ResearchPaper_dataset')
out_dir = os.path.join(folder_path, 'ResearchPaper_results')

# Create whichever of the two folders does not exist yet.
for directory in (dataset_dir, out_dir):
    os.makedirs(directory, exist_ok=True)

In [9]:
# Build the Research Paper frame used downstream: one `text` column
# (title + abstract) and one `labels` column (multi-hot float32 vector per row).

# The CSV is expected directly inside the project folder; derive the path from
# `folder_path` instead of spelling out the Drive location a second time.
data_path = os.path.join(folder_path, "train.csv")

# Single source of truth for the one-hot subject columns present in the CSV.
subject_columns = ['Computer Science', 'Physics', 'Mathematics', 'Statistics',
                   'Quantitative Biology', 'Quantitative Finance']

papers = pd.read_csv(data_path)

# Combine 'TITLE' and 'ABSTRACT' to get more context per example.
text = papers['TITLE'] + "." + papers['ABSTRACT']

# Turn each row's 0/1 subject flags into the list of subject names that are set.
# Iterating the NumPy array avoids a row-wise DataFrame.apply and does not
# depend on the position of the columns inside the frame.
label_names = [
    [name for name, flag in zip(subject_columns, flags) if flag == 1]
    for flags in papers[subject_columns].to_numpy()
]

# Fit the binarizer on the label-name lists so that it remembers the class
# names; the column order of `labels` is the one stored in `multilabel.classes_`.
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(label_names).astype('float32')

# Final frame: "text" plus "labels" (one array of 1.0/0.0 values per row).
df = text.to_frame(name='text')
df['labels'] = list(labels)

df.head()


Unnamed: 0,text,labels
0,Reconstructing Subject-Specific Effect Maps. ...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,Rotation Invariance Neural Network. Rotation ...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,Spherical polyharmonics and Poisson kernels fo...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
3,A finite element approximation for the stochas...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
4,Comparative study of Discrete Wavelet Transfor...,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0]"


In [10]:
# Split proportions for the Research Paper data. `test_size` is kept as None
# and is not passed to the helper in this cell.
train_size = 0.8
val_size   = 0.2
test_size = None

# Split `df` and store the result under `dataset_dir` using the project helper.
# `frac` and `random_state` are forwarded as-is
# (presumably the shuffle fraction and seed - confirm in utils).
df_to_DatasetDict(df, train_size, val_size,
                  dataset_dir=dataset_dir, frac=1, random_state=200)

Saving the dataset (0/1 shards):   0%|          | 0/16777 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4194 [00:00<?, ? examples/s]

### Save the MultiLabelBinarizer

In [11]:
# Persist the fitted binarizer in the results folder so the label encoding
# can be reloaded later.
binarizer_path = os.path.join(out_dir, 'multi-label-binarizer.pkl')
with open(binarizer_path, "wb") as pkl_file:
  pickle.dump(multilabel, pkl_file)

## Second Dataset: Movie Genres Dataset

The dataset is available here: https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Movie-Data.csv

In [12]:
# Locations for the Movie Genres experiment, both under the project folder:
#   dataset_dir - where the dataset is stored
#   out_dir     - where result artifacts are written
dataset_dir = os.path.join(folder_path, 'MovieGenres_dataset')
out_dir = os.path.join(folder_path, 'MovieGenres_results')

# Create whichever of the two folders does not exist yet.
for directory in (dataset_dir, out_dir):
    os.makedirs(directory, exist_ok=True)

In [13]:
# Build the Movie Genres frame: a `text` column (the movie description) and a
# `labels` column (multi-hot float32 vector per row).
data_path = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Movie-Data.csv"
usecols = ["Description", "Genre"]
df = pd.read_csv(data_path, usecols=usecols)

# 'Genre' holds comma-separated genre names; split each entry into a list.
genre_lists = df['Genre'].str.split(',')

# Fit the binarizer on the genre lists and encode every row as a float32 vector.
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(genre_lists).astype('float32')

# Replace 'Genre' with the encoded labels and rename 'Description' to 'text'.
df = (
    df.drop(columns=['Genre'])
      .rename(columns={'Description': 'text'})
      .assign(labels=list(labels))
)

df.head()

Unnamed: 0,text,labels
0,A group of intergalactic criminals are forced ...,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"Following clues to the origin of mankind, a te...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Three girls are kidnapped by a man with a diag...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"In a city of humanoid animals, a hustling thea...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ..."
4,A secret government agency recruits some of th...,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [14]:
# Split proportions for the Movie Genres data (train / validation / test).
train_size = 0.7
val_size   = 0.2
test_size  = 0.1

# Split `df` and store the result under `dataset_dir` using the project helper.
df_to_DatasetDict(df, train_size, val_size, test_size, dataset_dir=dataset_dir)

# Reload what was just written and display it as a sanity check.
dataset = load_from_disk(dataset_dir)
dataset

Saving the dataset (0/1 shards):   0%|          | 0/700 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'idx'],
        num_rows: 700
    })
    validation: Dataset({
        features: ['text', 'labels', 'idx'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'labels', 'idx'],
        num_rows: 100
    })
})

### Save the MultiLabelBinarizer

In [15]:
# Persist the fitted binarizer in the results folder so the label encoding
# can be reloaded later.
binarizer_path = os.path.join(out_dir, 'multi-label-binarizer.pkl')
with open(binarizer_path, "wb") as pkl_file:
  pickle.dump(multilabel, pkl_file)