In [11]:
from typing import Dict

import pandas as pd
import seaborn as sns
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer

In [None]:
# Registry of the sentiment corpora under inspection, keyed by a short alias.
# Every load_dataset call below omits `split=`, so each value is a DatasetDict
# (split name -> Dataset) rather than a single Dataset; a later cell relies on
# this by indexing datasets['ds_imdb']["train"].
datasets: Dict[str, DatasetDict] = {}

datasets['ds_imdb'] = load_dataset("imdb")
datasets['sst'] = load_dataset("sst")
datasets['tweet'] = load_dataset("tweet_eval", "sentiment")
datasets['rotten'] = load_dataset("rotten_tomatoes")
datasets['amazon'] = load_dataset("amazon_reviews_multi", "en")  # license note: must never be used commercially
datasets['yelp'] = load_dataset("yelp_review_full")  # license note: must never be used commercially
datasets['financ'] = load_dataset("financial_phrasebank", "sentences_75agree")  # license note: a commercial license must be requested
datasets['amzpol'] = load_dataset("amazon_polarity")
datasets['movrat'] = load_dataset("movie_rationales")
datasets['multisen'] = load_dataset("tyqiangz/multilingual-sentiments", "english")  # TODO: check that it differs from the other free Amazon dataset
datasets['cyrpol'] = load_dataset("CyranoB/polarity")

Found cached dataset imdb (/home/alberto/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

No config specified, defaulting to: sst/default
Found cached dataset sst (/home/alberto/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

To understand what each dataset is composed of, compute the maximum, minimum, mean, and median text length (measured in tokens) for each dataset.

In [None]:
# Maximum sequence length registered on both tokenizers via `model_max_length`.
max_num_tokens = 512

# Build both tokenizers with identical settings; the XLM-R checkpoint is loaded
# first, then the XLNet one.
# NOTE(review): `truncation` is normally an argument of the tokenizer *call*;
# passing it to from_pretrained presumably does not truncate later calls — confirm.
tokenizer_xlm, tokenizer_xlnet = (
    AutoTokenizer.from_pretrained(checkpoint, model_max_length=max_num_tokens, truncation=True)
    for checkpoint in ("xlm-roberta-base", "xlnet-large-cased")
)

In [None]:
# Convert the IMDB train split to a pandas DataFrame. Dataset.to_pandas() converts
# the underlying table directly, whereas pd.DataFrame.from_dict(<Dataset>) falls
# back to iterating the dataset row by row in Python to build the same frame.
df = datasets['ds_imdb']["train"].to_pandas()

In [None]:

# Tokenize every text once per tokenizer; each cell of the new columns holds the
# object returned by calling the tokenizer on that single text.
raw_texts = df['text']
df['tokenized_text_xlm'] = raw_texts.apply(tokenizer_xlm)
df['tokenized_text_xlnet'] = raw_texts.apply(tokenizer_xlnet)


In [None]:
def get_num_tokens(tokens):
    """Return the number of token ids in a single tokenized text.

    Args:
        tokens: The result of calling a tokenizer on one text. When it exposes a
            non-None ``encodings`` attribute, the ids of the first encoding are
            counted. Otherwise (``encodings`` missing or ``None``, which is what
            slow tokenizers produce) ``tokens["input_ids"]`` is counted.

    Returns:
        int: The number of token ids.

    Note:
        The ``input_ids`` fallback assumes ``tokens`` encodes a single text, not
        a batch; for a batch it would return the batch size.
    """
    encodings = getattr(tokens, "encodings", None)
    if encodings is not None:
        return len(encodings[0].ids)
    # No backend encodings available: count the ids stored on the mapping itself.
    return len(tokens["input_ids"])

In [None]:
# Per-text token count derived from the XLM-R tokenization column.
df['number_tokens_xlm'] = df['tokenized_text_xlm'].map(get_num_tokens)

In [None]:
# Histogram (50 bins, with a KDE overlay) of the per-text token counts. The title
# and axis labels let the figure stand alone; the trailing `;` suppresses the
# repr that would otherwise be printed under the cell.
ax = sns.histplot(data=df, x="number_tokens_xlm", bins=50, kde=True)
ax.set(
    title="IMDB train split: tokens per text (xlm-roberta-base)",
    xlabel="Number of tokens",
    ylabel="Number of texts",
);

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the token counts,
# computed on the single column of interest.
token_counts = df['number_tokens_xlm']
stats_token_counts = token_counts.describe()
# Add an explicitly named 'median' row alongside the '50%' row from describe().
stats_token_counts['median'] = token_counts.median()

In [None]:
# Display the token-count summary (describe() rows plus the added 'median' row).
stats_token_counts