In [None]:
!pip install datasets # install the datasets module
from huggingface_hub import list_datasets

# list_datasets() yields lazily; materialise it so it can be counted and sliced.
all_datasets = [*list_datasets()]

dataset_count = len(all_datasets)
first_ten = all_datasets[:10]
print(f"there are {dataset_count} datasets currently available on the Hub")
print(f"The first 10 are: {first_ten}")

In [None]:
!pip install datasets --upgrade
from datasets import load_dataset # import load_dataset from the datasets module
# Load the dataset registered under the name "emotion"; the result is indexed
# by split name ("train", ...) in the cells below.
emotions = load_dataset("emotion")

In [None]:
emotions = {
    'train': {'features': ['text', 'label'], 'num_rows': 16000},
    'validation': {'features': ['text', 'label'], 'num_rows': 2000},
    'test': {'features': ['text', 'label'], 'num_rows': 2000}
}

In [None]:
train_ds = emotions["train"]
train_ds


In [None]:
len(train_ds)

In [None]:
train_ds['features'] # Access the value associated with the key 'features'


In [None]:

from datasets import load_dataset

emotions = load_dataset("emotion")
train_ds = emotions['train']

# Print, in order: the whole DatasetDict, the 'train' split, the split's
# features, and the first example of the split.
for summary in (emotions, train_ds, train_ds.features, train_ds[0]):
    print(summary)

In [None]:

# Print the feature definitions of the training split.
print(train_ds.features)

In [None]:
# Print the slice covering the first five entries of the training split.
print(train_ds[:5])

In [None]:
# Print the first five values of the "text" column.
print(train_ds["text"][:5])

In [None]:
import pandas as pd
# Switch the output format to pandas so that the full slice below can be used
# as a DataFrame (it is inspected with .head()).
# NOTE(review): this mutates `emotions` in place; a later cell calls
# emotions.reset_format() to undo it.
emotions.set_format(type="pandas")
df=emotions["train"][:]
df.head()

In [None]:
def label_int2str(row):
    """Translate an integer label id into its string name.

    Uses the `int2str` method of the "label" feature of the training split
    of the module-level `emotions` dataset.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)
# Add a readable "label_name" column by mapping every integer label through
# label_int2str, then preview the result.
df["label_name"]=df["label"].apply(label_int2str)
df.head()

In [None]:
import matplotlib.pyplot as plt
# Count examples per class name (smallest first) and draw a horizontal bar chart.
class_counts = df["label_name"].value_counts(ascending=True)
class_counts.plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Whitespace-split every text and store the number of resulting pieces.
words_per_tweet = df["text"].str.split().apply(len)
df["Words per Tweet"] = words_per_tweet

# One box per class name; outliers hidden, automatic super-title and x label cleared.
df.boxplot(
    column="Words per Tweet",
    by="label_name",
    grid=False,
    showfliers=False,
    color="black",
)
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Undo the set_format(type="pandas") call made earlier so `emotions` no longer
# returns pandas objects in the cells that follow.
emotions.reset_format()

In [None]:
text = "Tokenizing text is a core task of NLP."
# Character-level tokenization: every character, including spaces and the
# final period, becomes its own token.
tokenized_text = [char for char in text]
print(tokenized_text)


In [None]:
# Sorted unique tokens; a token's position in this list is its integer id.
vocabulary = sorted(set(tokenized_text))
token2idx = dict(zip(vocabulary, range(len(vocabulary))))
print(token2idx)

In [None]:
# Look up the integer id of every token, preserving token order.
input_ids = list(map(token2idx.__getitem__, tokenized_text))
print(input_ids)

In [None]:
# Toy table: three names, each paired with a consecutive integer label id.
transformer_names = ["Bumblebee", "Optimus prime", "Megatron"]
categorical_df = pd.DataFrame(
    {
        "Name": transformer_names,
        "Label ID": list(range(len(transformer_names))),
    }
)
categorical_df

In [None]:
# One-hot encode the "Name" column: one indicator column per distinct name.
pd.get_dummies(categorical_df["Name"])

In [None]:
import torch
import torch.nn.functional as F
# Convert the id list to a tensor and one-hot encode it with one class per
# entry of token2idx; the last line displays the resulting shape.
# NOTE(review): `input_ids` is rebound from a list to a tensor here, so
# re-running this cell passes a tensor (not a list) to torch.tensor.
input_ids=torch.tensor(input_ids)
one_hot_encodings=F.one_hot(input_ids,num_classes=len(token2idx))
one_hot_encodings.shape


In [None]:
# Show the first token next to its integer id and its one-hot vector,
# one item per line.
print(
    f'Token:{tokenized_text[0]}',
    f"Tensor index:{input_ids[0]}",
    f"One-hot:{one_hot_encodings[0]}",
    sep="\n",
)

In [None]:
# Word-level tokenization: split the sentence on runs of whitespace.
tokenized_text = str.split(text)
print(tokenized_text)


In [None]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer (here) and the model (loaded later).
model_ckpt="distilbert-base-uncased"
# Load the tokenizer associated with that checkpoint.
tokenizer=AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
from transformers import DistilBertTokenizer
# Load the same checkpoint through the model-specific DistilBertTokenizer class
# instead of the AutoTokenizer factory.
distilbert_tokenizer=DistilBertTokenizer.from_pretrained(model_ckpt)

In [None]:
# Run the tokenizer on the example sentence and print the resulting encoding.
encoded_text=tokenizer(text)
print(encoded_text)

In [None]:
# Map the encoded ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"])
print(tokens)

In [None]:
# Join the token list back into a single string and print it.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Display the tokenizer's vocab_size attribute.
tokenizer.vocab_size

In [None]:
# Display the tokenizer's model_max_length attribute.
tokenizer.model_max_length

In [None]:
# Display the input field names the tokenizer declares for the model; the same
# attribute is used later to filter batch keys in extract_hidden_states.
tokenizer.model_input_names

In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of `batch` with the module-level tokenizer.

    Padding and truncation are both enabled. Returns whatever the tokenizer
    call returns for the batch.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Quick check: tokenize the first two training examples and print the result.
print(tokenize(emotions["train"][:2]))

In [None]:
# Apply tokenize to every split in batched mode.
# NOTE(review): per the datasets docs, batch_size=None hands each split to
# tokenize as one batch, so padding=True pads to that split's longest text — confirm.
emotions_encoded=emotions.map(tokenize,batched=True,batch_size=None)

In [None]:
# Print the column names of the tokenized training split.
print(emotions_encoded["train"].column_names)

In [None]:
from transformers  import AutoModel
model_ckpt = "distilbert-base-uncased"

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained model for the checkpoint and move it to that device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Encode a short example sentence as PyTorch tensors (return_tensors="pt") and
# print the size of the resulting input_ids tensor.
text="this is a text"
inputs=tokenizer(text,return_tensors="pt")
print(f"Input tensor shape:{inputs['input_ids'].size()}")


In [None]:
# Move every input tensor to the same device as the model.
inputs = dict((name, tensor.to(device)) for name, tensor in inputs.items())

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

In [None]:
# Display the actual size of the last hidden state.
# Reference value recorded in the original notebook: torch.Size([1, 6, 768]).
# It is kept as a comment only: as a trailing expression it would be displayed
# in place of the real result.
outputs.last_hidden_state.size()

In [None]:
# Size after selecting index 0 along the second axis, i.e. the hidden state at
# the first token position of every sequence.
outputs.last_hidden_state[:,0].size()


In [None]:
def extract_hidden_states(batch):
    """Compute the last hidden state at token position 0 for a batch.

    Only the batch entries whose key appears in `tokenizer.model_input_names`
    are passed to the model, after being moved to `device`. The forward pass
    runs with gradients disabled.

    Returns a dict with one key, "hidden_state", holding the position-0 slice
    of the last hidden state (the [CLS] token) as a NumPy array on the CPU.
    """
    # Keep only the tensors the model accepts and place them on the model's device.
    model_inputs = {}
    for name, tensor in batch.items():
        if name not in tokenizer.model_input_names:
            continue
        model_inputs[name] = tensor.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        last_hidden_state = model(**model_inputs).last_hidden_state

    # Vector at position 0 of every sequence, converted for downstream NumPy use.
    first_token_state = last_hidden_state[:, 0]
    return {"hidden_state": first_token_state.cpu().numpy()}

In [None]:
# Have the listed columns returned as torch tensors; extract_hidden_states
# calls .to(device) on the batch values, so it needs tensors.
emotions_encoded.set_format("torch",columns=["input_ids","attention_mask","label"])

In [None]:
# Run extract_hidden_states over every split in batched mode, adding a
# "hidden_state" column (the key returned by the function).
emotions_hidden=emotions_encoded.map(extract_hidden_states,batched=True)

In [None]:
# Display the column names of the training split after hidden-state extraction.
emotions_hidden["train"].column_names

In [None]:
import numpy as np
# Splits used for fitting and for evaluation.
hidden_train = emotions_hidden["train"]
hidden_valid = emotions_hidden["validation"]

# Features are the extracted hidden states; targets are the integer labels.
X_train, X_valid = (
    np.array(split["hidden_state"]) for split in (hidden_train, hidden_valid)
)
y_train, y_valid = (
    np.array(split["label"]) for split in (hidden_train, hidden_valid)
)
X_train.shape, X_valid.shape

In [None]:
!pip install umap-learn #install umap-learn
from umap import UMAP #import UMAP from the correct package
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature into the [0, 1] range.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)

# Fit a two-component UMAP projection using the cosine metric.
mapper = UMAP(n_components=2, metric="cosine")
mapper.fit(X_scaled)

# Collect the 2D embedding coordinates together with the training labels.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"]).assign(label=y_train)
df_emb.head()

In [None]:
import matplotlib.pyplot as plt

# One hexbin panel per class, laid out on a 2x3 grid with one colormap each.
fig, axes = plt.subplots(2, 3, figsize=(7, 5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]

# Class names come from the label feature of the `emotions` training split.
labels = emotions["train"].features["label"].names

for i, (ax, label, cmap) in enumerate(zip(axes, labels, cmaps)):
    # Embedding rows whose integer label equals this panel's class index.
    df_emb_sub = df_emb[df_emb["label"] == i]
    ax.hexbin(
        df_emb_sub["X"],
        df_emb_sub["Y"],
        cmap=cmap,
        gridsize=20,
        linewidths=(0,),
    )
    ax.set_title(label)
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
# Rebuild the feature matrices and label vectors from the hidden-state dataset
# so this cell does not depend on variables left over from an earlier cell.
X_train = np.array(emotions_hidden["train"]["hidden_state"])
X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])
y_valid = np.array(emotions_hidden["validation"]["label"])

from sklearn.linear_model import LogisticRegression
# Raise max_iter well above the default so the solver has enough iterations
# to converge on these features.
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)
# Last expression of the cell: score on the validation split.
lr_clf.score(X_valid, y_valid)

In [5]:
from sklearn.dummy import DummyClassifier
# Baseline that always predicts the most frequent training class, scored on
# the validation split.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid)

NameError: name 'X_train' is not defined

In [6]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a confusion matrix normalized over the true labels.

    y_preds -- predicted labels
    y_true  -- ground-truth labels
    labels  -- display names used for the matrix axes

    Shows the figure; returns nothing.
    """
    # normalize="true" normalizes over the true (row) labels.
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels).plot(
        cmap="Blues",
        values_format=".2f",
        ax=ax,
        colorbar=False,
    )
    plt.title("Normalized confusion matrix")
    plt.show()
# Predict the validation labels with the logistic-regression classifier and
# plot them against the true labels. `labels` is the list of class names
# assigned in the embedding-plot cell earlier in the notebook.
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels)

NameError: name 'lr_clf' is not defined