In [None]:
from transformers import pipeline
import pandas as pd

In [None]:
# Text-classification pipeline; no checkpoint is named, so the library's default is used.
clf = pipeline(task="text-classification")

In [None]:
# Classify one sample sentence and display the raw pipeline result.
text = "i dont know how to operate hugging face "
output = clf(text)
output

In [None]:
# Extractive question answering: the answer is looked up inside `text`,
# which is passed as the context.
bot = pipeline(task="question-answering")
question = "what do i dont know about hugging face?"
outputs = bot(question=question, context=text)

# Wrap the single result dict in a list so it renders as a one-row table.
pd.DataFrame([outputs])

In [None]:
# Character-level tokenization: every character of the sentence becomes one token.
text = "Tokenization is easier on hugging face library"
tokenized = [character for character in text]
print(tokenized)


In [None]:
# Vocabulary: each distinct character, in sorted order, is assigned the next integer id.
_vocab = sorted(set(tokenized))
token2id = dict(zip(_vocab, range(len(_vocab))))
print(token2id)


In [None]:
# Translate the character sequence into its integer ids via the vocabulary lookup.
input_id = list(map(token2id.__getitem__, tokenized))
print(input_id)

In [None]:
# Small toy frame used to illustrate one-hot encoding with pandas.
categorical_df = pd.DataFrame(
    {
        "Name": ["mumbai", "Delhi", "paris"],
        "labels": [0, 1, 2],
    }
)
print(categorical_df)

In [None]:
# One indicator column per distinct value of the "Name" column.
pd.get_dummies(categorical_df["Name"])

In [None]:
import torch


In [None]:

# Rebind `input_id` from a Python list to a tensor, then expand each id into a
# one-hot row whose width equals the vocabulary size.
input_id = torch.tensor(input_id)
one_hot_id = torch.nn.functional.one_hot(input_id, num_classes=len(token2id))
print(one_hot_id.shape)

In [None]:
# Show the first character next to its integer id and its one-hot row.
for caption, value in (
    ("token:", tokenized[0]),
    ("token id:", input_id[0]),
    ("one hot:", one_hot_id[0]),
):
    print(caption, value)


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Encode the current `text` with the pretrained tokenizer and display the result.
encoded_text = tokenizer(text)
encoded_text

In [None]:
# Map the integer ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"])
tokens

In [None]:
# Join the tokens back into one string, then cut six characters from each end —
# the width of a leading "[CLS] " and a trailing " [SEP]".
sentence = tokenizer.convert_tokens_to_string(tokens)
sentence[len("[CLS] "):-len(" [SEP]")]

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with padding and truncation enabled.

    Returns whatever the module-level ``tokenizer`` returns for that input.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
from transformers import AutoModel

In [None]:
# Checkpoint name and compute device: CUDA when available, otherwise CPU.
model_ckpt = 'distilbert-base-uncased'
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the pretrained encoder, then move its weights to the selected device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Encode one sentence as PyTorch tensors and show the shape of the id tensor.
text = "we are learing to use pretrained models"
inputs = tokenizer(text, return_tensors="pt")
inputs["input_ids"].shape

In [None]:
# Move every input tensor to `device`, then run a single forward pass with
# gradient tracking switched off.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
# Shape of the final-layer hidden states returned by the forward pass.
outputs.last_hidden_state.shape

In [None]:
def extract_hidden_states(batch):
    """Return the first-position last-layer hidden state for every example in ``batch``.

    Only entries whose key is listed in ``tokenizer.model_input_names`` are
    forwarded to the module-level ``model``; each is moved to ``device`` first
    (GPU or CPU, depending on how ``device`` was set). The result is a dict
    with a single ``"hidden_state"`` key holding a NumPy array on the CPU.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only, so gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        encoder_output = model(**model_inputs)

    # Index 0 on the second axis: the [CLS] token position.
    cls_state = encoder_output.last_hidden_state[:, 0]
    return {"hidden_state": cls_state.cpu().numpy()}

In [None]:
from datasets import load_dataset

# Fetch the "emotion" dataset with all of its splits.
emotions = load_dataset(path="emotion")

In [None]:
# Tokenize every split. batch_size=None passes each split to `tokenize` as one
# batch, so padding=True pads all of its examples to a common length.
emotions_encoded = emotions.map(
    tokenize,
    batched=True,
    batch_size=None,
)
emotions_encoded

In [None]:
# Have the dataset return the three listed columns as torch tensors (in place).
emotions_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)


In [None]:
# Run the encoder over every split in batches, adding a "hidden_state" column.
emotions_hidden_state = emotions_encoded.map(
    extract_hidden_states,
    batched=True,
)

In [None]:
# List the columns of the train split after the hidden-state extraction.
emotions_hidden_state["train"].column_names

In [None]:
import numpy as np

In [None]:
# Collect the extracted features and the integer labels as NumPy arrays for the
# scikit-learn models trained further down.
X_train = np.array(emotions_hidden_state["train"]["hidden_state"])
X_valid = np.array(emotions_hidden_state["validation"]["hidden_state"])
y_train = np.array(emotions_hidden_state["train"]["label"])
y_valid = np.array(emotions_hidden_state["validation"]["label"])

# A literal tuple used to follow this expression (presumably pasted cell
# output). As the last expression it was what the cell displayed, hiding the
# real shapes. Value from that paste, kept for reference:
#   ((16000, 768), (2000, 768))
X_train.shape, X_valid.shape

In [None]:
import matplotlib.pyplot as plt


In [None]:
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler before projecting.
X_scaled = MinMaxScaler().fit_transform(X_train)

# UMAP is stochastic; pin the seed so the 2-D projection (and the plots built
# from it) come out the same on every Restart & Run All. Fixing the seed can
# make the fit slower because UMAP restricts parallelism to stay deterministic.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)

# One row per training example: 2-D coordinates plus the class label.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train
df_emb.head()

In [None]:
# One hexbin density panel per emotion class, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(7, 5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = emotions["train"].features["label"].names

for class_id, (ax, emotion, palette) in enumerate(zip(axes, labels, cmaps)):
    # Keep only the embedded points that belong to this class.
    points = df_emb.query(f"label == {class_id}")
    ax.hexbin(
        points["X"],
        points["Y"],
        cmap=palette,
        gridsize=20,
        linewidths=(0,),
    )
    ax.set_title(emotion)
    # Hide the ticks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the extracted features and report its
# mean accuracy on the validation split. max_iter is raised to 3000.
lr_clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)
lr_clf.score(X_valid, y_valid)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: a DummyClassifier with its default strategy, scored
# on the same validation split as the logistic regression.
dummy_clf = DummyClassifier().fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid)


In [None]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the dataset's own label feature instead of
# a hard-coded 6, so the cell stays correct if the label set changes.
num_labels = emotions["train"].features["label"].num_classes

# NOTE: this rebinds `model`, replacing the plain encoder loaded earlier;
# `extract_hidden_states` reads the module-level `model`, so calling it after
# this cell would use the classification model instead.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

Defining performance metrics

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    ``pred`` must expose ``label_ids`` (the true classes) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook; the training arguments set
# push_to_hub=True and the trainer pushes the model later on.
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments
# Fine-tuning hyper-parameters.
batch_size = 64
# Log once per pass over the training split (integer number of batches).
logging_steps = len(emotions_encoded["train"]) // batch_size
# Used as the output directory name.
model_name = f"{model_ckpt}-finetuned-emotion"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error",
)

In [None]:
from transformers import Trainer
# Wire the model, arguments, metrics and tokenized splits together, then train.
# NOTE(review): newer transformers releases rename Trainer's `tokenizer`
# argument to `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Upload the trained model to the Hub with the given commit message.
trainer.push_to_hub(commit_message="Training completed!")

In [None]:
from transformers import pipeline
# Build a text-classification pipeline from the named Hub repository.
model_id = "Aditya161205/distilbert-base-uncased-finetuned-emotion"
classifier = pipeline(task="text-classification", model=model_id)

In [None]:
# Score one custom sentence against every class.
# NOTE(review): `return_all_scores` is deprecated in recent transformers in
# favour of `top_k=None`. That form returns a flat, score-sorted list, while the
# cells below index `preds[0]` and rely on class order — switch with care.
custom = "i didnt expected this from the movie"
preds = classifier(custom, return_all_scores=True)
preds

In [None]:
import matplotlib.pyplot as plt

In [None]:
def _to_emotion(raw_label):
    """Turn a label such as "LABEL_3" into the emotion name at that index of ``labels``."""
    return labels[int(raw_label.split('_')[1])]


# Tabulate the per-class scores and swap each raw label for its emotion name.
df = pd.DataFrame(preds[0])
df['label'] = df['label'].map(_to_emotion)
df

In [None]:
def _emotion_name(raw_label):
    """Translate a default "LABEL_<id>" tag to its emotion name; pass other labels through."""
    prefix, _, class_id = raw_label.rpartition("_")
    if prefix == "LABEL" and class_id.isdigit():
        return labels[int(class_id)]
    return raw_label


# Rebuild the score table from the raw pipeline output.
df = pd.DataFrame(preds[0])

# Name each bar from its own row's label rather than pairing `labels` with the
# scores by position, so the chart stays correct if the rows arrive in a
# different order. Title and y-label let the figure stand alone.
fig, ax = plt.subplots()
ax.bar(df["label"].map(_emotion_name), 100 * df["score"])
ax.set_title(f'"{custom}"')
ax.set_ylabel("Score (%)")
plt.show()