### load Data
- airline_sentiment daset

In [1]:
# Load the dataset in Jupyter notebook:
import pandas as pd

data_tidy = pd.read_csv('tidy_1000_tweets.csv') # Load the dataset

data_tidy.head() # Display the first few rows

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,cleaned_text
0,0,5.703061e+17,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),virginamerica dhepburn say
1,1,5.703011e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),virginamerica plus add commercial experience t...
2,2,5.703011e+17,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),virginamerica not today must mean need take an...
3,3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),virginamerica really aggressive blast obnoxiou...
4,4,5.703008e+17,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),virginamerica really big bad thing


In [2]:
data=data_tidy[["airline_sentiment","cleaned_text"]]
data.head()

Unnamed: 0,airline_sentiment,cleaned_text
0,neutral,virginamerica dhepburn say
1,positive,virginamerica plus add commercial experience t...
2,neutral,virginamerica not today must mean need take an...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing


In [3]:
# Data preperation
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [4]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=512):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length",
                              max_length=self.max_len)

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': label
    }

In [5]:
# prepare tokenizer and model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased'
# device = "cuda"
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3).to(device)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### pre-processing

In [6]:
# imdb has two columns review, sentiment(positive, negative)
# tweets we have cleand_text is review, airline_sentiment(negative, neutral, positive)
X = data['cleaned_text'].tolist()

label2id = {'negative': 0, 'neutral': 1, 'positive': 2}
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}

y = data['airline_sentiment'].map(label2id).tolist()

dataset = CustomDataset(X, y, tokenizer)

In [7]:
dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

### Training

In [8]:
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

### 2nd way for args and merics

In [None]:
for name, param in model.named_parameters():
    print(f"{name} - {'Training' if param.requires_grad else 'Frozen'}")

In [10]:
# create data collator
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
def compute_metrics(eval_pred):
    load_acc = load_metric('accuracy')
    load_f1 = load_metric('f1')
    logits,labels = eval_pred
    predictions = np.argmax(logits,axis = -1)
    acc = load_acc.compute(predictions = predictions,references = labels)['accuracy']
    f1 = load_f1.compute(predictions = predictions, references = labels)['f1']
    return {'acc':acc,'f1':f1}

In [16]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir = './output/',
    learning_rate=2e-5,
    seed=11,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_steps=600,
    save_steps=600,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True
)



In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [19]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=200, training_loss=0.7903514099121094, metrics={'train_runtime': 168.3048, 'train_samples_per_second': 19.013, 'train_steps_per_second': 1.188, 'total_flos': 423903235276800.0, 'train_loss': 0.7903514099121094, 'epoch': 4.0})

In [20]:
model_name = "distilbert_finetuned_setiment2"
trainer.save_model(model_name)

### Model Testing

In [21]:
# sentiment analysis with the pipeline
from transformers import pipeline

# sentiment_pipeline = pipeline("sentiment-analysis")

# data = ['i love you', 'i hate you']
# sentiment_pipeline(data)

In [22]:
id2label

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [23]:
# load model
tok = AutoTokenizer.from_pretrained(model_name)
mod = AutoModelForSequenceClassification.from_pretrained(model_name)

In [24]:
text0 = "hate the airline"
text1 = "love the airline"
text2 = "virginamerica plus add commercial experience tacky"

pipe = pipeline('text-classification', model=mod, tokenizer=tok)
pipe(text2)

Device set to use cuda:0


[{'label': 'LABEL_2', 'score': 0.5791122317314148}]

In [26]:
text0 = "hate the airline"
# text1 = "love the airline"
# text2 = "virginamerica plus add commercial experience tacky"

pipe = pipeline('text-classification', model=mod, tokenizer=tok)
pipe(text0)

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.6335480809211731}]

In [27]:
text1 = "virginamerica dhepburn say"
# text1 = "love the airline"
# text2 = "virginamerica plus add commercial experience tacky"

pipe = pipeline('text-classification', model=mod, tokenizer=tok)
pipe(text1)

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.5199728608131409}]