In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install wandb nltk
!pip install tranformers datasets

In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import os
import wandb

2024-08-04 07:39:52.141958: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 07:39:52.142091: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 07:39:52.269688: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def print_gpu_info():
    """Print a short summary of CUDA device 0, or a notice when no GPU is usable.

    With CUDA available the device name, its total memory in decimal gigabytes
    and the CUDA version PyTorch reports are printed; otherwise a single
    CPU-fallback line is printed. Returns None in both cases.
    """
    if not torch.cuda.is_available():
        print("No GPU available. Using CPU.")
        return

    device_index = 0
    device_label = torch.cuda.get_device_name(device_index)
    device_props = torch.cuda.get_device_properties(device_index)
    total_gb = device_props.total_memory / 1e9  # bytes -> GB (decimal)

    print(f"GPU: {device_label}")
    print(f"Total GPU Memory: {total_gb:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
    

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import os
import wandb
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources that base_model_trainer relies on: the English
# stop-word list and the "punkt" models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

# Credentials must not live in the notebook source. The key is read from the
# WANDB_API_KEY environment variable (for example populated from a Kaggle secret);
# when the variable is unset, key=None leaves wandb to use its own credential lookup.
# NOTE(review): the key that was previously hardcoded here has been exposed with the
# notebook and should be revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

def base_model_trainer(model_name, model_path, max_length=128, seed=42):
    """Fine-tune a sequence-classification checkpoint on GLUE SST-2 and report metrics.

    The dataset is shuffled, English stop words are stripped from the training
    sentences, the model is trained for three epochs with the Hugging Face
    ``Trainer``, its weights are written to ``./models/{model_name}_sst2.pth`` and
    accuracy / weighted F1 on the validation split are printed.

    Args:
        model_name: Short label used in the output and logging directory names,
            the saved weights file name and the key of the results dict.
        model_path: Checkpoint identifier or path handed to ``from_pretrained``
            for both the model and the tokenizer.
        max_length: Length every sentence is padded or truncated to when tokenized.
        seed: Seed used for shuffling the dataset and for the ``Trainer`` run.

    Returns:
        tuple: ``(trained_model, tokenizer, results)`` where ``results`` maps
        ``"{model_name}_sst2"`` to ``{"accuracy": float, "f1": float}``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load SST-2 dataset
    dataset = load_dataset("glue", "sst2")
    dataset = dataset.shuffle(seed=seed)

    stop_words = set(stopwords.words('english'))

    def remove_stop_words(text):
        """Return ``text`` with English stop words removed, tokens re-joined by single spaces."""
        words = word_tokenize(text)
        filtered_words = [word for word in words if word.lower() not in stop_words]
        filtered_text = ' '.join(filtered_words)
        return filtered_text

    # NOTE(review): only the train split is filtered; the validation split below is
    # scored on unfiltered sentences. NLTK's English list also appears to include
    # negations such as "not", which matter for sentiment -- confirm this is intended.
    dataset['train'] = dataset['train'].map(lambda example: {'sentence': remove_stop_words(example['sentence'])})

    def benchmark_model(model_name, model_path, dataset):
        """Train on ``dataset['train']``, save the weights, and score ``dataset['validation']``.

        Returns ``(accuracy, f1, model, tokenizer)``.
        """
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        def tokenize_function(examples):
            # Fixed-length padding: every example becomes exactly max_length tokens.
            return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=max_length)

        tokenized_dataset = dataset.map(tokenize_function, batched=True)

        training_args = TrainingArguments(
            output_dir=f"./results/{model_name}_sst2",
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=600,
            weight_decay=0.01,
            logging_dir=f"./logs/{model_name}_sst2",
            logging_steps=100,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            report_to="wandb",
            # Forward the caller's seed so it governs training as well as the
            # dataset shuffle. Previously TrainingArguments silently used its own
            # default (42), which made any non-default ``seed`` ineffective here.
            seed=seed,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["validation"],
        )

        trainer.train()

        # Persist only the weights (state dict); the architecture is rebuilt from
        # model_path when the file is loaded again.
        save_path = f"./models/{model_name}_sst2.pth"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        torch.save(model.state_dict(), save_path)
        print(f"Model saved to {save_path}")

        predictions = trainer.predict(tokenized_dataset["validation"])
        preds = predictions.predictions.argmax(-1)  # class index with the highest logit
        labels = predictions.label_ids

        accuracy = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average='weighted')

        return accuracy, f1, model, tokenizer

    print(f"Benchmarking {model_name} on SST-2")
    accuracy, f1, trained_model, tokenizer = benchmark_model(model_name, model_path, dataset)

    results = {
        f"{model_name}_sst2": {"accuracy": accuracy, "f1": f1}
    }
    for key, value in results.items():
        print(f"{key}: Accuracy = {value['accuracy']:.4f}, F1 = {value['f1']:.4f}")

    return trained_model, tokenizer, results

In [None]:
# Fine-tune the uncased DistilBERT base checkpoint on SST-2 with the default
# max_length and seed.
trained_model, tokenizer, results = base_model_trainer(
    model_name="distilbert",
    model_path="distilbert/distilbert-base-uncased",
)

### Inference



In [3]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def load_model_for_inference(model_name, model_path, saved_model_path):
    """Rebuild the classifier from ``model_path`` and load fine-tuned weights into it.

    Args:
        model_name: Not used by this function; kept so existing callers keep working.
        model_path: Checkpoint identifier or path handed to ``from_pretrained`` for
            both the two-label model and the tokenizer.
        saved_model_path: Path to a state dict file written with ``torch.save``.

    Returns:
        tuple: ``(model, tokenizer)`` with the model switched to eval mode. The
        caller is responsible for moving the model to the target device.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

    # map_location="cpu" lets weights that were saved from a GPU run be loaded on a
    # machine without CUDA; without it torch.load tries to restore the tensors onto
    # the device they were saved from and fails when that device is missing.
    # NOTE(review): torch.load unpickles the file -- only load weights from a
    # trusted source (or pass weights_only=True on PyTorch versions that support it).
    state_dict = torch.load(saved_model_path, map_location="cpu")
    model.load_state_dict(state_dict)

    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    return model, tokenizer

def inference(model, tokenizer, text, device):
    """Predict the class index for one text, or for every text in a batch.

    Args:
        model: Callable invoked with the tokenizer output as keyword arguments; its
            result must expose a ``logits`` tensor.
        tokenizer: Callable producing a mapping of tensors for ``text``.
        text: A single string, or a list of strings to classify together.
        device: Device the tokenized tensors are moved to before the forward pass.

    Returns:
        int when ``text`` is a single string (same as before); otherwise a list
        with one predicted class index per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Reduce over the class axis only. A bare argmax() flattens the whole logits
    # tensor, which is only correct for a batch of one and yields a meaningless
    # flat index as soon as several texts are passed.
    predicted_classes = outputs.logits.argmax(dim=-1)

    if isinstance(text, str):
        return predicted_classes.item()
    return predicted_classes.tolist()

# Base checkpoint plus the fine-tuned SST-2 weights attached as a Kaggle input.
model_name = "distilbert"
model_path = "distilbert/distilbert-base-uncased"
saved_model_path = "/kaggle/input/sst2-distilbert/pytorch/sst2-distilbert/1/distilbert_sst2.pth"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the model and tokenizer, then place the model on the chosen device.
model, tokenizer = load_model_for_inference(
    model_name,
    model_path,
    saved_model_path,
)
model = model.to(device)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset
import random as random

In [5]:
import random
from datasets import load_dataset

def checker(custom_text, label, result):
    """Print the analysed sentence, the predicted sentiment and the reference label.

    Args:
        custom_text: Sentence that was classified.
        label: Reference label to show next to the prediction (printed as given).
        result: Predicted class; a value equal to 1 is reported as 'Positive',
            anything else as 'Negative'.
    """
    if result == 1:
        prediction = 'Positive'
    else:
        prediction = 'Negative'

    report_lines = (
        f"\nSentence: '{custom_text}'",
        f"\nPrediction: {prediction}",
        f"\nCorrect Label is: {label}",
    )
    for report_line in report_lines:
        print(report_line)

def infer(model, tokenizer, device, data, custom_lines=True, seed=42):
    """Classify user-typed sentences in a loop, or one random validation example.

    Args:
        model: Classifier passed through to ``inference``.
        tokenizer: Tokenizer passed through to ``inference``.
        device: Device passed through to ``inference``.
        data: Dataset name given to ``load_dataset``; only read when
            ``custom_lines`` is False.
        custom_lines: True prompts for sentences until the user types 'quit'
            (case-insensitive); False samples one example from the validation split.
        seed: Seed for the dataset shuffle in the non-interactive branch. The index
            itself is drawn with ``random.randint`` and is not tied to this seed.
    """
    if not custom_lines:
        # Load SST-2 dataset
        shuffled = load_dataset(f"{data}").shuffle(seed=seed)
        validation = shuffled['validation']

        picked_index = random.randint(0, len(validation) - 1)
        example = validation[picked_index]
        custom_text = example['sentence']
        label_text = 'Positive' if example['label'] == 1 else 'Negative'

        result = inference(model, tokenizer, custom_text, device)
        checker(custom_text, label_text, result)
        return

    while True:
        custom_text = input("\nEnter Text to Analyze Sentiment (or 'quit' to exit): ")
        if custom_text.lower() == "quit":
            return
        result = inference(model, tokenizer, custom_text, device)
        checker(custom_text, "N/A", result)  # no reference label exists for typed input

In [6]:
# Start the interactive prompt; "sst2" is only consulted when custom_lines is False.
infer(model, tokenizer, device, data="sst2", custom_lines=True)


Enter Text to Analyze Sentiment (or 'quit' to exit):  This movie is terribly funny, a good watch



Sentence: 'This movie is terribly funny, a good watch'

Prediction: Positive

Correct Label is: N/A



Enter Text to Analyze Sentiment (or 'quit' to exit):  This is a terrible movie



Sentence: 'This is a terrible movie'

Prediction: Negative

Correct Label is: N/A



Enter Text to Analyze Sentiment (or 'quit' to exit):  quit
