In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import logging
import pandas as pd
import transformers as tr
from datasets import load_dataset
import sys 

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Mirror log messages to the notebook's stdout. Re-running this cell must not
# stack up additional handlers (every extra handler prints each message once
# more), so reuse a stdout handler that is already attached to the root logger
# and only create a new one when none exists.
root_logger = logging.getLogger()
console_handler = next(
    (
        handler
        for handler in root_logger.handlers
        if isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is sys.stdout
    ),
    None,
)
if console_handler is None:
    console_handler = logging.StreamHandler(sys.stdout)
    root_logger.addHandler(console_handler)



def _run_shell(command, tolerate_failure=False):
    """
    Runs a shell command through os.system and reports a non-zero status.

    os.system does not raise when the command fails, so the status it returns
    is the only failure signal; it is logged here instead of being discarded.

    Parameters:
        command (str): The shell command line to execute.
        tolerate_failure (bool): When True, a non-zero status is logged at INFO
            level instead of WARNING (for steps whose failure is repaired by a
            later step).

    Returns:
        int: The raw status returned by os.system (0 means success).
    """
    status = os.system(command)
    if status != 0:
        level = logging.INFO if tolerate_failure else logging.WARNING
        logging.log(level, f"Command returned non-zero status {status}: {command}")
    return status


def setup_environment():
    """
    Sets up the environment by installing necessary packages and libraries.

    Runs, in order: an apt package-list refresh, creation of a download
    directory, download and dpkg installation of four CUDA 11.7 development
    packages, an apt dependency repair, and a pip install of the Python
    libraries. Every step is best-effort: a failing command is logged and the
    remaining steps still run. Nothing is returned and nothing is raised.
    """
    try:
        # Update package lists and fix broken dependencies
        logging.info("Updating package lists and fixing broken dependencies...")
        _run_shell('sudo apt update && sudo apt-get update > /dev/null 2>&1 && sudo apt --fix-broken install > /dev/null 2>&1')

        # Create directory for CUDA libraries
        logging.info("Creating directory for CUDA libraries...")
        _run_shell('sudo mkdir -p /tmp/externals/cuda > /dev/null 2>&1')

        # Download and install CUDA libraries
        logging.info("Downloading and installing CUDA libraries...")
        cuda_libs = [
            'libcurand-dev-11-7_10.2.10.50-1_amd64.deb',
            'libcusparse-dev-11-7_11.7.3.50-1_amd64.deb',
            'libcublas-dev-11-7_11.10.1.25-1_amd64.deb',
            'libcusolver-dev-11-7_11.4.0.1-1_amd64.deb'
        ]
        for lib in cuda_libs:
            _run_shell(
                f'sudo wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/{lib} '
                f'-P /tmp/externals/cuda > /dev/null 2>&1'
            )
            # NOTE(review): dpkg presumably exits non-zero when the package's
            # dependencies are not installed yet; the "--fix-broken" step below
            # appears to exist to repair exactly that, so this is not a warning.
            _run_shell(f'sudo dpkg -i /tmp/externals/cuda/{lib} > /dev/null 2>&1', tolerate_failure=True)

        # Fix broken dependencies
        logging.info("Fixing any broken dependencies...")
        _run_shell('sudo apt --fix-broken install -y')

        # Install Python libraries
        logging.info("Installing Python libraries...")
        _run_shell('pip install torch transformers datasets deepspeed==0.9.3 py-cpuinfo==9.0.0 tensorboardX pandas ipywidgets accelerate --quiet')

    except Exception as e:
        # logging.exception keeps the traceback alongside the message.
        logging.exception(f"An error occurred during environment setup: {e}")

def load_and_tokenize_data(model_checkpoint):
    """
    Loads the IMDB dataset and tokenizes it for sequence-to-sequence training.

    Each review text becomes the model input, and its integer label is mapped
    to a word ("negative", "positive" or "unknown") that is tokenized as the
    target sequence.

    Parameters:
        model_checkpoint (str): The model checkpoint name used to load the tokenizer.

    Returns:
        tokenized_dataset: The dataset with "input_ids", "attention_mask" and
        "labels" columns (the "text" and "label" columns are removed), or None
        if loading or tokenization failed (the error is logged, not raised).

    Side effects:
        Stores the loaded tokenizer in the module-level ``tokenizer`` variable,
        which train_model and predict_and_display read.
    """
    global tokenizer
    try:
        logging.info("Loading and tokenizing data...")
        # Load dataset and tokenizer
        imdb_ds = load_dataset("imdb")
        tokenizer = tr.AutoTokenizer.from_pretrained(model_checkpoint, cache_dir="/kaggle/temp/datasets")

        # Tokenization function
        def to_tokens(tokenizer, label_map):
            def apply(x):
                # x is a batch: x["text"] and x["label"] are parallel lists.
                target_labels = [label_map[y] for y in x["label"]]
                # Without return_tensors the tokenizer already returns plain
                # Python lists, which is what Dataset.map stores; building
                # tensors only to call .tolist() on them is wasted work.
                token_res = tokenizer(
                    x["text"],
                    text_target=target_labels,
                    truncation=True,
                    padding=True,
                )
                return {
                    "input_ids": token_res["input_ids"],
                    "attention_mask": token_res["attention_mask"],
                    "labels": token_res["labels"],
                }
            return apply

        imdb_label_lookup = {0: "negative", 1: "positive", -1: "unknown"}
        imdb_to_tokens = to_tokens(tokenizer, imdb_label_lookup)
        return imdb_ds.map(imdb_to_tokens, batched=True, remove_columns=["text", "label"])

    except Exception as e:
        # logging.exception records the traceback as well as the message.
        logging.exception(f"An error occurred during data loading and tokenization: {e}")
        return None

def train_model(model_checkpoint, tokenized_dataset, local_training_root):
    """
    Trains the model on the tokenized dataset.

    Fine-tunes a sequence-to-sequence model for one epoch on the "train" split
    and saves it to ``<local_training_root>/finetuned_t5``. Trainer output goes
    to ``<local_training_root>/t5-trainer``.

    Reads the module-level ``tokenizer`` variable set by
    load_and_tokenize_data. Errors are logged rather than raised.

    Parameters:
        model_checkpoint (str): The model checkpoint name.
        tokenized_dataset: The tokenized dataset; must provide "train" and
            "test" splits. If None (e.g. a failed load), training is skipped.
        local_training_root (str): The local directory for training.
    """
    if tokenized_dataset is None:
        # Fail fast with a clear message instead of downloading the model and
        # then tripping over a "'NoneType' object is not subscriptable" error.
        logging.error("An error occurred during model training: tokenized_dataset is None")
        return

    try:
        logging.info("Training the model...")
        # Configure training arguments
        training_args = tr.TrainingArguments(
            os.path.join(local_training_root, "t5-trainer"),
            num_train_epochs=1,
            per_device_train_batch_size=16,
            optim="adamw_torch",
            report_to=["tensorboard"],
        )

        # Initialize model and trainer
        model = tr.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, cache_dir="/kaggle/temp/t5-trainer_cache")
        # DataCollatorWithPadding only pads the encoder inputs; the seq2seq
        # collator also pads "labels" (with -100 so padding is ignored by the
        # loss), which keeps batching valid when targets differ in length.
        data_collator = tr.DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
        trainer = tr.Trainer(
            model,
            training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        # Train and save the model
        trainer.train()
        trainer.save_model(os.path.join(local_training_root, "finetuned_t5"))

    except Exception as e:
        # logging.exception records the traceback as well as the message.
        logging.exception(f"An error occurred during model training: {e}")

def predict_and_display(reviews, model_checkpoint):
    """
    Classifies the given reviews with a saved model and shows them in a table.

    The module-level ``tokenizer`` is used both to encode the reviews and to
    decode the generated output. Any exception is logged, not raised.

    Parameters:
        reviews (list): The list of reviews for prediction.
        model_checkpoint (str): The checkpoint name or path the model is loaded from.
    """
    try:
        logging.info("Making predictions...")

        # Load the model and encode all reviews as a single padded batch
        classifier = tr.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        encoded = tokenizer(reviews, return_tensors="pt", truncation=True, padding=True)

        # Generate the output token ids and turn them back into text
        generated_ids = classifier.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )
        predicted_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # One row per review, next to its predicted classification
        display(pd.DataFrame({"review": reviews, "classification": predicted_text}))

    except Exception as e:
        logging.error(f"An error occurred during prediction: {e}")

if __name__ == "__main__":
    # Pipeline configuration
    model_checkpoint = "t5-small"
    local_training_root = "/kaggle/temp/"

    # Reviews classified by the fine-tuned model at the end of the run
    reviews = [
        "In 'Whimsical Wonders,' the whimsy truly shines through. This enchanting film weaves a tapestry of magic and imagination, making it a delightful experience for viewers of all ages. The characters are endearing, and the plot unfolds like a beautifully illustrated storybook. A must-watch for those seeking a charming escape into a world of wonder.",
        "'Midnight Mystery' is a gripping thriller that keeps you on the edge of your seat from start to finish. The suspense builds relentlessly, and the plot twists are brilliantly executed. The lead actor delivers a mesmerizing performance, making this film an absolute must-see for fans of the genre.",
        "While 'Galactic Odyssey' boasts stunning visual effects and epic space battles, the storyline gets lost amidst the dazzling spectacle. The characters lack depth, and the dialogue feels forced at times. It's a visually impressive journey, but it leaves you wanting more substance in the narrative.",
        "In 'Whispering Shadows,' the cinematography is breathtaking, capturing the haunting beauty of the remote wilderness. The slow-burning mystery keeps you guessing until the very end, and the performances are top-notch. This is a hidden gem that deserves recognition for its atmospheric storytelling."
    ]

    # Stage 1: install system and Python dependencies
    setup_environment()

    # Stage 2: build the tokenized dataset
    tokenized_dataset = load_and_tokenize_data(model_checkpoint)

    # Stage 3: fine-tune and save the model
    train_model(model_checkpoint, tokenized_dataset, local_training_root)

    # Stage 4: classify the sample reviews with the saved model
    predict_and_display(reviews, os.path.join(local_training_root, "finetuned_t5"))


Updating package lists and fixing broken dependencies...






Get:1 http://packages.cloud.google.com/apt gcsfuse-focal InRelease [5023 B]
Get:2 https://packages.cloud.google.com/apt cloud-sdk InRelease [6361 B]
Get:3 https://packages.cloud.google.com/apt google-fast-socket InRelease [5015 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:8 http://packages.cloud.google.com/apt gcsfuse-focal/main amd64 Packages [2851 B]
Get:9 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [518 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:11 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [517 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [44.0 kB]
Get:13 http://security.ubuntu.

W: http://packages.cloud.google.com/apt/dists/gcsfuse-focal/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.
W: https://packages.cloud.google.com/apt/dists/google-fast-socket/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.


Creating directory for CUDA libraries...
Downloading and installing CUDA libraries...
Fixing any broken dependencies...
Reading package lists...






Building dependency tree...
Reading state information...
Correcting dependencies... Done
The following additional packages will be installed:
  cuda-toolkit-11-7-config-common libcublas-11-7 libcurand-11-7
  libcusolver-11-7 libcusparse-11-7
The following NEW packages will be installed:
  cuda-toolkit-11-7-config-common libcublas-11-7 libcurand-11-7
  libcusolver-11-7 libcusparse-11-7
0 upgraded, 5 newly installed, 0 to remove and 91 not upgraded.
4 not fully installed or removed.
Need to get 400 MB of archives.
After this operation, 1190 MB of additional disk space will be used.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  cuda-toolkit-11-7-config-common 11.7.99-1 [16.2 kB]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  libcublas-11-7 11.10.3.66-1 [210 MB]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  libcurand-11-7 10.2.10.91-1 [41.5 MB]
Get:4 https://developer.download.nvidia

dpkg-preconfigure: unable to re-open stdin: No such file or directory


Fetched 400 MB in 6s (66.0 MB/s)
Selecting previously unselected package cuda-toolkit-11-7-config-common.
(Reading database ... 113903 files and directories currently installed.)
Preparing to unpack .../cuda-toolkit-11-7-config-common_11.7.99-1_all.deb ...
Unpacking cuda-toolkit-11-7-config-common (11.7.99-1) ...
Selecting previously unselected package libcublas-11-7.
Preparing to unpack .../libcublas-11-7_11.10.3.66-1_amd64.deb ...
Unpacking libcublas-11-7 (11.10.3.66-1) ...
Selecting previously unselected package libcurand-11-7.
Preparing to unpack .../libcurand-11-7_10.2.10.91-1_amd64.deb ...
Unpacking libcurand-11-7 (10.2.10.91-1) ...
Selecting previously unselected package libcusolver-11-7.
Preparing to unpack .../libcusolver-11-7_11.4.0.1-1_amd64.deb ...
Unpacking libcusolver-11-7 (11.4.0.1-1) ...
Selecting previously unselected package libcusparse-11-7.
Preparing to unpack .../libcusparse-11-7_11.7.4.91-1_amd64.deb ...
Unpacking libcusparse-11-7 (11.7.4.91-1) ...
Setting up cuda

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Training the model...




Setting ds_accelerator to cuda (auto detect)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6148
1000,0.1371
1500,0.1271


Making predictions...




Unnamed: 0,review,classification
0,"In 'Whimsical Wonders,' the whimsy truly shine...",positive
1,'Midnight Mystery' is a gripping thriller that...,positive
2,While 'Galactic Odyssey' boasts stunning visua...,negative
3,"In 'Whispering Shadows,' the cinematography is...",positive
