In [1]:
# Step 0: Library Installation

!pip install transformers>=4.42.0 datasets>=2.10.0 torch>=1.9.0 seqeval --quiet
!pip install peft>=0.11.1 --quiet
!pip install accelerate>=0.28.0 --quiet

print("Libraries installation complete.")

# Report the installed version of every core dependency.
# Each import is attempted on its own so that one missing package does not
# hide the versions of the others; a successful import also leaves the
# module bound under its usual name (transformers, datasets, torch, peft,
# accelerate, pd, np).
print("\n--- Verifying Versions ---")

try:
    import transformers
except ImportError:
    print("Transformers not found.")
else:
    print(f"Transformers version: {transformers.__version__}")

try:
    import datasets
except ImportError:
    print("Datasets not found.")
else:
    print(f"Datasets version: {datasets.__version__}")

try:
    import torch
except ImportError:
    print("PyTorch not found.")
else:
    print(f"PyTorch version: {torch.__version__}")

try:
    import peft
except ImportError:
    print("PEFT not found.")
else:
    print(f"PEFT version: {peft.__version__}")

try:
    import accelerate
except ImportError:
    print("Accelerate not found.")
else:
    print(f"Accelerate version: {accelerate.__version__}")

try:
    import pandas as pd
except ImportError:
    print("Pandas not found.")
else:
    print(f"Pandas version: {pd.__version__}")

try:
    import numpy as np
except ImportError:
    print("Numpy not found.")
else:
    print(f"Numpy version: {np.__version__}")

print("--- Verification Complete ---")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
bigframes 1.36.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
pylibcugraph-cu12 24.12.0 requires pylibraft-cu12==24.12.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 24.12.0 requires rmm-cu12==24.12.*, but you have rmm-cu12 25.2.0 which is incompatible.[0m[31m
[0mLibraries installation complete.

--- Verifying Versions ---
Transformers version: 4.51.1
Datasets version: 3.5.0
PyTorch version: 2.5.1+cu124


2025-05-13 05:58:47.357472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747115927.544858      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747115927.602702      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


PEFT version: 0.14.0
Accelerate version: 1.3.0
Pandas version: 2.2.3
Numpy version: 1.26.4
--- Verification Complete ---


In [2]:
print("--- Step 1: Loading Data using pd.read_csv ---")

import pandas as pd
import numpy as np

# Locations of the two SemEval training CSVs inside the Kaggle input mount
KAGGLE_INPUT_PATH = "/kaggle/input/sem-eval-absa"
laptop_train_path = f"{KAGGLE_INPUT_PATH}/Laptop_Train_v2.csv"
resto_train_path = f"{KAGGLE_INPUT_PATH}/Restaurants_Train_v2.csv"

# Parser options shared by both reads. Note that on_bad_lines='skip' drops
# malformed rows without reporting them.
_csv_read_options = {"encoding": "ISO-8859-1", "on_bad_lines": "skip"}

try:
    print(f"Attempting to load: {laptop_train_path}")
    df_laptop = pd.read_csv(laptop_train_path, **_csv_read_options)
    print(f"Loaded {len(df_laptop)} laptop records.")

    print(f"\nAttempting to load: {resto_train_path}")
    df_resto = pd.read_csv(resto_train_path, **_csv_read_options)
    print(f"Loaded {len(df_resto)} restaurant records.")

    # Tag each frame with its domain, then stack them under a fresh index
    print("\nCombining datasets...")
    df_laptop['domain'] = 'laptop'
    df_resto['domain'] = 'restaurant'
    df_combined_train = pd.concat([df_laptop, df_resto], ignore_index=True)

    # Show what the combined frame looks like
    print(f"\nTotal combined training instances: {len(df_combined_train)}")
    print("Combined DataFrame Info:")
    df_combined_train.info()
    print("\nCombined DataFrame Head:")
    print(df_combined_train.head())

    print("\n--- Dataset Loading Complete ---")

except FileNotFoundError:
    # A missing file is reported with the expected paths; the frame is set
    # to None so the status check below can flag the failure.
    print("Error: Files not found. Please double-check the Kaggle dataset path and filenames.")
    print(f"Expected paths:\n{laptop_train_path}\n{resto_train_path}")
    df_combined_train = None
except Exception as e:
    # Any other failure is reported with a full traceback
    print(f"An error occurred during data loading: {e}")
    import traceback
    traceback.print_exc()
    df_combined_train = None

# Final status: the load counts as failed when nothing (or an empty frame) came back
if df_combined_train is None or df_combined_train.empty:
    print("\nData loading failed. Please review errors above.")
else:
    print("\nData loading successful. Ready for Step 2 (Cleaning).")

--- Step 1: Loading Data using pd.read_csv ---
Attempting to load: /kaggle/input/sem-eval-absa/Laptop_Train_v2.csv
Loaded 2358 laptop records.

Attempting to load: /kaggle/input/sem-eval-absa/Restaurants_Train_v2.csv
Loaded 3693 restaurant records.

Combining datasets...

Total combined training instances: 6051
Combined DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6051 entries, 0 to 6050
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           6051 non-null   int64 
 1   Sentence     6051 non-null   object
 2   Aspect Term  6051 non-null   object
 3   polarity     6051 non-null   object
 4   from         6051 non-null   int64 
 5   to           6051 non-null   int64 
 6   domain       6051 non-null   object
dtypes: int64(3), object(4)
memory usage: 331.0+ KB

Combined DataFrame Head:
     id                                           Sentence     Aspect Term  \
0  2339  I charge it at night a

In [3]:
print("\n--- Step 2: Initial Data Cleaning, Standardization, and Type Conversion ---")

# Work on a copy so the combined raw frame is left unmodified
df_cleaned = df_combined_train.copy()

# Standardize column names: lowercase, with spaces replaced by underscores
print("\nStandardizing column names (lowercase, replace space with underscore)...")
lowercased_names = df_cleaned.columns.str.lower()
df_cleaned.columns = lowercased_names.str.replace(' ', '_', regex=False)
print("New column names:", df_cleaned.columns.tolist())

# Columns (standardized names) that every usable row must have a value for
essential_cols = ['aspect_term', 'polarity', 'from', 'to', 'sentence']

# 1. Inspect types and missing values on the combined data
print("\nInfo before cleaning (after standardization):")
df_cleaned.info()

# 2. Drop rows with missing values in the essential columns.
#    The drop only runs when every essential column is actually present.
print(f"\nRows before dropping NaNs: {len(df_cleaned)}")
available_cols = set(df_cleaned.columns)
missing_essential = [col for col in essential_cols if col not in available_cols]
if not missing_essential:
    df_cleaned.dropna(subset=essential_cols, inplace=True)
    print(f"Rows after dropping NaNs in essential columns: {len(df_cleaned)}")
else:
    print(f"Error: Essential columns missing for dropna: {missing_essential}")

# 3. Ensure integer dtype for the 'from' and 'to' character offsets
if 'from' not in df_cleaned.columns or 'to' not in df_cleaned.columns:
    print("\nError: 'from' or 'to' columns not found after standardization!")
else:
    # The same coercion runs in both cases; only the messages differ
    # depending on whether the columns were numeric to begin with.
    offsets_already_numeric = (
        pd.api.types.is_numeric_dtype(df_cleaned['from'])
        and pd.api.types.is_numeric_dtype(df_cleaned['to'])
    )
    if offsets_already_numeric:
        print("\nConverting 'from' and 'to' columns to integer type...")
    else:
        print("\nWarning: 'from' or 'to' column is not purely numeric. Attempting coercion.")

    try:
        # Unparseable values become NaN, are replaced by -1, and the column is cast to int
        for offset_col in ('from', 'to'):
            df_cleaned[offset_col] = (
                pd.to_numeric(df_cleaned[offset_col], errors='coerce').fillna(-1).astype(int)
            )

        if offsets_already_numeric:
            print("'from' and 'to' columns successfully converted to int.")
        else:
            print("Coercion and conversion to int attempted.")
    except Exception as e:  # Catch broader exceptions during conversion
        if offsets_already_numeric:
            print(f"Error converting 'from'/'to' to int: {e}. Check data.")
        else:
            print(f"Error during coercion/conversion of 'from'/'to' to int: {e}")



# 4. Basic text cleaning: remove leading/trailing whitespace.
#    Both text columns must exist before either one is touched.
text_columns = ('sentence', 'aspect_term')
if all(text_col in df_cleaned.columns for text_col in text_columns):
    print("\nApplying strip() to 'sentence' and 'aspect_term' columns...")
    for text_col in text_columns:
        df_cleaned[text_col] = df_cleaned[text_col].astype(str).str.strip()
else:
    print("\nError: 'sentence' or 'aspect_term' columns not found for stripping!")


# 5. Inspect the polarity labels present in the data
if 'polarity' not in df_cleaned.columns:
    print("\nError: 'polarity' column not found!")
else:
    print("\nUnique polarity values found:")
    unique_polarities = df_cleaned['polarity'].unique()
    print(unique_polarities)
    # Fixed label set and its string -> integer mapping
    # (adjust if the values printed above differ)
    possible_labels = ['positive', 'negative', 'neutral', 'conflict']
    label_map = dict(zip(possible_labels, range(len(possible_labels))))

# --- Show the cleaned DataFrame ---
print("\nCleaned DataFrame Info:")
df_cleaned.info()
print("\nCleaned DataFrame Head (Cleaned):")
print(df_cleaned.head())

# --- End of Cleaning Step ---
print("\n--- Initial Cleaning, Standardization, and Type Conversion Complete ---")



--- Step 2: Initial Data Cleaning, Standardization, and Type Conversion ---

Standardizing column names (lowercase, replace space with underscore)...
New column names: ['id', 'sentence', 'aspect_term', 'polarity', 'from', 'to', 'domain']

Info before cleaning (after standardization):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6051 entries, 0 to 6050
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           6051 non-null   int64 
 1   sentence     6051 non-null   object
 2   aspect_term  6051 non-null   object
 3   polarity     6051 non-null   object
 4   from         6051 non-null   int64 
 5   to           6051 non-null   int64 
 6   domain       6051 non-null   object
dtypes: int64(3), object(4)
memory usage: 331.0+ KB

Rows before dropping NaNs: 6051
Rows after dropping NaNs in essential columns: 6051

Converting 'from' and 'to' columns to integer type...
'from' and 'to' columns successfully converted 

In [4]:
print("\n--- Step 3: Aggregate Aspects per Sentence using Unique ID ---")

# --- Create a globally unique sentence identifier ---
# The key is "<domain>_<id>", with the numeric id rendered as text.
if 'id' not in df_cleaned.columns or 'domain' not in df_cleaned.columns:
    print("Error: Cannot create unique_id. 'id' or 'domain' column missing.")
    df_cleaned['unique_id'] = None  # Keeps the column present; the all-null values signal the problem
else:
    print("\nCreating unique sentence identifier (domain_id)...")
    id_as_text = df_cleaned['id'].astype(str)
    df_cleaned['unique_id'] = df_cleaned['domain'] + '_' + id_as_text
    print("Unique ID created.")
    # Compare distinct keys against distinct raw ids: the raw count is lower
    # whenever the same id value occurs in both domains.
    print(f"Unique IDs created: {df_cleaned['unique_id'].nunique()}")
    print(f"Original IDs count: {df_cleaned['id'].nunique()}")

# Define the function to apply to each sentence group
def aggregate_aspects_per_sentence(group):
    """Collapse all aspect rows of one sentence into a single record.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one sentence key. Expected columns: 'id', 'sentence',
        'aspect_term', 'polarity', 'from', 'to', 'domain'.

    Returns
    -------
    pandas.Series or None
        A Series with the keys 'original_id', 'sentence', 'aspects' and
        'domain'. 'aspects' is a list with one dict per input row
        (keys 'term', 'polarity', 'from', 'to'); the other values are
        taken from the first row. None is returned, after printing a
        warning, when a required column is missing, and also for an
        empty group.
    """
    # 'id' is part of the guard because it is read below for 'original_id'
    required_cols = ['id', 'sentence', 'aspect_term', 'polarity', 'from', 'to', 'domain']
    if not all(col in group.columns for col in required_cols):
        # `.name` is only attached by groupby; fall back to None for plain frames
        group_key = getattr(group, 'name', None)
        print(f"Warning: Skipping group due to missing columns. Group keys: {group_key}")
        return None
    if group.empty:
        # No first row to take the sentence/domain from
        return None

    # Walk the four aspect columns in lockstep; tolist() yields plain Python scalars
    aspect_list = [
        {'term': term, 'polarity': polarity, 'from': start, 'to': end}
        for term, polarity, start, end in zip(
            group['aspect_term'].tolist(),
            group['polarity'].tolist(),
            group['from'].tolist(),
            group['to'].tolist(),
        )
    ]

    # Sentence, domain and original id are taken from the first row
    result = pd.Series({
        'original_id': group['id'].iloc[0],  # Keep original ID for reference
        'sentence': group['sentence'].iloc[0],
        'aspects': aspect_list,
        'domain': group['domain'].iloc[0]
    })
    return result

# Group by the 'unique_id' column and build one record per sentence
print(f"\nGrouping by 'unique_id' and aggregating aspects...")
if 'unique_id' in df_cleaned.columns and df_cleaned['unique_id'].notna().all():
    # Select the non-key columns explicitly before apply(). Letting apply()
    # operate on the grouping column is deprecated in pandas 2.2 and emits
    # a DeprecationWarning; the aggregation function never needs that column.
    value_cols = [col for col in df_cleaned.columns if col != 'unique_id']
    aggregated_data_series = (
        df_cleaned.groupby('unique_id')[value_cols].apply(aggregate_aspects_per_sentence)
    )
    aggregated_data_series = aggregated_data_series.dropna()
    aggregated_df = aggregated_data_series.reset_index()  # unique_id becomes a column
    print(f"Number of unique sentences after aggregation: {len(aggregated_df)}")
else:
    print("Error: 'unique_id' column not found or contains NaNs. Cannot group.")
    aggregated_df = pd.DataFrame()


# Check the result
if not aggregated_df.empty:
    print("\nAggregated DataFrame Head:")
    print(aggregated_df.head())  # Note the 'unique_id' and 'original_id' columns

    # Spot-check: the laptop sentence whose original id is 3
    print("\nExample for original Laptop ID 3:")
    laptop_example_3 = aggregated_df[(aggregated_df['original_id'] == 3) & (aggregated_df['domain'] == 'laptop')]
    if not laptop_example_3.empty:
         print(laptop_example_3.iloc[0]['sentence'])
         print(laptop_example_3.iloc[0]['aspects'])
    else:
         print("Laptop example with ID 3 not found in aggregated data (might have been dropped if it had NaNs).")


    # --- Convert to Hugging Face Dataset object ---
    print("\nConverting aggregated DataFrame to Hugging Face Dataset...")
    from datasets import Dataset, DatasetDict

    # Only these columns are carried into the Dataset; original_id is left out
    columns_to_keep = ['unique_id', 'sentence', 'aspects', 'domain']
    hf_dataset = Dataset.from_pandas(aggregated_df[columns_to_keep])

    print("\nHugging Face Dataset Info:")
    print(hf_dataset)
    print("\nSample record from Hugging Face Dataset:")
    print(hf_dataset[0])

else:
    print("\nAggregated DataFrame is empty, cannot proceed to Dataset conversion.")


# --- End of Aggregation Step ---
print("\n--- Aggregation (Revised) Complete ---")



--- Step 3 (Revised): Aggregate Aspects per Sentence using Unique ID ---

Creating unique sentence identifier (domain_id)...
Unique ID created.
Unique IDs created: 3509
Original IDs count: 2687

Grouping by 'unique_id' and aggregating aspects...
Number of unique sentences after aggregation: 3509

Aggregated DataFrame Head:
     unique_id  original_id  \
0   laptop_100          100   
1  laptop_1001         1001   
2  laptop_1008         1008   
3   laptop_101          101   
4  laptop_1012         1012   

                                            sentence  \
0  I had of course bought a 3 year warranty, so I...   
1  But sadly the replacement froze-up while updat...   
2           Ive had to call tech support many times.   
3  I got assurances from 2 different people that ...   
4                     I had to pay for the shipping!   

                                             aspects  domain  
0  [{'term': '3 year warranty', 'polarity': 'neut...  laptop  
1  [{'term': 'BIOS', 'po

  aggregated_data_series = df_cleaned.groupby('unique_id').apply(aggregate_aspects_per_sentence)


In [5]:
print("\n--- Starting Consolidated Steps 4-7 ---")

import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
# seqeval is optional: when it cannot be imported, stand-in functions with
# the same names are defined so later references do not raise NameError.
try:
    from seqeval.metrics import classification_report, accuracy_score
except ImportError:
    print("Warning: seqeval library not found. Metrics calculation will be basic.")
    SEQEVAL_AVAILABLE = False

    def classification_report(y_true, y_pred, output_dict=True, zero_division=0):
        """Stand-in used when seqeval is unavailable; always returns an empty report."""
        return {}

    def accuracy_score(y_true, y_pred):
        """Stand-in used when seqeval is unavailable; always returns 0.0."""
        return 0.0
else:
    SEQEVAL_AVAILABLE = True
    print("seqeval imported successfully.")


# --- Step 4: Define Labeling Scheme, Load Tokenizer, Define Alignment ---
print("\n--- Running Step 4 ---")

# BIO tag set for aspect-term extraction and its two lookup tables
label_list = ["O", "B-ASP", "I-ASP"]
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}
num_labels = len(label_list)

MODEL_NAME = "bert-base-uncased"
print(f"Labels: {label_list}")
print(f"Loading Tokenizer ({MODEL_NAME})...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # The config carries the label mappings so the model head is sized to match
    config = AutoConfig.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    print("Tokenizer and Config loaded successfully.")
except Exception as e:
    # On any failure both objects are reset so later steps can detect it
    print(f"Error loading tokenizer/config: {e}")
    tokenizer = config = None

def tokenize_and_align_labels(examples, tkz, lbl2id, label_all_tokens=False):
    """Tokenize a batch of sentences and attach token-level BIO labels.

    Args:
        examples: batch mapping with a "sentence" entry (passed to the
            tokenizer) and an "aspects" entry holding, per sentence, a list
            of dicts with character offsets under 'from' and 'to'.
        tkz: tokenizer callable; its result must expose "offset_mapping"
            and a ``word_ids(batch_index=...)`` method.
        lbl2id: mapping with the keys "O", "B-ASP" and "I-ASP".
        label_all_tokens: when False, every token that continues the word
            of the previous token is masked with -100 so only the first
            token of each word keeps its label.

    Returns:
        The tokenizer output with an added "labels" entry (one list of
        label ids per sentence). Tokens without a word id get -100.
    """
    encoded = tkz(
        examples["sentence"],
        truncation=True,
        is_split_into_words=False,
        max_length=512,
        return_offsets_mapping=True,
    )
    batch_labels = []

    for row, offsets in enumerate(encoded["offset_mapping"]):
        row_aspects = examples["aspects"][row]
        wids = encoded.word_ids(batch_index=row)
        # Start from all-"O"; tokens without a word id are ignored (-100)
        tags = [-100 if wid is None else lbl2id["O"] for wid in wids]

        for aspect in row_aspects:
            span_start = aspect['from']
            span_end = aspect['to']
            span_opened = False  # becomes True once a token of this aspect is claimed

            for pos, (tok_start, tok_end) in enumerate(offsets):
                # Tokens with a (0, 0) offset are skipped
                if tok_start == 0 and tok_end == 0:
                    continue
                # Keep only tokens whose character range overlaps the aspect span
                if tok_start >= span_end or tok_end <= span_start:
                    continue

                if span_opened:
                    # Later tokens of the span: inside tag, never overwriting an existing tag
                    if tags[pos] == lbl2id["O"]:
                        tags[pos] = lbl2id["I-ASP"]
                    continue

                # Does this token continue the same word as the token before it?
                continues_word = (
                    wids[pos] is not None
                    and pos > 0
                    and wids[pos - 1] == wids[pos]
                )
                if tok_start >= span_start and not continues_word:
                    # Clean start at a word boundary: begin tag (kept if already tagged)
                    if tags[pos] == lbl2id["O"]:
                        tags[pos] = lbl2id["B-ASP"]
                    span_opened = True
                elif tags[pos] == lbl2id["O"]:
                    # Span starts mid-word or before the token start: inside tag
                    tags[pos] = lbl2id["I-ASP"]
                    span_opened = True

        if label_all_tokens:
            batch_labels.append(tags)
        else:
            # Mask every token that shares its word id with the previous token
            previous_wids = [None] + list(wids[:-1])
            batch_labels.append([
                -100 if (wid is None or wid == prev_wid) else tag
                for prev_wid, wid, tag in zip(previous_wids, wids, tags)
            ])

    encoded["labels"] = batch_labels
    return encoded

print("\nApplying tokenization and label alignment...")
tokenized_dataset = None
if 'hf_dataset' in locals() and hf_dataset and tokenizer:
    try:
        # The original columns are dropped; only the tokenizer output and labels remain
        alignment_kwargs = {'tkz': tokenizer, 'lbl2id': label2id, 'label_all_tokens': False}
        tokenized_dataset = hf_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            fn_kwargs=alignment_kwargs,
            remove_columns=hf_dataset.column_names,
        )
        print("Tokenization complete.")
    except Exception as e:
        print(f"Error during .map(): {e}")
else:
    print("Error: hf_dataset or tokenizer missing.")

print("\nInitializing Data Collator...")
if tokenizer:
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    print("Data Collator initialized.")
else:
    data_collator = None
    print("Error: Tokenizer missing.")

print("\nSplitting dataset...")
dataset_splits = None
if tokenized_dataset:
    try:
        # 80% train; the remaining 20% is halved into validation and test
        train_testvalid = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
        test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)
        dataset_splits = DatasetDict({
            'train': train_testvalid['train'],
            'validation': test_valid['train'],
            'test': test_valid['test'],
        })
        print("Dataset splits created:", dataset_splits)
    except Exception as e:
        print(f"Error splitting dataset: {e}")
else:
    print("Cannot split dataset.")
print("\n--- Step 4 Complete ---")

# --- Step 5: Defining Evaluation Metrics ---
print("\n--- Step 5: Defining Evaluation Metrics ---")
def compute_metrics(eval_pred):
    """Compute micro-averaged precision/recall/F1 from raw predictions.

    Args:
        eval_pred: pair ``(predictions, labels)``. The predictions are
            reduced with argmax over axis 2 (presumably shaped
            (batch, sequence, num_labels) - confirm against the caller);
            label positions equal to -100 are ignored.

    Returns:
        dict with "precision", "recall" and "f1" taken from the report's
        "micro avg" entry, plus "f1_ASP" when the report has an "ASP"
        entry. All three base scores are 0.0 when no usable labels exist,
        when seqeval is unavailable, or when building the report fails.
    """
    zero_scores = {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, labels):
        gold_tags = []
        predicted_tags = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions excluded from scoring
            if gold_id == -100:
                continue
            # Ids without a tag name are dropped from both sides
            if gold_id in id2label and predicted_id in id2label:
                gold_tags.append(id2label[gold_id])
                predicted_tags.append(id2label[predicted_id])
        # Sequences that end up with no scorable tokens are left out
        if gold_tags:
            true_labels.append(gold_tags)
            true_predictions.append(predicted_tags)

    if not true_labels or not SEQEVAL_AVAILABLE:
        return dict(zero_scores)

    try:
        report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)
        micro = report.get("micro avg", {})
        results = {
            "precision": micro.get("precision", 0.0),
            "recall": micro.get("recall", 0.0),
            "f1": micro.get("f1-score", 0.0),
        }
        if "ASP" in report:
            results["f1_ASP"] = report["ASP"].get("f1-score", 0.0)
    except Exception as e:
        print(f"Error calculating report: {e}")
        results = dict(zero_scores)
    return results
print("compute_metrics function defined.")
print("\n--- Step 5 Complete ---")

# --- Step 6: Configure Training Arguments ---
print("\n--- Step 6: Configure Training Arguments (Simplest Working Version) ---")
args = None
if not ('dataset_splits' in locals() and dataset_splits):
    print("Error: dataset_splits missing. Cannot configure TrainingArguments.")
else:
    # Hyperparameters; the same batch size is used for training and evaluation
    TRAIN_BATCH_SIZE = 16
    NUM_EPOCHS = 3
    LEARNING_RATE = 2e-5
    OUTPUT_DIR = "/kaggle/working/bert-base-uncased-absa-consolidated"
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=TRAIN_BATCH_SIZE,
        weight_decay=0.01,
        report_to="none",           # no external experiment tracker
        save_steps=500,
        logging_steps=100,
        load_best_model_at_end=False,
    )
    print("TrainingArguments configured")
print("\n--- Step 6 Complete ---")

# --- Step 7: Instantiate the Trainer ---
import inspect

print("\n--- Step 7: Instantiate the Trainer ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
trainer = None

# Every component built in the earlier steps must exist before the model is loaded
final_check_vars = {'model_config': config, 'training_args': args, 'datasets': dataset_splits, 'tkz': tokenizer, 'collator': data_collator, 'metrics_func': compute_metrics}
if all(v is not None for v in final_check_vars.values()):
    try:
        print(f"\nLoading model ({MODEL_NAME}) for Token Classification...")
        model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)
        model.to(device)
        print(f"Model loaded and moved to {device}")

        print("\nInstantiating Trainer...")
        # Newer transformers releases deprecate Trainer's `tokenizer` argument
        # in favour of `processing_class`. Pick whichever keyword the installed
        # version accepts so the call works on both and emits no FutureWarning.
        trainer_init_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_kwarg = "processing_class" if "processing_class" in trainer_init_params else "tokenizer"
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=dataset_splits["train"],
            eval_dataset=dataset_splits["validation"],
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            **{tokenizer_kwarg: tokenizer}
        )
        print("Trainer instantiated successfully.")

    except Exception as e:
        print(f"Error during model loading or Trainer instantiation: {e}")
        import traceback
        traceback.print_exc()
else:
    # Report which of the required components is missing
    print("Error: One or more required components were None before Trainer instantiation. Check previous steps.")
    print({k: "Exists" if v is not None else "MISSING/NONE" for k, v in final_check_vars.items()})


print("\n--- Consolidated Steps 4-7 Complete ---")

# Check if trainer was created
if 'trainer' in locals() and trainer is not None:
    print("\n Trainer object created successfully!")
else:
    print("\n Trainer object creation failed. Please review errors above.")


--- Starting Consolidated Steps 4-7 ---
seqeval imported successfully.

--- Running Step 4 ---
Labels: ['O', 'B-ASP', 'I-ASP']
Loading Tokenizer (bert-base-uncased)...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer and Config loaded successfully.

Applying tokenization and label alignment...


Map:   0%|          | 0/3509 [00:00<?, ? examples/s]

Tokenization complete.

Initializing Data Collator...
Data Collator initialized.

Splitting dataset...
Dataset splits created: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 2807
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 351
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 351
    })
})

--- Step 4 Complete ---

--- Step 5: Defining Evaluation Metrics ---
compute_metrics function defined.

--- Step 5 Complete ---

--- Step 6: Configure Training Arguments (Simplest Working Version) ---
TrainingArguments configured (Simplest Version + Save/Log Steps).

--- Step 6 Complete ---

--- Step 7: Instantiate the Trainer ---
Using device: cuda

Loading model (bert-base-uncased) for Token Classificati

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to cuda

Instantiating Trainer...
Trainer instantiated successfully.

--- Consolidated Steps 4-7 Complete ---

 Trainer object created successfully! Ready for Step 8 (Training).


  trainer = Trainer(


In [6]:
print("\n--- Step 8: Start Fine-Tuning ---")

# Training only starts when an earlier cell produced a usable Trainer
trainer_ready = 'trainer' in locals() and trainer is not None
if not trainer_ready:
    print("Error: Trainer object not found or not instantiated successfully. Cannot start training.")
else:
    try:
        print("\nStarting training...")
        train_result = trainer.train()
        print("\nTraining finished!")

        # Log and save the final training metrics, then save the trainer state
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

        print("\nFinal training metrics logged and saved.")
        print(metrics)

    except Exception as e:
        print(f"\nAn error occurred during training: {e}")
        import traceback
        traceback.print_exc()


# --- End of Fine-Tuning Step ---
print("\n--- Fine-Tuning Process Attempted ---")


--- Step 8: Start Fine-Tuning ---

Starting training...


Step,Training Loss
100,0.2739
200,0.1147
300,0.0854
400,0.0737
500,0.053



Training finished!
***** train metrics *****
  epoch                    =        3.0
  total_flos               =   183333GF
  train_loss               =     0.1169
  train_runtime            = 0:00:56.70
  train_samples_per_second =    148.512
  train_steps_per_second   =      9.312

Final training metrics logged and saved.
{'train_runtime': 56.7023, 'train_samples_per_second': 148.512, 'train_steps_per_second': 9.312, 'total_flos': 196852651406082.0, 'train_loss': 0.11687352711504156, 'epoch': 3.0}

--- Fine-Tuning Process Attempted ---


In [7]:
print("\n--- Step 9: Evaluation ---")

# Evaluation needs both a Trainer and a 'validation' split
can_evaluate = (
    'trainer' in locals() and trainer is not None
    and 'dataset_splits' in locals() and dataset_splits is not None
    and 'validation' in dataset_splits
)

if not can_evaluate:
    print("Error: Trainer object or validation dataset not found. Cannot perform evaluation.")
else:
    # --- Evaluate on the Validation Set ---
    print("\nEvaluating on the validation set...")
    try:
        # With no argument, the Trainer evaluates the eval dataset it was given
        eval_results = trainer.evaluate()

        trainer.log_metrics("eval", eval_results)
        trainer.save_metrics("eval", eval_results)

        print("\nValidation Set Evaluation Results:")
        print(eval_results)

    except Exception as e:
        print(f"An error occurred during validation set evaluation: {e}")
        import traceback
        traceback.print_exc()

    # --- Evaluate on the Test Set ---
    if 'test' not in dataset_splits:
        print("\nTest set not found in dataset_splits. Skipping test set evaluation.")
    else:
        print("\nEvaluating on the test set...")
        try:
            test_results = trainer.evaluate(eval_dataset=dataset_splits['test'])

            trainer.log_metrics("test", test_results)
            trainer.save_metrics("test", test_results)

            print("\nTest Set Evaluation Results:")
            # For display, swap the 'eval_' part of every key for a 'test_' prefix
            test_metrics_renamed = {
                f"test_{k.replace('eval_', '')}": v
                for k, v in test_results.items()
            }
            print(test_metrics_renamed)

        except Exception as e:
            print(f"An error occurred during test set evaluation: {e}")
            import traceback
            traceback.print_exc()

print("\n--- Evaluation Complete ---")


--- Step 9: Evaluation ---

Evaluating on the validation set...


***** eval metrics *****
  epoch                   =        3.0
  eval_f1                 =     0.8163
  eval_f1_ASP             =     0.8163
  eval_loss               =     0.1131
  eval_precision          =     0.7837
  eval_recall             =     0.8517
  eval_runtime            = 0:00:00.72
  eval_samples_per_second =    482.764
  eval_steps_per_second   =     30.259

Validation Set Evaluation Results:
{'eval_loss': 0.11309386044740677, 'eval_precision': 0.7837423312883436, 'eval_recall': 0.8516666666666667, 'eval_f1': 0.8162939297124601, 'eval_f1_ASP': 0.8162939297124601, 'eval_runtime': 0.7271, 'eval_samples_per_second': 482.764, 'eval_steps_per_second': 30.259, 'epoch': 3.0}

Evaluating on the test set...
***** test metrics *****
  epoch                   =        3.0
  eval_f1                 =     0.8381
  eval_f1_ASP             =     0.8381
  eval_loss               =     0.0952
  eval_precision          =     0.8085
  eval_recall             =     0.8699
  eval_runtime   

In [7]:
print("\n--- Step 10: Saving Model & Inference Pipeline ---")

from transformers import pipeline

# --- 1. Persist the fine-tuned model and its tokenizer ---
# Both artifacts go into one directory so a later `from_pretrained(...)` call
# can be pointed at this single path.
final_model_output_dir = "/kaggle/working/bert-absa-fine-tuned-final"

print(f"\nSaving the fine-tuned model and tokenizer to: {final_model_output_dir}")

# Handle the "nothing to save" case first: both objects must exist in this
# session and be non-None before a save is attempted.
if 'trainer' not in locals() or trainer is None or 'tokenizer' not in locals() or tokenizer is None:
    print("Error: Trainer or Tokenizer object not found. Cannot save.")
else:
    try:
        # Model first, tokenizer second, both into the same directory.
        trainer.save_model(final_model_output_dir)
        tokenizer.save_pretrained(final_model_output_dir)
        print("Model and tokenizer saved successfully.")
    except Exception as save_error:
        # Best-effort: report the failure and let the rest of the cell continue.
        print(f"Error saving model/tokenizer: {save_error}")


# --- 2. Create Inference Pipeline ---
# Use the Hugging Face pipeline for token-classification inference.

print("\nCreating inference pipeline...")

# Bind every name this section calls explicitly. Checking only that a
# `transformers` module object exists does not guarantee that the bare names
# `AutoModelForTokenClassification` / `AutoTokenizer` are bound, and `torch`
# is needed below for device selection.
try:
    import torch
    from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
    # NOTE(review): `id2label` itself is not read in this section; its presence
    # is kept as the original precondition (presumably a proxy for "the label
    # mapping step ran in this session") - confirm whether it is still wanted.
    pipeline_components_available = 'id2label' in locals()
except ImportError as e:
    print(f"Error importing inference dependencies: {e}")
    pipeline_components_available = False

if pipeline_components_available:

    try:
        # Reload from disk rather than reusing the in-memory objects, so the
        # saved artifacts themselves are what gets exercised.
        loaded_model = AutoModelForTokenClassification.from_pretrained(final_model_output_dir)
        loaded_tokenizer = AutoTokenizer.from_pretrained(final_model_output_dir)

        # Create the token-classification pipeline.
        # device: 0 selects the first CUDA device, -1 selects CPU.
        device_id = 0 if torch.cuda.is_available() else -1
        absa_pipeline = pipeline(
            "token-classification",
            model=loaded_model,
            tokenizer=loaded_tokenizer,
            aggregation_strategy="simple",
            device=device_id
        )
        print("Inference pipeline created successfully.")

        # --- 3. Test Inference ---
        print("\nTesting inference pipeline on example sentences:")
        test_sentences = [
            "The battery life is amazing, but the screen is hard to see.",
            "Great price and camera quality.",
            "Service was slow and the food was just okay."
        ]

        for sentence in test_sentences:
            print(f"\nInput: '{sentence}'")
            try:
                outputs = absa_pipeline(sentence)
                # Keep only the aggregated entities whose group is 'ASP'
                # (assumes the label scheme is B-ASP / I-ASP).
                extracted_aspects = [
                    {"term": entity['word'], "score": entity['score']}
                    for entity in outputs if entity['entity_group'] == 'ASP'
                ]
                print(f"Extracted Aspects: {extracted_aspects}")
            except Exception as e:
                # One failing sentence must not stop the remaining examples.
                print(f"Error during pipeline inference: {e}")

    except Exception as e:
        print(f"Error creating or using inference pipeline: {e}")
        # Leave a defined (but empty) handle so later cells can test for it.
        absa_pipeline = None

else:
    print("Error: Could not create pipeline due to missing components (Model, Tokenizer, or id2label).")
    absa_pipeline = None

print("\n--- Model Saving and Inference Testing Complete ---")


--- Step 10: Saving Model & Inference Pipeline ---

Saving the fine-tuned model and tokenizer to: /kaggle/working/bert-absa-fine-tuned-final


Device set to use cuda:0


Model and tokenizer saved successfully.

Creating inference pipeline...
Inference pipeline created successfully.

Testing inference pipeline on example sentences:

Input: 'The battery life is amazing, but the screen is hard to see.'
Extracted Aspects: [{'term': 'battery life', 'score': 0.9935806}, {'term': 'screen', 'score': 0.9953557}]

Input: 'Great price and camera quality.'
Extracted Aspects: [{'term': 'price', 'score': 0.9944055}, {'term': 'camera quality', 'score': 0.7932142}]

Input: 'Service was slow and the food was just okay.'
Extracted Aspects: [{'term': 'service', 'score': 0.9977319}, {'term': 'food', 'score': 0.99815565}]

--- Model Saving and Inference Testing Complete ---


In [9]:
# --- Download output files ---
# Package the saved model directory into a single .tar.gz so it can be
# downloaded from the Kaggle output panel.
import os
import tarfile

print("\n--- Downloading Output Files ---")

folder_to_download = "bert-absa-fine-tuned-final" # Saved to
archive_name = "bert-absa-fine-tuned-final.tar.gz" # Output archive name
output_path = f"/kaggle/working/{folder_to_download}"
# Write the archive to an explicit location instead of whatever the current
# working directory happens to be, so the success message below is accurate.
archive_path = os.path.join("/kaggle/working", archive_name)

if os.path.exists(output_path):
    print(f"Compressing '{folder_to_download}' into '{archive_name}'...")
    try:
        # arcname keeps the entries rooted at the folder name, matching
        # `tar -C /kaggle/working/ <folder>`.
        with tarfile.open(archive_path, "w:gz") as archive:
            archive.add(output_path, arcname=folder_to_download)
    except (OSError, tarfile.TarError) as e:
        # Report the failure instead of unconditionally announcing success.
        print(f"Error: Could not create archive '{archive_path}': {e}")
    else:
        print(f"Archive '{archive_name}' created in /kaggle/working/. You can download it from the Kaggle sidebar (Data -> Output).")
else:
    print(f"Error: Folder '{output_path}' not found.")


--- Downloading Output Files ---
Compressing 'bert-absa-fine-tuned-final' into 'bert-absa-fine-tuned-final.tar.gz'...
Archive 'bert-absa-fine-tuned-final.tar.gz' created in /kaggle/working/. You can download it from the Kaggle sidebar (Data -> Output).
