In [1]:
import pandas as pd
import numpy as np
import torch
import os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from tqdm import tqdm

2025-09-27 14:26:20.207858: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758983180.409993      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758983180.470951      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- 1. SETUP & ENVIRONMENT HARDENING (LESSONS LEARNED) ---
# Turn Weights & Biases off through its environment switch so that no login
# prompt/error can interrupt the run.
os.environ.update({"WANDB_DISABLED": "true"})

# --- 2. CONFIGURATION ---
# Fine-tuned checkpoint used for inference. The intent recorded for this
# notebook is to use the original fine-tuned model and NOT the failed
# augmented one.
# NOTE(review): the directory name ends in "_aug1", which looks like the
# augmented checkpoint -- confirm this is really the intended model.
MODEL_PATH = "/kaggle/input/emollm/codebert_finetuned_aug1"
# Full, unlabeled commit dataset (~20k rows) to be annotated.
DATA_FILE = "/kaggle/input/emollm/cleaned20k.csv"
# Destination of the enriched CSV that is handed over to teammates.
OUTPUT_FILE = "/kaggle/working/commits_with_emotions_final.csv"

# Name of the CSV column that holds the commit message text.
TEXT_COLUMN = "message"
# Inference batch size; larger than a typical training batch for speed.
BATCH_SIZE = 64

In [3]:
# --- 3. LOAD THE CHAMPION MODEL AND DATA ---
# Load the tokenizer and the sequence-classification model from MODEL_PATH.
# Every later cell needs both objects, so a failure here is fatal.
print(f"--- Loading champion model from: {MODEL_PATH} ---")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
except Exception as e:
    # Raise instead of print + exit(): exit() tears down the session and
    # discards the traceback, whereas a chained exception stops "Run All" at
    # this cell and keeps the root cause visible.
    raise RuntimeError(
        f"FATAL ERROR: Could not load model from path. Check your input files. Error: {e}"
    ) from e
print("Model and tokenizer loaded successfully.")

--- Loading champion model from: /kaggle/input/emollm/codebert_finetuned_aug1 ---
Model and tokenizer loaded successfully.


In [4]:
# Read the unlabeled commit dataset into `df`; every later cell works on it.
print(f"--- Loading 20k commit dataset from: {DATA_FILE} ---")
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError as e:
    # Re-raise with context instead of print + exit(): exit() tears down the
    # session, while an exception stops execution at this cell and keeps the
    # traceback.
    raise FileNotFoundError(
        f"FATAL ERROR: Data file not found at '{DATA_FILE}'. Exiting."
    ) from e
print(f"Dataset loaded successfully. Shape: {df.shape}")

--- Loading 20k commit dataset from: /kaggle/input/emollm/cleaned20k.csv ---
Dataset loaded successfully. Shape: (19420, 5)


In [5]:
# --- 4. DATA HYGIENE (LESSON LEARNED) ---
# The tokenizer only accepts strings, so the text column is normalised to str
# and rows without any usable text are removed before building the Dataset.
print("--- Enforcing data hygiene on input data ---")
initial_rows = len(df)
# Blank out missing values *before* the string cast: astype(str) on its own
# turns NaN into the literal text "nan", which would survive the empty-message
# filter below and be classified as if it were a real commit message.
df = df.assign(**{TEXT_COLUMN: df[TEXT_COLUMN].fillna("").astype(str)})
# Drop empty / whitespace-only messages. reset_index() returns an independent
# frame with a clean 0..n-1 index, so later column assignments do not act on a
# filtered slice and the old index is not carried into the Dataset as an
# extra column.
df = df[df[TEXT_COLUMN].str.strip().astype(bool)].reset_index(drop=True)
final_rows = len(df)
if initial_rows != final_rows:
    print(f"Warning: Dropped {initial_rows - final_rows} rows with empty messages.")

# Convert to Hugging Face Dataset for the Trainer
predict_dataset = Dataset.from_pandas(df)

--- Enforcing data hygiene on input data ---


In [6]:
# --- 5. EXECUTE BATCHED INFERENCE ---
print(f"--- Generating predictions for {len(predict_dataset)} messages... ---")
# The Trainer requires TrainingArguments, even for prediction.
# report_to="none" is a final safeguard against experiment-tracker logins.
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./temp_results",
    report_to="none",
    # Apply the configured inference batch size; without this the Trainer
    # predicts with its (much smaller) default and BATCH_SIZE has no effect.
    per_device_eval_batch_size=BATCH_SIZE,
)

# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=`; kept as-is for compatibility -- confirm installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
)


def tokenize_function(examples):
    """Tokenize a batch of rows.

    Args:
        examples: batch mapping produced by ``Dataset.map(batched=True)``;
            ``examples[TEXT_COLUMN]`` holds the message strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
        No padding is applied here.
    """
    return tokenizer(examples[TEXT_COLUMN], truncation=True, max_length=512)


# Tokenize the dataset
predict_dataset_tokenized = predict_dataset.map(tokenize_function, batched=True)

# Run prediction
predictions = trainer.predict(predict_dataset_tokenized)

--- Generating predictions for 19420 messages... ---


  trainer = Trainer(


Map:   0%|          | 0/19420 [00:00<?, ? examples/s]

In [7]:
# --- 6. PROCESS AND SAVE RESULTS ---
print("--- Inference complete. Processing and saving final results. ---")
# Get the predicted class IDs and probabilities from the raw model outputs.
predicted_class_ids = np.argmax(predictions.predictions, axis=1)
probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions.predictions), dim=-1).numpy()
# Confidence of the winning class for each row.
max_probabilities = np.max(probabilities, axis=1)

# Guard against silently mis-attributed labels: there must be exactly one
# prediction per dataframe row.
if len(predicted_class_ids) != len(df):
    raise ValueError(
        f"Got {len(predicted_class_ids)} predictions for {len(df)} rows; "
        "predictions and dataframe are out of sync."
    )

# Map IDs back to string labels (cast numpy integers to plain int keys).
id2label = model.config.id2label
predicted_labels = [id2label[int(i)] for i in predicted_class_ids]

# Attach the new columns via assign() so we never write into a filtered slice
# of the loaded frame (avoids SettingWithCopyWarning when rows were dropped).
df = df.assign(
    emotion_predicted=predicted_labels,
    emotion_score=max_probabilities,
)

# Save the final, enriched dataset
df.to_csv(OUTPUT_FILE, index=False)

--- Inference complete. Processing and saving final results. ---


In [8]:
# Final status report: completion banner, where the CSV was written, and a
# five-row preview of the key columns. One print call with a newline separator
# emits the same text as four consecutive prints.
print(
    "\n--- MISSION COMPLETE ---",
    f"Successfully generated predictions and saved the final dataset to '{OUTPUT_FILE}'",
    "\nSample of the final output:",
    df.loc[:, ['commit', 'message', 'emotion_predicted', 'emotion_score']].head(),
    sep="\n",
)


--- MISSION COMPLETE ---
Successfully generated predictions and saved the final dataset to '/kaggle/working/commits_with_emotions_final.csv'

Sample of the final output:
                                     commit  \
0  c8a5a98a200c982e586419f484df3cd7c6c41c0a   
1  e234c80cc636723fa6c4e832b34ae3f88d235323   
2  18759308740a0bfc18d754d16dfe1b49b0773aa1   
3  c94c4807e5a561dbb5fe39fe62b5e53dca7b2c15   
4  c6a68d91631d51ab973380855de8c5cdcced14f3   

                                             message emotion_predicted  \
0  roll fuchsia sdk from to if this roll has caus...           caution   
1  mark instantstarttesttestfeedplaceholdervisibi...           neutral   
2  android rename android_webview glue_java allow...           neutral   
3  browseraccessibilityinstance_active is not nee...      satisfaction   
4  roll srcinternal from to deceec revision if th...           caution   

   emotion_score  
0       0.999386  
1       0.998442  
2       0.593868  
3       0.941682  
4     