In [1]:
# Install the necessary library
!pip install simpletransformers



In [1]:
# 2. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 3. Verify we can see your files
import os

# Define the path to your folder
# (Note: We use a formatted string because of the spaces in "data for nlp")
folder_path = '/content/drive/MyDrive/data for nlp'

# Check if the folder exists and list files
if os.path.exists(folder_path):
    print("\n✅ Folder found! Here are the files inside:")
    for file in os.listdir(folder_path):
        print(f" - {file}")
else:
    print(f"\n❌ Error: Could not find folder at {folder_path}")
    print("Please check if the folder name is exactly 'data for nlp' (case sensitive).")

Mounted at /content/drive

✅ Folder found! Here are the files inside:
 - test_df.csv
 - val_split.csv
 - train_split.csv


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

# ==============================================================================
# STEP 2: LOAD AND INSPECT DATA
# ==============================================================================

# Define the folder path
DATA_FOLDER = '/content/drive/MyDrive/data for nlp'

# 1. Load the three datasets
print("Loading datasets...")
try:
    df_train = pd.read_csv(f'{DATA_FOLDER}/train_split.csv')
    df_val   = pd.read_csv(f'{DATA_FOLDER}/val_split.csv')
    df_test  = pd.read_csv(f'{DATA_FOLDER}/test_df.csv')
    print("✅ Success! All files loaded.\n")
except FileNotFoundError as e:
    print(f"❌ Error: {e}")

# 2. Inspect the Columns
# We need to know the EXACT names of the columns to tell BERT where the text is.
print("--- COLUMN NAMES ---")
print(f"Train columns: {df_train.columns.tolist()}")
print(f"Val columns:   {df_val.columns.tolist()}")
print(f"Test columns:  {df_test.columns.tolist()}")

# 3. Inspect a Sample Row
print("\n--- SAMPLE ROW (TRAIN) ---")
print(df_train.iloc[0])

Loading datasets...
✅ Success! All files loaded.

--- COLUMN NAMES ---
Train columns: ['id', 'text', 'text_with_markers', 'relation_label', 'relation_type', 'entity1_text', 'entity2_text', 'label_id', 'num_tokens']
Val columns:   ['id', 'text', 'text_with_markers', 'relation_label', 'relation_type', 'entity1_text', 'entity2_text', 'label_id', 'num_tokens']
Test columns:  ['id', 'text', 'text_with_markers', 'relation_label', 'relation_type', 'entity1_text', 'entity2_text', 'label_id', 'num_tokens']

--- SAMPLE ROW (TRAIN) ---
id                                                                1939
text                 Even travel agents are turning to telepresence...
text_with_markers    Even [E1]travel agents[/E1] are turning to [E2...
relation_label                                Instrument-Agency(e2,e1)
relation_type                                        Instrument-Agency
entity1_text                                             travel agents
entity2_text                               

In [4]:
import pandas as pd
import numpy as np
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report
from scipy.special import softmax

# ==============================================================================
# STEP 3: FORMAT DATA & TRAIN MODEL
# ==============================================================================

print("--- 1. PREPARING DATA ---")

# A. Define the Column Mapping
# We map your specific column names to what the library expects ("text" and "labels")
TEXT_COLUMN = 'text_with_markers'
LABEL_COLUMN = 'relation_type'

# B. Create the Label Encoder (Text -> Number)
# We combine all labels from all sets to make sure we have a complete list
all_labels = sorted(list(set(df_train[LABEL_COLUMN].unique()) |
                         set(df_val[LABEL_COLUMN].unique()) |
                         set(df_test[LABEL_COLUMN].unique())))

# Create a dictionary: {'Cause-Effect': 0, 'Component-Whole': 1, ...}
label_map = {label: i for i, label in enumerate(all_labels)}

print(f"Detected {len(all_labels)} unique relations.")
print(f"Label Map (First 3): {dict(list(label_map.items())[:3])}")

# C. Format the Dataframes
def format_for_bert(df):
    return pd.DataFrame({
        'text': df[TEXT_COLUMN].astype(str),
        'labels': df[LABEL_COLUMN].map(label_map)
    })

train_df = format_for_bert(df_train)
eval_df  = format_for_bert(df_val)
test_df  = format_for_bert(df_test)

print(f"Training on {len(train_df)} samples.")
print(f"Validating on {len(eval_df)} samples.")
print(f"Testing on {len(test_df)} samples.")


# ==============================================================================
# STEP 4: CONFIGURE & TRAIN ROBERTA
# ==============================================================================
print("\n--- 2. INITIALIZING & TRAINING MODEL ---")

# Configure the model (Standard settings for Relation Extraction)
model_args = ClassificationArgs()
model_args.num_train_epochs = 4              # 4 loops through the data
model_args.train_batch_size = 16             # Standard batch size for GPU
model_args.eval_batch_size = 32
model_args.overwrite_output_dir = True       # Overwrite previous runs
model_args.save_model_every_epoch = False    # Save space
model_args.learning_rate = 3e-5              # Optimal learning rate for RoBERTa
model_args.max_seq_length = 128              # Max length of sentences

# Initialize RoBERTa
model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=len(all_labels),
    args=model_args,
    use_cuda=torch.cuda.is_available()       # Use GPU if available
)

# TRAIN! (This will take 5-10 minutes)
# We use eval_df to monitor performance during training
model.train_model(train_df, eval_data=eval_df)


# ==============================================================================
# STEP 5: FINAL EVALUATION (MILESTONE 2 RESULT)
# ==============================================================================
print("\n--- 3. EVALUATING ON TEST SET ---")

# Predict on the Test Set
predictions, raw_outputs = model.predict(test_df['text'].tolist())

# Convert numbers back to text labels (0 -> 'Cause-Effect')
pred_names = [all_labels[p] for p in predictions]
true_names = [all_labels[l] for l in test_df['labels'].tolist()]

# PRINT THE REPORT
# This is the table you need for your Milestone 2 submission
print("\n=== CLASSIFICATION REPORT ===")
print(classification_report(true_names, pred_names))


# ==============================================================================
# STEP 6: SAVE ARTIFACTS (FOR MILESTONE 3)
# ==============================================================================
print("\n--- 4. SAVING HIGH-CONFIDENCE PREDICTIONS ---")

# Calculate probability %
probs = softmax(raw_outputs, axis=1)
confidence = np.max(probs, axis=1)

# Create a results DataFrame
results_df = pd.DataFrame({
    'text': df_test['text'],         # Original raw text (cleaner for rules)
    'true_label': true_names,
    'predicted_label': pred_names,
    'confidence': confidence
})

# Filter: Correctly predicted AND >95% confident
high_conf_df = results_df[
    (results_df['predicted_label'] == results_df['true_label']) &
    (results_df['confidence'] > 0.95)
]

# Save to CSV
high_conf_df.to_csv("high_confidence_predictions.csv", index=False)
print(f"✅ Saved {len(high_conf_df)} high-confidence examples to 'high_confidence_predictions.csv'")

--- 1. PREPARING DATA ---
Detected 10 unique relations.
Label Map (First 3): {'Cause-Effect': 0, 'Component-Whole': 1, 'Content-Container': 2}
Training on 6800 samples.
Validating on 1200 samples.
Testing on 2717 samples.

--- 2. INITIALIZING & TRAINING MODEL ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 4:   0%|          | 0/425 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 4:   0%|          | 0/425 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/425 [00:00<?, ?it/s]

Running Epoch 4 of 4:   0%|          | 0/425 [00:00<?, ?it/s]


--- 3. EVALUATING ON TEST SET ---


  0%|          | 0/5 [00:00<?, ?it/s]

Predicting:   0%|          | 0/85 [00:00<?, ?it/s]

  with amp.autocast():



=== CLASSIFICATION REPORT ===
                    precision    recall  f1-score   support

      Cause-Effect       0.94      0.95      0.94       328
   Component-Whole       0.87      0.91      0.89       312
 Content-Container       0.90      0.89      0.90       192
Entity-Destination       0.91      0.95      0.93       292
     Entity-Origin       0.87      0.90      0.89       258
 Instrument-Agency       0.80      0.79      0.79       156
 Member-Collection       0.83      0.88      0.86       233
     Message-Topic       0.85      0.95      0.90       261
             Other       0.69      0.57      0.62       454
  Product-Producer       0.85      0.87      0.86       231

          accuracy                           0.85      2717
         macro avg       0.85      0.86      0.86      2717
      weighted avg       0.84      0.85      0.85      2717


--- 4. SAVING HIGH-CONFIDENCE PREDICTIONS ---
✅ Saved 1970 high-confidence examples to 'high_confidence_predictions.csv'
