In [1]:
!pip install datasets snorkel wandb torch torchvision transformers



In [2]:
import wandb
import pandas as pd
import re
from snorkel.labeling import labeling_function, PandasLFApplier
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from snorkel.labeling.model import MajorityLabelVoter

In [3]:
wandb.init(project="Q1-weak-supervision-ner")

[34m[1mwandb[0m: Currently logged in as: [33m142502005[0m ([33m142502005-iit-palakkad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
!pip install datasets==3.6.0




# Q1.
Use the CoNLL-2003 Named Entity Recognition dataset which contains four entity types:
- PER (Person names)
- LOC (Locations)
- ORG (Organizations)
- MISC (Miscellaneous entities)
- Load the CoNLL-2003 dataset using HuggingFace datasets (https://huggingface.co/datasets/eriktks/conll2003) and initialize a Weights & Biases project called "Q1-weak-supervision-ner". Log the dataset statistics (number of samples, entity distribution) to W&B as summary metrics

In [5]:
from datasets import load_dataset
dataset = load_dataset("conll2003")
train_data = dataset["train"]
print(train_data[0])


The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] Y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


# Q2.
Implement two basic labeling functions using Snorkel AI:
- a. A heuristic function detecting years (1900-2099) as potential DATE/MISC entities
- b. A pattern-matching function identifying organizations by common suffixes ("Inc.", "Corp.", "Ltd.")

Log each labeling function's coverage and accuracy to W&B using wandb.log()

In [6]:
# Convert to pandas for Snorkel
df = pd.DataFrame({'tokens': [' '.join(x) for x in train_data['tokens']],'ner_tags': [x for x in train_data['ner_tags']]})

In [7]:
# Compute simple stats
num_samples = len(df)
unique_labels = set([tag for sublist in df['ner_tags'] for tag in sublist])
entity_counts = {}
for row in df['ner_tags']:
    for tag in row:
        entity_counts[tag] = entity_counts.get(tag, 0) + 1



In [8]:
# Convert entity_counts keys to strings before logging
entity_counts_str = {str(k): v for k, v in entity_counts.items()}

wandb.summary['num_samples'] = num_samples
wandb.summary['unique_entities'] = len(unique_labels)
wandb.summary['entity_distribution'] = entity_counts_str

print("Dataset loaded")
print("Samples:", num_samples)
print("Entity types:", unique_labels)



Dataset loaded
Samples: 14041
Entity types: {0, 1, 2, 3, 4, 5, 6, 7, 8}


In [9]:
ABSTAIN = -1
MISC = 0
ORG = 1

@labeling_function()
def lf_year_detector(x):
    """Detect years between 1900 and 2099 as MISC"""
    if re.search(r"\b(19\d{2}|20\d{2})\b", x.tokens):
        return MISC
    else:
        return ABSTAIN

@labeling_function()
def lf_org_suffix(x):
    """Detect organizations by suffixes"""
    if re.search(r"\b(Inc\.|Corp\.|Ltd\.)\b", x.tokens):
        return ORG
    else:
        return ABSTAIN

lfs = [lf_year_detector, lf_org_suffix]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)

# Calculate LF stats
coverage = (L_train != ABSTAIN).mean(axis=0)
wandb.log({"lf_year_coverage": coverage[0],"lf_org_coverage": coverage[1]})
print("Labeling functions applied")

100%|██████████| 14041/14041 [00:00<00:00, 41391.20it/s]

Labeling functions applied





# Q3.
Implement Snorkell’s Label aggregation (Majority Label Voter)

In [10]:
majority_model = MajorityLabelVoter()
df['label'] = majority_model.predict(L=L_train)
print("Aggregated Labels ")
wandb.log({"aggregated_labels_count": df['label'].value_counts().to_dict()})

Aggregated Labels 


In [11]:
wandb.finish()
wandb.init(project="Q1-cifar-experiments")

0,1
lf_org_coverage,▁
lf_year_coverage,▁

0,1
lf_org_coverage,0.0
lf_year_coverage,0.10199
num_samples,14041.0
unique_entities,9.0


# Q4.
Implement the following in Weights and Bias:
- a. Train CIFAR 100 and CIFAR 10 sequentially for 100 epochs
- b. Train CIFAR 10 and CIFAR 100 sequentially for 100 epochs.

Write your observations with experimental proof.

In [12]:
# Define common training utilities
device = 'cuda' if torch.cuda.is_available() else 'cpu'
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])
def get_loader(dataset_name, batch_size=64):
    if dataset_name == "CIFAR10":
        dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    else:
        dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [13]:
# Simple CNN
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 8 * 8, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(-1, 64 * 8 * 8)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [14]:
def train_model(dataset_name, next_dataset_name, epochs=100):
    loader = get_loader(dataset_name)
    next_loader = get_loader(next_dataset_name)
    model = SimpleCNN(num_classes=10 if dataset_name == "CIFAR10" else 100).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        running_loss = 0.0
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        wandb.log({f"{dataset_name}_train_loss": running_loss / len(loader)})
    print(f"Finished training {dataset_name}, now training {next_dataset_name}")

In [15]:
# Train CIFAR100 then CIFAR10
train_model("CIFAR100", "CIFAR10")

100%|██████████| 169M/169M [00:02<00:00, 76.7MB/s]
100%|██████████| 170M/170M [00:02<00:00, 78.5MB/s]


Finished training CIFAR100, now training CIFAR10


In [16]:
# Train CIFAR10 then CIFAR100
train_model("CIFAR10", "CIFAR100")
print("Both experiment sequences completed")
wandb.finish()

Finished training CIFAR10, now training CIFAR100
Both experiment sequences completed


0,1
CIFAR100_train_loss,█▇▇▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
CIFAR10_train_loss,█▇▅▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
CIFAR100_train_loss,0.33831
CIFAR10_train_loss,0.04663
