In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm
import numpy as np
import nltk
import gensim.downloader as api
import random
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Fetch the NLTK resources used below: the Brown corpus and the
# 'universal_tagset' resource. `quiet=True` keeps the downloader from
# printing its progress log into the cell output.
nltk.download('brown', quiet=True)
nltk.download('universal_tagset', quiet=True)

# Set a random seed for reproducibility.
# The same SEED is fed to Python's `random`, NumPy, and PyTorch (default and
# CUDA generators); it is also reused as `random_state` for the data splits.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Request deterministic cuDNN kernels (only relevant when running on a GPU).
torch.backends.cudnn.deterministic = True

# Device configuration: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- 1. Data Loading and Vocabulary Creation ---
# Brown corpus sentences, each a list of (word, tag) pairs using the
# universal tagset.
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')

# Flat, corpus-wide token and tag lists. Words are lower-cased; tags are kept
# exactly as provided.
all_words = [token.lower() for tagged_sent in corpus for token, _ in tagged_sent]
all_tags = [pos for tagged_sent in corpus for _, pos in tagged_sent]

word_counts = Counter(all_words)
tag_counts = Counter(all_tags)

# Word ids: real words receive 2, 3, ... in first-seen order, while 0 and 1
# are reserved for the padding and unknown-word tokens.
# NOTE(review): the vocabulary is built from the whole corpus (the
# train/val/test split happens later), so every corpus word has its own id
# and the '<UNK>' fallback in the lookup below is never taken for this data.
word_to_idx = {token: idx for idx, token in enumerate(word_counts, start=2)}
word_to_idx.update({'<PAD>': 0, '<UNK>': 1})

# Tag ids: real tags receive 1, 2, ... in first-seen order; 0 is padding.
tag_to_idx = {pos: idx for idx, pos in enumerate(tag_counts, start=1)}
tag_to_idx['<PAD>'] = 0

# Reverse lookup from tag id back to tag name.
idx_to_tag = dict(zip(tag_to_idx.values(), tag_to_idx.keys()))

# --- 2. Convert Sentences to Indices and Pad ---
# Encode each sentence as a (word ids, tag ids) pair of equal-length lists.
sequences = []
for sent in corpus:
    word_indices, tag_indices = [], []
    for word, tag in sent:
        word_indices.append(word_to_idx.get(word.lower(), word_to_idx['<UNK>']))
        tag_indices.append(tag_to_idx[tag])
    sequences.append((word_indices, tag_indices))

# Longest sentence (in tokens); every sequence is later padded to this length.
MAX_LEN = max(len(word_ids) for word_ids, _ in sequences)
print(f"Maximum sentence length: {MAX_LEN}")

def pad_sequences(sequences, max_len, word_pad_idx, tag_pad_idx):
    """Right-pad (or truncate) encoded sentences and their tags to a fixed length.

    Args:
        sequences: Iterable of ``(word_indices, tag_indices)`` pairs, each
            element being a sequence of ints.
        max_len: Length of every output row. Shorter inputs are padded on the
            right; longer inputs are cut off after ``max_len`` items.
        word_pad_idx: Fill value used to pad the word-index rows.
        tag_pad_idx: Fill value used to pad the tag-index rows.

    Returns:
        A tuple ``(padded_sents, padded_tags)`` of ``int64`` NumPy arrays, both
        of shape ``(number_of_sequences, max_len)`` (also for empty input).
    """
    padded_sents = []
    padded_tags = []
    for s, t in sequences:
        # Slice before padding: without this an over-long sequence would keep
        # its full length and make the rows ragged, which np.array cannot turn
        # into a rectangular integer matrix.
        s = list(s[:max_len])
        t = list(t[:max_len])
        padded_sents.append(s + [word_pad_idx] * (max_len - len(s)))
        padded_tags.append(t + [tag_pad_idx] * (max_len - len(t)))

    # Explicit dtype keeps the result independent of the platform's default
    # integer width; the reshape only matters for empty input, where np.array
    # would otherwise return a 1-D array of shape (0,).
    shape = (len(padded_sents), max_len)
    return (
        np.array(padded_sents, dtype=np.int64).reshape(shape),
        np.array(padded_tags, dtype=np.int64).reshape(shape),
    )

# Pad every encoded sentence and its tag sequence to MAX_LEN, using the
# '<PAD>' id of the word vocabulary and of the tag vocabulary respectively.
padded_sents, padded_tags = pad_sequences(sequences, MAX_LEN, word_to_idx['<PAD>'], tag_to_idx['<PAD>'])

# --- 3. Train, Validation, Test Split ---
# Two-stage split: first hold out 20% of the sentences as the test set, then
# take 10% of the remaining 80% as the validation set (roughly 72% / 8% / 20%
# overall). Both calls reuse SEED as `random_state` so the partition is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(padded_sents, padded_tags, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=SEED)

# Report the number of sentences in each partition.
print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

# --- 4. Create PyTorch Dataset and DataLoader ---
class PosTaggingDataset(Dataset):
    """Dataset pairing rows of word indices with rows of tag indices.

    Args:
        sentences: Array-like of integer word indices, one row per sentence.
        tags: Array-like of integer tag indices, aligned row-for-row with
            ``sentences``.

    Raises:
        ValueError: If ``sentences`` and ``tags`` contain a different number
            of rows.
    """

    def __init__(self, sentences, tags):
        # torch.as_tensor with an explicit dtype replaces the legacy
        # torch.LongTensor constructor: it always interprets its argument as
        # data and yields int64 tensors for any integer input.
        self.sentences = torch.as_tensor(sentences, dtype=torch.long)
        self.tags = torch.as_tensor(tags, dtype=torch.long)
        # Fail at construction time rather than with an IndexError (or with
        # silently ignored rows) in the middle of an epoch.
        if len(self.sentences) != len(self.tags):
            raise ValueError(
                f"sentences and tags must have the same number of rows, "
                f"got {len(self.sentences)} and {len(self.tags)}"
            )

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the ``(word_indices, tag_indices)`` tensors at ``idx``."""
        return self.sentences[idx], self.tags[idx]

# Number of sentences per mini-batch, shared by all three loaders.
BATCH_SIZE = 64

# Wrap each (features, labels) partition in a PosTaggingDataset.
train_dataset, val_dataset, test_dataset = (
    PosTaggingDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; validation and test batches keep the
# dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE)
    for split in (val_dataset, test_dataset)
)

Using device: cpu
Maximum sentence length: 180
Train size: 41284
Validation size: 4588
Test size: 11468


In [7]:
from collections import Counter

# Requires `all_tags` (the flat list of corpus tags) from the preprocessing cell.
tag_counts = Counter(all_tags)

# One line per tag, ordered from most to least frequent; ties keep the order
# in which the tags were first seen.
for tag, count in sorted(tag_counts.items(), key=lambda item: item[1], reverse=True):
    print(f"{tag:<6} - {count:>6} instances")

NOUN   - 275558 instances
VERB   - 182750 instances
.      - 147565 instances
ADP    - 144766 instances
DET    - 137019 instances
ADJ    -  83721 instances
ADV    -  56239 instances
PRON   -  49334 instances
CONJ   -  38151 instances
PRT    -  29829 instances
NUM    -  14874 instances
X      -   1386 instances


In [None]:
import json

# Requires `word_to_idx`, `tag_to_idx` and `MAX_LEN` from the preprocessing
# cell. They are bundled into one mapping and written out as JSON.
artifacts = dict(
    word_to_idx=word_to_idx,
    tag_to_idx=tag_to_idx,
    MAX_LEN=MAX_LEN,
)

# Serialise with 4-space indentation into 'artifacts.json' in the current
# working directory, replacing any existing file of that name.
with open('artifacts.json', 'w') as f:
    f.write(json.dumps(artifacts, indent=4))

print("Artifacts saved successfully to 'artifacts.json'!")


Artifacts saved successfully to 'artifacts.json'!
