# UID Test Components - For Colab

Test each function individually.

In [None]:
# SETUP

import os
import math
import random
from collections import defaultdict, Counter
from itertools import permutations
import numpy as np
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

random.seed(42)
np.random.seed(42)

In [None]:
# TEST 1: Token and Sentence classes

class Token:
    """One token line of a CoNLL-U sentence, kept as plain attributes.

    The constructor takes the ten CoNLL-U columns in order and stores each
    one unchanged under an attribute of the same name.
    """

    def __init__(self, id, form, lemma, upos, xpos, feats, head, deprel, deps, misc):
        # Position and lexical columns.
        self.id, self.form, self.lemma = id, form, lemma
        # Part-of-speech tags and morphological features.
        self.upos, self.xpos, self.feats = upos, xpos, feats
        # Dependency columns.
        self.head, self.deprel = head, deprel
        # Enhanced dependencies and miscellaneous column.
        self.deps, self.misc = deps, misc

class Sentence:
    """A sentence: its identifier, its raw text and its list of tokens."""

    def __init__(self, sent_id, text, tokens):
        self.sent_id, self.text, self.tokens = sent_id, text, tokens

    def get_root(self):
        """Return the first token whose ``head`` equals 0, or None if none does."""
        return next((tok for tok in self.tokens if tok.head == 0), None)

    def get_word_sequence(self):
        """Return the token forms ordered by ascending token id.

        The stored token list is left untouched; a copy is sorted.
        """
        ordered = list(self.tokens)
        ordered.sort(key=lambda tok: tok.id)
        return [tok.form for tok in ordered]

# Test: build a two-token sentence by hand and exercise both Sentence helpers.
# tok2 has head 0, so it is the token get_root() is expected to return.
# NOTE(review): the sentence text contains a third word ('है') for which no
# token is supplied, so get_word_sequence() yields only two forms.
tok1 = Token(1, 'राम', 'राम', 'PROPN', 'NNP', {}, 2, 'nsubj', '_', '_')
tok2 = Token(2, 'खाता', 'खा', 'VERB', 'VM', {}, 0, 'root', '_', '_')
sent = Sentence('s1', 'राम खाता है', [tok1, tok2])

# Print the results for visual inspection; nothing is asserted here.
print(f"Root: {sent.get_root().form}")
print(f"Words: {sent.get_word_sequence()}")
print('Test 1 passed!')

Root: खाता
Words: ['राम', 'खाता']
Test 1 passed!


In [None]:
# TEST 2: CoNLL-U Parser

def parse_feats(feats_str):
    """Convert a CoNLL-U FEATS string into a dict.

    ``'Case=Nom|Number=Sing'`` becomes ``{'Case': 'Nom', 'Number': 'Sing'}``.
    An empty/None value or the placeholder ``'_'`` gives an empty dict, and
    any ``|``-separated item without an ``=`` is ignored. Only the first
    ``=`` of an item separates the key from the value.
    """
    if not feats_str or feats_str == '_':
        return {}

    parsed = {}
    for item in feats_str.split('|'):
        name, sep, val = item.partition('=')
        if sep:  # skip items that carry no '=' at all
            parsed[name] = val
    return parsed

def parse_conllu(filepath, max_sent=None):
    """Read a CoNLL-U file and return its sentences as ``Sentence`` objects.

    Args:
        filepath: Path of the CoNLL-U file; it is opened as UTF-8.
        max_sent: Stop after this many sentences. ``None`` (or any other
            falsy value such as 0) means no limit.

    Returns:
        A list of ``Sentence`` objects, each holding ``Token`` objects for
        the ordinary word lines. Multiword-token ranges (id containing
        ``-``) and empty nodes (id containing ``.``) are skipped, as is any
        line that does not have exactly ten tab-separated fields.

    Raises:
        ValueError: If the id or head column of a word line is not an
            integer (``int()`` is applied to both).
    """
    sentences = []
    current_tokens = []
    sent_id = ''
    text = ''

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                # Compare the metadata key exactly. A prefix test would let
                # comments such as "# text_en = ..." or "# text_translit = ..."
                # overwrite the sentence text (and likewise for sent_id).
                key, _, value = line[1:].partition('=')
                key = key.strip()
                if key == 'sent_id':
                    sent_id = value.strip()
                elif key == 'text':
                    text = value.strip()
            elif line == '':
                # A blank line closes the sentence collected so far.
                if current_tokens:
                    sentences.append(Sentence(sent_id, text, current_tokens))
                    current_tokens = []
                    sent_id = ''
                    text = ''
                    if max_sent and len(sentences) >= max_sent:
                        break
            else:
                fields = line.split('\t')
                if len(fields) == 10 and '-' not in fields[0] and '.' not in fields[0]:
                    token = Token(
                        id=int(fields[0]), form=fields[1], lemma=fields[2],
                        upos=fields[3], xpos=fields[4], feats=parse_feats(fields[5]),
                        # An unspecified head ('_') is stored as 0, the same
                        # value that marks the root.
                        head=int(fields[6]) if fields[6] != '_' else 0,
                        deprel=fields[7], deps=fields[8], misc=fields[9]
                    )
                    current_tokens.append(token)

    # The last sentence may not be followed by a blank line.
    if current_tokens:
        sentences.append(Sentence(sent_id, text, current_tokens))
    return sentences

# Create test file: one three-token sentence in CoNLL-U format.
sample = '''# sent_id = 1
# text = राम खाता है
1\tराम\tराम\tPROPN\tNNP\tCase=Nom\t2\tnsubj\t_\t_
2\tखाता\tखा\tVERB\tVM\t_\t0\troot\t_\t_
3\tहै\tहै\tAUX\tVAUX\t_\t2\taux\t_\t_
'''
# Write with an explicit UTF-8 encoding: the fixture contains Devanagari text
# and is read back as UTF-8, so relying on the platform default encoding could
# fail or produce mismatched bytes on a non-UTF-8 locale.
with open('/tmp/test.conllu', 'w', encoding='utf-8') as f:
    f.write(sample)

# Parse the file back and print what was recovered for visual inspection.
sents = parse_conllu('/tmp/test.conllu')
print(f'Parsed {len(sents)} sentence(s)')
print(f'Text: {sents[0].text}')
print(f'Words: {sents[0].get_word_sequence()}')
print('Test 2 passed!')

Parsed 1 sentence(s)
Text: राम खाता है
Words: ['राम', 'खाता', 'है']
Test 2 passed!


In [None]:
# TEST 2B: CoNLL-U parsing with `conllu` library

import sys
import subprocess

# Import the third-party `conllu` parser; if it is not installed, install it
# with pip for the running interpreter and retry the import.
try:
    from conllu import parse_incr
except ImportError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'conllu'])
    from conllu import parse_incr

# Re-read the fixture written by the previous test cell, this time with the
# library parser, and materialise every sentence it yields.
with open('/tmp/test.conllu', 'r', encoding='utf-8') as f:
    parsed = list(parse_incr(f))

# Print the same summary as the hand-written parser test for comparison.
print(f'Parsed {len(parsed)} sentence(s) with conllu library')
print(f"Text: {parsed[0].metadata.get('text', '')}")
print(f"Words: {[tok['form'] for tok in parsed[0]]}")
print('Test 2B passed!')

Parsed 1 sentence(s) with conllu library
Text: राम खाता है
Words: ['राम', 'खाता', 'है']
Test 2B passed!


In [None]:
# TEST 3: Trigram Model

# N-gram statistics collected from the toy training corpus below.
unigram_counts, bigram_counts, trigram_counts = Counter(), Counter(), Counter()
vocab = set()
total_tokens = 0

train = [['राम', 'खाता', 'है'], ['सीता', 'खाती', 'है'], ['राम', 'सोता', 'है']]

for sent in train:
    # Pad with two start markers and one end marker so that every real word
    # has a full two-token history.
    toks = ['<s>', '<s>'] + sent + ['</s>']
    unigram_counts.update(toks)
    vocab.update(toks)
    # Sliding windows of width 2 and 3 over the padded sequence.
    bigram_counts.update(zip(toks, toks[1:]))
    trigram_counts.update(zip(toks, toks[1:], toks[2:]))
    # Only the real words count towards the token total, not the padding.
    total_tokens += len(sent)

vocab_size = len(vocab)

def get_prob(word, prev1, prev2):
    """Interpolated trigram probability of ``word`` after ``prev2 prev1``.

    Mixes a trigram estimate (weight 0.6), a bigram estimate (weight 0.3)
    and an add-one unigram estimate (weight 0.1), all read from the
    module-level count tables. The result is never smaller than 1e-10.

    Args:
        word: The token being predicted.
        prev1: The token immediately before ``word``.
        prev2: The token before ``prev1``.
    """
    w_tri, w_bi, w_uni = 0.6, 0.3, 0.1

    # Trigram estimate; 0 when the two-token history was never seen.
    history_count = bigram_counts.get((prev2, prev1), 0)
    if history_count > 0:
        p_tri = trigram_counts.get((prev2, prev1, word), 0) / history_count
    else:
        p_tri = 0

    # Bigram estimate; an unseen history divides by 1 to avoid a zero division.
    p_bi = bigram_counts.get((prev1, word), 0) / unigram_counts.get(prev1, 1)

    # Add-one smoothed unigram estimate.
    p_uni = (unigram_counts.get(word, 0) + 1) / (total_tokens + vocab_size)

    mixed = w_tri * p_tri + w_bi * p_bi + w_uni * p_uni
    return max(mixed, 1e-10)

def compute_surprisal(word_seq):
    """Return the surprisal in bits of each token of ``word_seq``.

    The sequence is padded with two ``<s>`` markers and a closing ``</s>``;
    one value is returned per word plus one for the ``</s>`` marker, each
    being ``-log2`` of the probability given by ``get_prob``.
    """
    padded = ['<s>', '<s>'] + word_seq + ['</s>']
    surprisals = []
    # Slide a three-token window: two tokens of history, then the target.
    for prev2, prev1, word in zip(padded, padded[1:], padded[2:]):
        surprisals.append(-math.log2(get_prob(word, prev1, prev2)))
    return surprisals

# Score a sentence that also appears in the training data. The result has
# four values: one per word plus one for the closing '</s>' marker.
s = compute_surprisal(['राम', 'खाता', 'है'])
print(f'Surprisal: {[round(x, 2) for x in s]}')
print(f'Total: {round(sum(s), 2)} bits')
print('Test 3 passed!')

Surprisal: [0.95, 1.11, 0.11, 0.11]
Total: 2.29 bits
Test 3 passed!


In [None]:
# TEST 4: UID Measures

def mean_info(info_list):
    """Return the arithmetic mean of ``info_list``, or 0 when it is empty."""
    if not info_list:
        return 0
    return sum(info_list) / len(info_list)

def uid_global(info_list):
    """Global UID score: the negated mean squared deviation from the mean.

    Returns 0 for sequences with fewer than two values. Values closer to 0
    mean the information values are spread more evenly.
    """
    n = len(info_list)
    if n <= 1:
        return 0
    mu = mean_info(info_list)
    squared_devs = [(value - mu) ** 2 for value in info_list]
    return -sum(squared_devs) / n

def uid_local(info_list):
    """Local UID score: negated sum of squared neighbour-to-neighbour changes.

    The sum runs over the ``len - 1`` adjacent pairs but is divided by the
    full sequence length. Returns 0 for fewer than two values.
    """
    n = len(info_list)
    if n <= 1:
        return 0
    jumps = [(cur - prev) ** 2 for prev, cur in zip(info_list, info_list[1:])]
    return -sum(jumps) / n

def uid_global_norm(info_list):
    """Mean-normalised global UID score.

    Each value is divided by the sequence mean before its squared distance
    from 1 is taken; the negated average of those distances is returned.
    Returns 0 for fewer than two values or when the mean is 0.
    """
    n = len(info_list)
    if n <= 1:
        return 0
    mu = mean_info(info_list)
    if mu == 0:
        return 0
    relative_devs = [((value / mu) - 1) ** 2 for value in info_list]
    return -sum(relative_devs) / n

def uid_local_norm(info_list):
    """Mean-normalised local UID score.

    Each squared change between neighbours is divided by the squared
    sequence mean; the negated sum is divided by the sequence length.
    Returns 0 for fewer than two values or when the mean is 0.
    """
    n = len(info_list)
    if n <= 1:
        return 0
    mu = mean_info(info_list)
    if mu == 0:
        return 0
    scaled_jumps = [
        ((cur - prev) ** 2) / (mu ** 2)
        for prev, cur in zip(info_list, info_list[1:])
    ]
    return -sum(scaled_jumps) / n

def uid_local_prev_norm(info_list):
    """Local UID score with each change normalised by the preceding value.

    For every adjacent pair whose first value is non-zero, the squared
    distance of ``current / previous`` from 1 is collected. The negated sum
    is divided by the full sequence length. Returns 0 for fewer than two
    values or when no pair could be used.
    """
    n = len(info_list)
    if n <= 1:
        return 0
    ratio_devs = [
        ((cur / prev) - 1) ** 2
        for prev, cur in zip(info_list, info_list[1:])
        if prev != 0  # a zero predecessor cannot be used as a divisor
    ]
    if not ratio_devs:
        return 0
    return -sum(ratio_devs) / n

# Hand-picked information values used to exercise all five UID measures.
info = [2.5, 3.0, 2.8, 3.2, 2.9]
print(f'Info: {info}')
# Each score is rounded to four decimals for display only.
print(f'UIDglob: {round(uid_global(info), 4)}')
print(f'UIDloc: {round(uid_local(info), 4)}')
print(f'UIDglobNorm: {round(uid_global_norm(info), 4)}')
print(f'UIDlocNorm: {round(uid_local_norm(info), 4)}')
print(f'UIDlocPrevNorm: {round(uid_local_prev_norm(info), 4)}')
print('Test 4 passed!')

Info: [2.5, 3.0, 2.8, 3.2, 2.9]
UIDglob: -0.0536
UIDloc: -0.108
UIDglobNorm: -0.0065
UIDlocNorm: -0.013
UIDlocPrevNorm: -0.0147
Test 4 passed!


In [None]:
# TEST 5: Variant Generator

def get_preverbal_constituents(sent):
    """Split the tokens before the root into reorderable constituents.

    Every preverbal token is assigned to its topmost preverbal ancestor,
    found by following ``head`` links until reaching a token that depends
    directly on the root (or whose head is missing, lies after the root, or
    would close a cycle). Tokens sharing that ancestor form one constituent.

    Grouping by the immediate ``head`` instead would put all direct
    dependents of the verb (e.g. subject and object) into a single group,
    leaving nothing to permute in an ordinary clause.

    Args:
        sent: Object exposing ``get_root()`` and a ``tokens`` list whose
            items have ``id`` and ``head`` attributes.

    Returns:
        A list of constituents in sentence order; each constituent is a list
        of tokens sorted by id. Empty when the sentence has no root.
    """
    root = sent.get_root()
    if not root:
        return []

    by_id = {t.id: t for t in sent.tokens}
    preverbal = sorted((t for t in sent.tokens if t.id < root.id), key=lambda t: t.id)

    def top_ancestor_id(tok):
        # Climb towards the root, staying among preverbal tokens.
        visited = {tok.id}
        while tok.head != root.id:
            parent = by_id.get(tok.head)
            if parent is None or parent.id >= root.id or parent.id in visited:
                break  # dangling head, postverbal head, or a cycle
            visited.add(parent.id)
            tok = parent
        return tok.id

    groups = defaultdict(list)
    for t in preverbal:  # already id-ordered, so each group stays sorted
        groups[top_ancestor_id(t)].append(t)

    # Order constituents by their first token so that the identity
    # permutation reproduces the original word order.
    return sorted(groups.values(), key=lambda g: g[0].id)

def generate_variant(sent, perm):
    """Build a reordered copy of ``sent`` with its preverbal part permuted.

    The preverbal constituents are emitted in the order given by ``perm``,
    followed by the root and then the postverbal tokens in their original
    order. Tokens are renumbered from 1 and their ``head`` values are
    remapped to the new numbering, so the variant keeps the dependency
    structure of the source sentence and still has a single root.

    Args:
        sent: The source ``Sentence``.
        perm: Sequence of constituent indices; indices that are not smaller
            than the number of constituents are ignored.

    Returns:
        A new ``Sentence`` whose id is ``"<sent_id>_var"`` and whose text is
        the space-joined forms of the reordered tokens.
    """
    consts = get_preverbal_constituents(sent)
    root = sent.get_root()
    post = [t for t in sent.tokens if root and t.id > root.id]

    # Original tokens in their new linear order.
    ordered = [t for ci in perm if ci < len(consts) for t in consts[ci]]
    if root:
        ordered.append(root)
    ordered.extend(post)

    # Old id -> new id, used to keep head references valid after renumbering.
    id_map = {t.id: new_id for new_id, t in enumerate(ordered, start=1)}

    new_toks = [
        Token(
            new_id, t.form, t.lemma, t.upos, t.xpos, t.feats,
            # The root's head 0 is absent from the map and so stays 0; a head
            # that points at a token missing from the variant also becomes 0.
            id_map.get(t.head, 0),
            t.deprel, t.deps, t.misc,
        )
        for new_id, t in enumerate(ordered, start=1)
    ]
    return Sentence(f"{sent.sent_id}_var", ' '.join([t.form for t in new_toks]), new_toks)

def generate_variants(sent, max_var=99):
    """Return reordered variants of ``sent``, at most ``max_var`` of them.

    Every permutation of the preverbal constituents except the original
    order is a candidate. When there are more candidates than ``max_var``
    they are shuffled with the ``random`` module and the first ``max_var``
    are kept. Sentences with at most one preverbal constituent give ``[]``.
    """
    n_consts = len(get_preverbal_constituents(sent))
    if n_consts <= 1:
        return []

    original_order = tuple(range(n_consts))
    candidates = []
    for order in permutations(range(n_consts)):
        if order == original_order:
            continue  # the unchanged order is not a variant
        candidates.append(order)

    if len(candidates) > max_var:
        random.shuffle(candidates)
        candidates = candidates[:max_var]

    return [generate_variant(sent, order) for order in candidates]

# Fixture: subject and object both depend on the verb (token 3), which is the
# root; the auxiliary follows the verb. Every head refers to a token that is
# actually present, and every word of the text has a token.
t1 = Token(1, 'राम', 'राम', 'PROPN', 'NNP', {}, 3, 'nsubj', '_', '_')
t2 = Token(2, 'सेब', 'सेब', 'NOUN', 'NN', {}, 3, 'obj', '_', '_')
t3 = Token(3, 'खाता', 'खा', 'VERB', 'VM', {}, 0, 'root', '_', '_')
t4 = Token(4, 'है', 'है', 'AUX', 'VAUX', {}, 3, 'aux', '_', '_')
sent = Sentence('s1', 'राम सेब खाता है', [t1, t2, t3, t4])

# Named `variants` rather than `vars` so the builtin vars() is not shadowed.
variants = generate_variants(sent, 5)
print(f'Original: {sent.text}')
print(f'Generated {len(variants)} variants:')
for v in variants:
    print(f'  {v.text}')
print('Test 5 passed!')

Original: राम सेब खाता है
Generated 0 variants:
Test 5 passed!


In [None]:
# TEST 6: Pairwise Classifier

def make_diff(f1, f2):
    """Return ``f2[k] - f1[k]`` for every key of ``f1``, in sorted key order."""
    deltas = []
    for key in sorted(f1):
        deltas.append(f2[key] - f1[key])
    return deltas

def create_pairs(corp, var):
    """Build a pairwise-difference dataset from aligned feature dicts.

    For each aligned ``(corp, var)`` pair, the difference ``var - corp`` is
    emitted with label 0; after all of those, the reversed difference
    ``corp - var`` is emitted for each pair with label 1.

    Returns:
        ``(X, y)`` as NumPy arrays: the difference rows and their labels.
    """
    forward = [make_diff(c, v) for c, v in zip(corp, var)]
    backward = [make_diff(v, c) for c, v in zip(corp, var)]

    rows = forward + backward
    labels = [0] * len(forward) + [1] * len(backward)
    return np.array(rows), np.array(labels)

# Two aligned feature dicts per side: `corp` and `var` are paired by position.
corp = [{'s': 10, 'u': -5}, {'s': 12, 'u': -6}]
var = [{'s': 15, 'u': -3}, {'s': 18, 'u': -2}]

# Two pairs give four rows: two labelled 0 and two labelled 1.
X, y = create_pairs(corp, var)
clf = LogisticRegression(solver='lbfgs', max_iter=100)
clf.fit(X, y)
# The score is computed on the same rows the model was fitted on.
acc = clf.score(X, y)
print(f'Accuracy: {acc*100:.1f}%')
print('Test 6 passed!')

Accuracy: 100.0%
Test 6 passed!


In [None]:
# TEST 7: Cross Validation

def cross_val(X, y, n_folds=5):
    """Return the mean held-out accuracy of logistic regression over K folds.

    The rows are split with a shuffled ``KFold`` (fixed ``random_state=42``);
    a fresh ``LogisticRegression`` is fitted on each training part and scored
    on the matching held-out part.

    Args:
        X: Feature array, indexable by the index arrays ``KFold`` yields.
        y: Label array aligned with ``X``.
        n_folds: Number of folds.
    """
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    def fold_accuracy(train_idx, test_idx):
        # Train on one part, evaluate on the part that was held out.
        model = LogisticRegression(solver='lbfgs', max_iter=100)
        model.fit(X[train_idx], y[train_idx])
        return model.score(X[test_idx], y[test_idx])

    fold_scores = [fold_accuracy(tr, te) for tr, te in splitter.split(X)]
    return np.mean(fold_scores)

# Synthetic data: 20 rows of 3 normally distributed features, with 0/1 labels
# drawn independently of the features.
X = np.random.randn(20, 3)
y = np.random.randint(0, 2, 20)
# This only checks that 5-fold cross-validation runs end to end; the printed
# accuracy is not compared against any threshold.
acc = cross_val(X, y, 5)
print(f'CV Accuracy: {acc*100:.1f}%')
print('Test 7 passed!')

CV Accuracy: 55.0%
Test 7 passed!


In [None]:
# TEST 8: Correlation

def corr(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, skipping NaN pairs.

    A pair is dropped when either of its values is NaN. If fewer than two
    pairs remain, 0 is returned instead of calling ``pearsonr``.
    """
    kept = [
        (a, b)
        for a, b in zip(x, y)
        if not np.isnan(a) and not np.isnan(b)
    ]
    if len(kept) < 2:
        return 0

    xs = [a for a, _ in kept]
    ys = [b for _, b in kept]
    coefficient, _p_value = stats.pearsonr(xs, ys)
    return coefficient

# y is exactly 2 * x, so the expected correlation is 1.
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]
print(f'Correlation: {corr(x, y):.4f}')
print('Test 8 passed!')

# Final banner: reached only if no earlier statement raised.
print('\n=== ALL TESTS PASSED ===')

Correlation: 1.0000
Test 8 passed!

=== ALL TESTS PASSED ===
