### Text Categorization


In [1]:
import numpy as np
from collections import Counter, defaultdict
import math, re

In [2]:
# Toy corpus: the first four sentences are about animals, the last four about vehicles.
documents = [
    # animal
    "Cats are playful and love sleeping",
    "Dogs are loyal and friendly animals",
    "Many families keep cats or dogs as pets",
    "A kitten is a young domestic cat",
    # vehicle
    "This car is very fast on the highway",
    "Electric vehicles are eco friendly",
    "The new sports car has a powerful engine",
    "Cars consume fuel or electricity to run",
]

# One label per document, in the same order as `documents`.
labels = ["animal"] * 4 + ["vehicle"] * 4

#### Preprocessing & Vocabulary Building

In [3]:
def preprocess(text):
    """Turn a raw sentence into a list of lower-case word tokens.

    The text is lower-cased, every character that is not ``a``-``z`` or
    whitespace is deleted (not replaced by a space, so ``"eco-friendly"``
    becomes the single token ``"ecofriendly"``), and the result is split on
    whitespace.
    """
    letters_and_spaces = re.sub(r"[^a-z\s]", "", text.lower())
    return letters_and_spaces.split()

# Tokenise every document once; the token lists feed the classifiers below.
docs_tokens = list(map(preprocess, documents))

# Vocabulary: the distinct tokens of the whole corpus, in alphabetical order.
vocab = sorted(set().union(*docs_tokens))
print("Vocabulary:", vocab)

Vocabulary: ['a', 'and', 'animals', 'are', 'as', 'car', 'cars', 'cat', 'cats', 'consume', 'dogs', 'domestic', 'eco', 'electric', 'electricity', 'engine', 'families', 'fast', 'friendly', 'fuel', 'has', 'highway', 'is', 'keep', 'kitten', 'love', 'loyal', 'many', 'new', 'on', 'or', 'pets', 'playful', 'powerful', 'run', 'sleeping', 'sports', 'the', 'this', 'to', 'vehicles', 'very', 'young']


####  Naive Bayes Classification


In [4]:
def train_nb(docs_tokens, labels, vocab):
    """Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Parameters
    ----------
    docs_tokens : sequence of token lists, one per training document.
    labels : sequence of class labels aligned with ``docs_tokens``.
    vocab : sequence of words the model estimates likelihoods for.

    Returns
    -------
    priors : dict mapping class -> fraction of training documents in it.
    likelihood : dict mapping class -> {word: P(word | class)} for every
        word in ``vocab``.

    Both dictionaries list the classes in order of first appearance in
    ``labels`` so that results (and any downstream tie-breaking that follows
    dictionary order) are reproducible across kernel restarts; iterating a
    ``set`` of strings is not.
    """
    labels = list(labels)
    vocab = list(vocab)
    known_words = set(vocab)

    class_freq = Counter(labels)          # one pass instead of labels.count per class
    classes = list(dict.fromkeys(labels))  # deterministic: first-appearance order
    priors = {c: class_freq[c] / len(labels) for c in classes}

    # Only tokens that belong to the vocabulary are counted. Counting
    # out-of-vocabulary tokens would inflate the denominator below and the
    # smoothed likelihoods of a class would no longer sum to 1.
    word_counts = {c: Counter() for c in classes}
    for tokens, lab in zip(docs_tokens, labels):
        word_counts[lab].update(w for w in tokens if w in known_words)

    V = len(vocab)
    likelihood = {}
    for c in classes:
        total_in_vocab = sum(word_counts[c].values())
        likelihood[c] = {
            w: (word_counts[c][w] + 1) / (total_in_vocab + V) for w in vocab
        }

    return priors, likelihood

def predict_nb(tokens, priors, likelihood, vocab):
    """Return the most probable class for a tokenised document.

    For each class the score is ``log P(class)`` plus the sum of
    ``log P(word | class)`` over the tokens that occur in ``vocab``; tokens
    outside the vocabulary are ignored. Working in log space avoids
    underflow from multiplying many small probabilities.

    Parameters
    ----------
    tokens : iterable of word strings.
    priors : dict class -> prior probability.
    likelihood : dict class -> {word: probability}.
    vocab : collection of known words (a list, set or frozenset).

    Returns
    -------
    The class with the highest score; on an exact tie the class that comes
    first in ``priors`` wins.

    Raises
    ------
    ValueError
        If ``priors`` is empty.
    """
    if not priors:
        raise ValueError("predict_nb needs at least one class in priors")

    # Hash-based membership: a list vocabulary would be scanned linearly for
    # every token of every document.
    known_words = vocab if isinstance(vocab, (set, frozenset)) else set(vocab)
    in_vocab_tokens = [w for w in tokens if w in known_words]

    # Used only when a vocabulary word is missing from a class's likelihood
    # table. Computed lazily so an empty vocabulary never divides by zero.
    fallback = 1 / len(vocab) if in_vocab_tokens else None

    scores = {}
    for c in priors:
        score = math.log(priors[c])
        for w in in_vocab_tokens:
            score += math.log(likelihood[c].get(w, fallback))
        scores[c] = score
    return max(scores, key=scores.get)

# Fit Naive Bayes on the whole corpus, then classify the same training documents.
priors, likelihood = train_nb(docs_tokens, labels, vocab)
nb_preds = list(
    map(lambda toks: predict_nb(toks, priors, likelihood, vocab), docs_tokens)
)
print("Naive Bayes Predictions:", nb_preds)

Naive Bayes Predictions: ['animal', 'animal', 'animal', 'animal', 'vehicle', 'vehicle', 'vehicle', 'vehicle']


#### KNN Classification


In [5]:
def vectorize(tokens, vocab):
    """Build a bag-of-words count vector for one document.

    Position ``i`` of the returned float array holds how many times
    ``vocab[i]`` occurs in ``tokens``; tokens not in ``vocab`` are ignored.
    """
    occurrences = Counter(tokens)
    # Counter returns 0 for absent words, so no membership test is needed.
    return np.array([occurrences[word] for word in vocab], dtype=float)

# Document-term matrix: one bag-of-words row per document, one column per vocabulary word.
doc_vectors = np.array(
    list(map(lambda toks: vectorize(toks, vocab), docs_tokens))
)

def cosine_sim(v1, v2):
    """Cosine similarity between two vectors.

    Returns ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero
    length the similarity is defined here as ``0.0`` rather than dividing
    by zero, so the function always returns a float.
    """
    # Each norm is computed once and reused for both the zero check and the
    # final division.
    norm1 = np.linalg.norm(v1)
    if norm1 == 0:
        return 0.0
    norm2 = np.linalg.norm(v2)
    if norm2 == 0:
        return 0.0
    return np.dot(v1, v2) / (norm1 * norm2)

def predict_knn(test_vec, train_vecs, labels, k=3):
    """Classify ``test_vec`` by majority vote among its ``k`` most similar training vectors.

    Similarity is ``cosine_sim``. If ``k`` exceeds the number of training
    vectors, all of them vote.

    Parameters
    ----------
    test_vec : vector to classify.
    train_vecs : iterable of training vectors.
    labels : sequence of labels, indexable by training-vector position.
    k : number of neighbours that vote (must be >= 1).

    Returns
    -------
    The label with the most votes; when several labels have equally many
    votes, the one belonging to the most similar neighbour wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``train_vecs`` is empty.
    """
    # A slice of [-0:] would select every element, so k=0 would silently
    # turn into "use all neighbours".
    if k < 1:
        raise ValueError("k must be at least 1")

    sims = [cosine_sim(test_vec, v) for v in train_vecs]
    if not sims:
        raise ValueError("train_vecs must contain at least one vector")

    # argsort is ascending: take the last k, then reverse so the nearest
    # neighbour comes first. Counter.most_common breaks ties by first
    # insertion, so this order makes a tied vote favour the closest
    # neighbour rather than the farthest of the top k.
    nearest_first = np.argsort(sims)[-k:][::-1]
    top_labels = [labels[i] for i in nearest_first]
    return Counter(top_labels).most_common(1)[0][0]

# Classify every training vector against the full training set (so each
# query vector is also present among its own candidate neighbours).
knn_preds = list(
    map(lambda vec: predict_knn(vec, doc_vectors, labels, k=3), doc_vectors)
)
print("KNN Predictions:", knn_preds)

KNN Predictions: ['animal', 'animal', 'animal', 'vehicle', 'vehicle', 'animal', 'vehicle', 'vehicle']


#### Rocchio Classification

In [6]:

def train_rocchio(vectors, labels):
    """Compute one centroid (mean vector) per class.

    Parameters
    ----------
    vectors : 2-D array-like with one row per document; nested lists are
        accepted and converted with ``np.asarray``.
    labels : sequence of class labels aligned with the rows of ``vectors``.

    Returns
    -------
    dict mapping class -> mean of that class's rows. Classes appear in
    order of first occurrence in ``labels``, which keeps the result (and
    any tie-breaking that follows dictionary order) reproducible across
    kernel restarts; iterating a ``set`` of strings is not.
    """
    vectors = np.asarray(vectors)  # fancy indexing below needs an ndarray

    # Group row indices by class in a single pass over the labels.
    rows_by_class = {}
    for row, lab in enumerate(labels):
        rows_by_class.setdefault(lab, []).append(row)

    centroids = {}
    for c, idxs in rows_by_class.items():
        centroids[c] = np.mean(vectors[idxs], axis=0)
    return centroids

def predict_rocchio(vec, centroids):
    """Return the class whose centroid is most cosine-similar to ``vec``.

    ``centroids`` maps class -> centroid vector. On an exact tie the class
    that comes first in ``centroids`` is returned.
    """
    sims = {}
    for cls, centre in centroids.items():
        sims[cls] = cosine_sim(vec, centre)
    best_class, _ = max(sims.items(), key=lambda item: item[1])
    return best_class

# Build the class centroids from the training matrix, then assign each
# training vector to its most similar centroid.
centroids = train_rocchio(doc_vectors, labels)
roc_preds = list(map(lambda vec: predict_rocchio(vec, centroids), doc_vectors))
print("Rocchio Predictions:", roc_preds)


Rocchio Predictions: ['animal', 'animal', 'animal', 'animal', 'vehicle', 'vehicle', 'vehicle', 'vehicle']


#### Decision Tree Classification

In [7]:

def entropy(labels):
    """Shannon entropy (base 2) of the label distribution in ``labels``.

    Returns 0 for an empty or single-class list and 1.0 for a perfectly
    balanced two-class list.
    """
    n = len(labels)
    proportions = [count / n for count in Counter(labels).values()]
    return -sum(p * math.log2(p) for p in proportions)

def info_gain(labels, feature_presence):
    """Information gain from splitting ``labels`` on a boolean feature.

    ``feature_presence[i]`` is truthy when sample ``i`` has the feature.
    The result is the entropy of ``labels`` minus the size-weighted
    entropies of the "present" and "absent" subsets; an empty subset
    contributes nothing.
    """
    def labels_where(wanted):
        # Labels of the samples whose feature flag has the requested truth value.
        return [
            lab
            for lab, pres in zip(labels, feature_presence)
            if bool(pres) is wanted
        ]

    with_feature = labels_where(True)
    without_feature = labels_where(False)

    gain = entropy(labels)
    for subset in (with_feature, without_feature):
        if subset:
            gain -= (len(subset) / len(labels)) * entropy(subset)
    return gain

def train_dt(docs_tokens, labels, vocab, depth=1, max_depth=3):
    """Recursively grow a decision tree over word-presence features.

    At each node the word with the highest information gain is chosen and
    the documents are split into those that contain it ("yes") and those
    that do not ("no").

    Parameters
    ----------
    docs_tokens : sequence of token lists, one per document at this node.
    labels : sequence of class labels aligned with ``docs_tokens``.
    vocab : iterable of candidate split words, tried in the given order;
        the first word reaching the highest gain wins.
    depth : depth of the current node (the root is 1).
    max_depth : no split is made once ``depth`` exceeds this value.

    Returns
    -------
    Either a class label (leaf) or a nested dict of the form
    ``{word: {'yes': subtree, 'no': subtree}}``.

    Raises
    ------
    ValueError
        If ``labels`` is empty.
    """
    if len(labels) == 0:
        raise ValueError("train_dt needs at least one labelled document")

    majority = Counter(labels).most_common(1)[0][0]
    if len(set(labels)) == 1 or depth > max_depth:
        return majority

    # Sets make the repeated "word in document" tests constant time.
    doc_sets = [set(doc) for doc in docs_tokens]
    n_docs = len(doc_sets)

    best_word, best_gain, best_presence = None, -math.inf, None
    for w in vocab:
        presence = [w in words for words in doc_sets]
        n_present = sum(presence)
        # A word found in every document or in none does not partition the
        # node. Splitting on it would create an empty branch, and recursing
        # into an empty branch has no majority label to return.
        if n_present == 0 or n_present == n_docs:
            continue
        g = info_gain(labels, presence)
        if g > best_gain:
            best_gain, best_word, best_presence = g, w, presence

    # Compare with None explicitly: a falsy token (e.g. "") is still a valid
    # split word.
    if best_word is None:
        return majority

    yes_idx = [i for i, present in enumerate(best_presence) if present]
    no_idx = [i for i, present in enumerate(best_presence) if not present]

    tree = {best_word: {}}
    tree[best_word]['yes'] = train_dt([docs_tokens[i] for i in yes_idx],
                                      [labels[i] for i in yes_idx], vocab, depth+1, max_depth)
    tree[best_word]['no'] = train_dt([docs_tokens[i] for i in no_idx],
                                     [labels[i] for i in no_idx], vocab, depth+1, max_depth)
    return tree

def predict_dt(tree, doc):
    """Walk a tree produced by ``train_dt`` and return the predicted label.

    Each internal node is a one-key dict ``{word: {'yes': ..., 'no': ...}}``;
    the 'yes' branch is followed when ``word`` occurs in ``doc``. Anything
    that is not a dict is treated as a leaf and returned as the label.
    """
    node = tree
    while isinstance(node, dict):
        split_word = list(node)[0]
        branch = 'yes' if split_word in doc else 'no'
        node = node[split_word][branch]
    return node

# Grow the tree on the whole corpus (default depth limit), then classify the
# same training documents with it.
dtree = train_dt(docs_tokens, labels, vocab)
dt_preds = list(map(lambda toks: predict_dt(dtree, toks), docs_tokens))
print("Decision Tree Predictions:", dt_preds)


Decision Tree Predictions: ['animal', 'animal', 'animal', 'animal', 'vehicle', 'vehicle', 'vehicle', 'vehicle']
