# TF-IDF From Scratch - Module Notebook

This notebook contains the complete implementation of TF-IDF from scratch and related utilities as used in the LearnMateAI project. You can run cells interactively to see and test the logic.

---



In [None]:
# Imports used in TF-IDF from scratch
import re
import math
import os
import glob
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Set
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    PLOTTING_AVAILABLE = True
except ImportError:
    PLOTTING_AVAILABLE = False



## TFIDFVectorizer Implementation
This class builds vocabulary, computes document frequency, IDF, and creates TF-IDF vectors from scratch. No scikit-learn TF-IDF vectorizer is used here.


In [None]:
class TFIDFVectorizer:
    """
    TF-IDF Vectorizer implemented from scratch.

    Documents are tokenized into word tokens, a vocabulary is selected by
    document frequency, and each document is mapped to a dense vector of
    ``tf * idf`` weights where

    * ``tf``  = occurrences of the term / total tokens in the document
      (after stop-word removal, including out-of-vocabulary tokens), and
    * ``idf`` = ``ln((1 + N) / (1 + df)) + 1`` with ``N`` the number of
      fitted documents and ``df`` the term's document frequency.

    The smoothed IDF is always >= 1, so a term that occurs in every document
    never receives a zero or negative weight.

    Args:
        max_features: If truthy, keep only this many terms, preferring the
            highest document frequency.
        min_df: Minimum document frequency. An ``int`` is an absolute count;
            any other number is a fraction of the corpus size (truncated).
        max_df: Maximum document frequency, interpreted like ``min_df``.
        stop_words: Tokens to drop during preprocessing.
        lowercase: Lower-case text before tokenizing.

    Attributes:
        vocabulary_: Mapping term -> column index.
        idf_: Mapping term -> inverse document frequency.
        feature_names_: Terms ordered by column index.
        n_documents_: Number of documents seen by ``fit``.
    """

    # Compiled once at class creation instead of on every tokenization call.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b')

    def __init__(self, max_features: int = None, min_df: int = 1, max_df: float = 1.0,
                 stop_words: Set[str] = None, lowercase: bool = True):
        self.max_features = max_features
        self.min_df = min_df
        self.max_df = max_df
        self.stop_words = stop_words or set()
        self.lowercase = lowercase
        self.vocabulary_ = {}
        self.idf_ = {}
        self.feature_names_ = []
        self.n_documents_ = 0

    def _preprocess_text(self, text: str) -> List[str]:
        """Return the word tokens of ``text`` with stop words removed."""
        if self.lowercase:
            text = text.lower()
        return [
            token
            for token in self._TOKEN_PATTERN.findall(text)
            if token not in self.stop_words
        ]

    def _document_frequencies(self, documents: List[str]) -> Dict[str, int]:
        """Count, for every token, the number of documents containing it."""
        doc_freq = Counter()
        for doc in documents:
            # set(): a term counts once per document however often it repeats.
            doc_freq.update(set(self._preprocess_text(doc)))
        return doc_freq

    def _build_vocabulary(self, documents: List[str]) -> Dict[str, int]:
        """Select the vocabulary and return the term -> column-index mapping.

        Also sets ``n_documents_`` and ``feature_names_``.
        """
        doc_freq = self._document_frequencies(documents)
        self.n_documents_ = len(documents)
        min_doc_freq = self.min_df if isinstance(self.min_df, int) else int(self.min_df * self.n_documents_)
        max_doc_freq = self.max_df if isinstance(self.max_df, int) else int(self.max_df * self.n_documents_)
        kept = [
            (term, df)
            for term, df in doc_freq.items()
            if min_doc_freq <= df <= max_doc_freq
        ]
        # Most frequent first; ties are broken alphabetically so the column
        # order does not depend on set iteration order (which varies between
        # interpreter runs because of string hash randomization).
        kept.sort(key=lambda item: (-item[1], item[0]))
        if self.max_features:
            kept = kept[:self.max_features]
        self.feature_names_ = [term for term, _ in kept]
        return {term: idx for idx, term in enumerate(self.feature_names_)}

    def _calculate_idf(self, documents: List[str]) -> Dict[str, float]:
        """Return the smoothed IDF of every vocabulary term.

        Uses ``n_documents_`` (set by ``_build_vocabulary``) as the corpus size.
        """
        doc_freq = self._document_frequencies(documents)
        n_docs = self.n_documents_
        return {
            term: math.log((1 + n_docs) / (1 + doc_freq.get(term, 0))) + 1.0
            for term in self.vocabulary_
        }

    def fit(self, documents: List[str]):
        """Learn the vocabulary and IDF weights from ``documents``; return self."""
        self.vocabulary_ = self._build_vocabulary(documents)
        self.idf_ = self._calculate_idf(documents)
        return self

    def _calculate_tf(self, tokens: List[str]) -> Dict[str, float]:
        """Return the relative frequency of each in-vocabulary term in ``tokens``."""
        total_terms = len(tokens)
        if total_terms == 0:
            return {}
        return {
            term: count / total_terms
            for term, count in Counter(tokens).items()
            if term in self.vocabulary_
        }

    def transform(self, documents: List[str]) -> np.ndarray:
        """Return the ``(len(documents), len(vocabulary_))`` TF-IDF matrix.

        Terms outside the fitted vocabulary are ignored. The result is always
        two-dimensional, including when ``documents`` is empty.
        """
        matrix = np.zeros((len(documents), len(self.vocabulary_)))
        for row, doc in enumerate(documents):
            tf = self._calculate_tf(self._preprocess_text(doc))
            for term, tf_val in tf.items():
                matrix[row, self.vocabulary_[term]] = tf_val * self.idf_[term]
        return matrix

    def fit_transform(self, documents: List[str]) -> np.ndarray:
        """Fit on ``documents`` and return their TF-IDF matrix."""
        self.fit(documents)
        return self.transform(documents)



## Example: Using TFIDFVectorizer
Now let's use the above class on a sample set of documents and view the resulting TF-IDF vectors.


In [None]:
# Demo: vectorize three short sentences and inspect the result
sample_docs = [
    "Machine learning is a subset of artificial intelligence.",
    "Python is a popular programming language for data science.",
    "Artificial intelligence and machine learning are related fields.",
]

# fit_transform() fits on the corpus and then transforms that same corpus
vectorizer = TFIDFVectorizer()
tfidf_matrix = vectorizer.fit_transform(sample_docs)

print("Vocabulary:", vectorizer.vocabulary_)
print("TF-IDF Matrix:\n", tfidf_matrix)



---

The sections below add the remaining classes and helper functions from the module, so the whole TF-IDF pipeline can be run and demonstrated step by step. Further utilities (for example from `utils.py`) or additional classes can be added to this notebook in the same way.


## Naive Bayes Text Classifier
This class implements a simple Naive Bayes classifier for text classification using custom TF-IDF vectors as input.


In [None]:
class NaiveBayesClassifier:
    """
    Naive Bayes Classifier for text classification using TF-IDF features.

    Per class, the probability of feature ``j`` is estimated as
    ``(sum_j + alpha) / (sum_of_all_features + alpha * n_features)`` where the
    sums run over the training rows of that class. A sample is scored with
    ``log(prior) + sum_j x_j * log(p_j + 1e-10)`` over its strictly positive
    features only.

    Args:
        alpha: Additive smoothing constant.

    Attributes:
        classes_: Sorted unique labels seen by ``fit`` (``None`` before fitting).
        class_priors_: Mapping class label -> fraction of training samples.
        feature_probs_: Mapping class label -> {feature index: probability}.
        n_features_: Number of columns of the training matrix.
    """
    def __init__(self, alpha: float = 1.0):
        self.alpha = alpha
        self.classes_ = None
        self.class_priors_ = {}
        self.feature_probs_ = {}  # {class: {feature_idx: probability}}
        self.n_features_ = 0

    def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 1):
        """Estimate class priors and per-class feature probabilities.

        Args:
            X: Feature matrix of shape (n_samples, n_features); anything
                ``np.asarray`` can convert is accepted.
            y: Labels, one per row of ``X``.
            epochs: Accepted for backward compatibility only. The estimates
                are closed-form, so repeating the computation cannot change
                them and the value is ignored.

        Returns:
            self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.n_features_ = X.shape[1]
        n_samples = X.shape[0]
        # Start from empty dicts so a refit never keeps entries of classes
        # that are absent from the new training data.
        self.class_priors_ = {}
        self.feature_probs_ = {}
        for class_label in self.classes_:
            class_mask = y == class_label
            self.class_priors_[class_label] = np.sum(class_mask) / n_samples
            feature_sums = np.sum(X[class_mask], axis=0)
            total_sum = np.sum(feature_sums) + self.alpha * self.n_features_
            probs = (feature_sums + self.alpha) / total_sum
            self.feature_probs_[class_label] = dict(enumerate(probs))
        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return class probabilities, shape (n_samples, n_classes).

        Columns follow the order of ``classes_``.

        Raises:
            ValueError: If the classifier is not fitted, if ``X`` is not a 2-D
                matrix with ``n_features_`` columns, or if a positive feature
                of some sample has a non-positive stored probability (its
                logarithm is undefined).
        """
        if self.classes_ is None:
            raise ValueError("NaiveBayesClassifier must be fitted before predicting")
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self.n_features_:
            raise ValueError(
                f"X must have shape (n_samples, {self.n_features_}), got {X.shape}"
            )

        log_priors = np.log(
            np.array([self.class_priors_[label] for label in self.classes_], dtype=float)
        )
        # Rebuilt from the public dict on every call so that edits made to
        # feature_probs_ after fitting are honoured; a missing entry falls
        # back to alpha.
        feature_probs = np.array(
            [
                [self.feature_probs_[label].get(idx, self.alpha)
                 for idx in range(self.n_features_)]
                for label in self.classes_
            ],
            dtype=float,
        ).reshape(len(self.classes_), self.n_features_)

        positive = X > 0
        shifted = feature_probs + 1e-10
        unusable = shifted <= 0
        # Only features a sample actually uses (value > 0) need a logarithm.
        if (positive.astype(int) @ unusable.T.astype(int)).any():
            raise ValueError(
                "math domain error: non-positive feature probability "
                "(was the model trained on negative feature values?)"
            )
        log_feature_probs = np.log(np.where(unusable, 1.0, shifted))

        # Non-positive feature values contribute nothing to the score.
        weights = np.where(positive, X, 0.0)
        log_scores = weights @ log_feature_probs.T + log_priors
        # Subtract the row maximum before exponentiating to avoid underflow.
        log_scores = log_scores - log_scores.max(axis=1, keepdims=True)
        scores = np.exp(log_scores)
        return scores / scores.sum(axis=1, keepdims=True)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the most probable class label for every row of ``X``."""
        probabilities = self.predict_proba(X)
        predictions = self.classes_[np.argmax(probabilities, axis=1)]
        return predictions



## Testing the Naive Bayes Classifier with TF-IDF
Below is an example of how to use the Naive Bayes classifier on a toy dataset with TF-IDF features.


In [None]:
# Toy corpus: five sentences, each tagged with its topic
sample_docs = [
    "Python is a powerful programming language.",
    "Machine learning uses algorithms for data analysis.",
    "Artificial intelligence and data science are closely related.",
    "Programming in Python is fun.",
    "Data analysis is a key part of data science.",
]
sample_labels = ["Programming", "ML", "ML", "Programming", "ML"]

# Turn the sentences into TF-IDF rows and the labels into an array
vectorizer = TFIDFVectorizer()
X_tfidf = vectorizer.fit_transform(sample_docs)
y = np.array(sample_labels)

# Hold out 40% of the samples, keeping the class proportions (stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# fit() returns the classifier itself, so construction and training chain
classifier = NaiveBayesClassifier().fit(X_train, y_train)

# Predict on both splits, then report predictions followed by accuracies
train_pred, test_pred = (
    classifier.predict(features) for features in (X_train, X_test)
)

print("TRAINING PREDICTION:", train_pred)
print("TEST PREDICTION:", test_pred)
print("TRAIN ACCURACY:", accuracy_score(y_train, train_pred))
print("TEST ACCURACY:", accuracy_score(y_test, test_pred))


---




## Helper Functions: Data Analysis and Confusion Matrix
These utilities help analyze datasets and visualize results, useful for both demo and evaluation.


In [None]:
def analyze_data(documents, labels):
    """Print the class distribution and document-length statistics.

    Documents are split on whitespace. Printed, in order: the label counts,
    the mean / min / max / standard deviation of words per document, and the
    number of distinct whitespace-separated tokens across all documents.

    Args:
        documents: Sized collection of document strings.
        labels: Iterable of class labels.

    Returns:
        None. If ``documents`` is empty, only the class distribution and a
        short notice are printed, because the length statistics are undefined
        for an empty corpus.
    """
    from collections import Counter
    print("Class distribution:")
    print(Counter(labels))
    if len(documents) == 0:
        print("No documents to analyze.")
        return
    # Split each document once and reuse the tokens for both the length
    # statistics and the vocabulary.
    tokenized = [d.split() for d in documents]
    doc_lens = [len(tokens) for tokens in tokenized]
    print("Avg words per doc:", np.mean(doc_lens))
    print("Min:", np.min(doc_lens), "Max:", np.max(doc_lens))
    print("Stddev:", np.std(doc_lens))
    print("Vocabulary size:", len(set().union(*tokenized)))


In [None]:
def plot_confusion_matrix(y_true, y_pred, classes):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heat map.

    Rows are true labels and columns are predicted labels, both in the order
    given by ``classes``. Every cell is annotated with its count, in white
    text on cells above half the maximum count and black text otherwise.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        classes: Labels to include; passed to ``confusion_matrix`` as
            ``labels`` and used for the tick labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    plt.figure(figsize=(5, 4))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # The text-colour threshold is the same for every cell; compute it once.
    threshold = cm.max() / 2 if cm.size else 0
    for (i, j), count in np.ndenumerate(cm):
        plt.text(j, i, count, ha="center", va="center",
                 color="white" if count > threshold else "black")
    # Run the layout pass last so that the axis labels and rotated tick
    # labels are taken into account when the margins are computed.
    plt.tight_layout()
    plt.show()


## Helper Function: Default English Stopwords
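This function returns a set of common English words. Pass it as the `stop_words` argument of `TFIDFVectorizer` if you want those words removed during tokenization; using it is optional.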


In [None]:
def load_default_stopwords():
    """Return a new set holding the default English stop words."""
    words = (
        'a', 'after', 'an', 'and', 'are', 'as', 'at',
        'be', 'been', 'but', 'by',
        'call', 'come',
        'day', 'did', 'down',
        'each',
        'find', 'first', 'for', 'from',
        'get',
        'had', 'has', 'have', 'he', 'her', 'him',
        'if', 'in', 'into', 'is', 'it', 'its',
        'like', 'long',
        'made', 'make', 'many', 'may', 'more',
        'now',
        'of', 'oil', 'on', 'out',
        'part',
        'said', 'sit', 'so', 'some',
        'than', 'that', 'the', 'their', 'them', 'then', 'these', 'they',
        'this', 'to', 'two',
        'up',
        'very',
        'was', 'were', 'what', 'which', 'who', 'will', 'with', 'words',
        'would',
    )
    # A fresh set per call, so callers may mutate the result freely.
    return set(words)
