# Document Classifier Training Notebook (Updated)

This notebook trains a robust document classifier with:
- Better handling of small datasets
- Improved text preprocessing
- Enhanced evaluation metrics
- Model persistence

In [13]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\use\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [20]:
# Document Classifier Training Script (Fixed)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
import os
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources with proper error handling
def download_nltk_data():
    """Download required NLTK data with error handling"""
    required_packages = ['punkt', 'stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger']
    
    for package in required_packages:
        try:
            nltk.data.find(f'tokenizers/{package}')
        except LookupError:
            print(f"Downloading {package}...")
            nltk.download(package, quiet=True)
        except:
            try:
                nltk.data.find(f'corpora/{package}')
            except LookupError:
                print(f"Downloading {package}...")
                nltk.download(package, quiet=True)
            except:
                try:
                    nltk.data.find(f'taggers/{package}')
                except LookupError:
                    print(f"Downloading {package}...")
                    nltk.download(package, quiet=True)
                except:
                    pass

# Initialize NLTK
download_nltk_data()

print("Setting up document classifier...")

# Enhanced training data with realistic examples
data = [
    # Invoices
    ("invoice", "Invoice No: INV-12345\nDate: 2023-01-15\nTo: ABC Company\nDescription: Web Development Services\nTotal: $1,200.00"),
    ("invoice", "TAX INVOICE\nNumber: 98765\nIssued: 15/01/2023\nClient: XYZ Corp\nAmount Due: $850.00\nPayment Due: 30 days"),
    ("invoice", "INVOICE\nFrom: Smith Services\nTo: Johnson Ltd\nDate: January 15, 2023\nSubtotal: $750.00\nTax: $75.00\nTotal: $825.00"),
    ("invoice", "SERVICE INVOICE\nInvoice #: 45678\nDate: 2023-02-01\nCustomer: Global Tech\nDescription: Monthly Maintenance\nTotal: $500.00"),
    ("invoice", "PRO FORMA INVOICE\nRef: PF-2023-03\nDate: 01-Mar-2023\nBill To: Continental Ltd\nAmount: $1,450.00"),
    ("invoice", "COMMERCIAL INVOICE\nInvoice Number: CI-78901\nDate: 15-Mar-2023\nShip To: Oceanic Trading\nTotal Value: $2,300.00"),
    
    # Resumes
    ("resume", "JOHN DOE\n123 Main St, City\nPhone: (123) 456-7890\nEmail: john.doe@email.com\n\nEXPERIENCE\nSenior Developer, ABC Corp (2020-Present)"),
    ("resume", "JANE SMITH - CURRICULUM VITAE\nEducation: Master of Computer Science, University X\nSkills: Python, Java, SQL, Machine Learning"),
    ("resume", "RESUME\nMICHAEL BROWN\nObjective: Seeking software engineering position\nProjects: Built document classification system using Python"),
    ("resume", "ALICE JOHNSON\nPROFESSIONAL SUMMARY\nData Scientist with 5+ years experience\nTechnical Skills: Python, R, TensorFlow, PyTorch"),
    ("resume", "DAVID WILSON - CV\nWORK HISTORY\nSenior Analyst, Data Insights Co. (2018-2023)\nEDUCATION\nPhD in Computer Science"),
    ("resume", "SARAH MILLER\nSKILLS\nProject Management\nData Analysis\nTeam Leadership\nCERTIFICATIONS\nPMP, AWS Certified"),
    
    # ID Documents
    ("id", "UNITED STATES PASSPORT\nPassport No: 123456789\nName: JOHN DOE\nDate of Birth: 01/01/1980\nExpiry Date: 01/01/2030"),
    ("id", "DRIVER LICENSE\nState: California\nDL Number: B1234567\nName: JANE SMITH\nDOB: 05/15/1985"),
    ("id", "NATIONAL ID CARD\nID Number: 987654321\nName: ROBERT JOHNSON\nIssued: 2020-01-01\nValid Until: 2030-01-01"),
    ("id", "UNITED KINGDOM PASSPORT\nPassport No: GB12345678\nSurname: WILLIAMS\nGiven Names: EMILY\nDate of Birth: 12/08/1990"),
    ("id", "EMPLOYEE ID CARD\nCompany: Tech Solutions Inc.\nID: TS-789456\nName: MICHAEL CHEN\nDepartment: Engineering"),
    ("id", "STUDENT ID\nUniversity: State University\nID: SU-2023-456\nName: JESSICA TAYLOR\nValid Thru: 06/2025"),
    
    # Contracts
    ("contract", "SERVICE AGREEMENT\nThis Agreement is made on January 15, 2023 between ABC Corp (Client) and XYZ Services (Provider)."),
    ("contract", "EMPLOYMENT CONTRACT\nBetween: Company Inc.\nAnd: Employee Name\nPosition: Software Developer\nStart Date: 01/02/2023"),
    ("contract", "CONFIDENTIALITY AGREEMENT\nThis Nondisclosure Agreement (the 'Agreement') is entered into by and between\nDisclosing Party: Innovate Tech\nReceiving Party: Consultant"),
    ("contract", "LEASE AGREEMENT\nThis Lease Agreement ('Lease') is made and entered into this 1st day of March, 2023\nBetween: Landlord\nAnd: Tenant"),
    
    # Reports
    ("report", "ANNUAL FINANCIAL REPORT 2022\nPrepared for: Shareholders\nPeriod: January 1 - December 31, 2022\nRevenue: $10M"),
    ("report", "MARKET ANALYSIS REPORT\nIndustry: Technology\nAuthor: Analytics Team\nDate: January 2023\nKey Findings: Market growing at 5% annually"),
    ("report", "PROJECT STATUS REPORT\nProject: Document Classification System\nDate: March 15, 2023\nStatus: Development Phase 2\nNext Steps: Model Optimization"),
    ("report", "MEDICAL LAB REPORT\nPatient: John Smith\nDate: 02/28/2023\nTest: Complete Blood Count\nResults: Within normal limits")
]

# Convert to DataFrame
df = pd.DataFrame(data, columns=['label', 'text'])
print("\nClass distribution:")
print(df['label'].value_counts())

def preprocess_text(text):
    """Enhanced text preprocessing with fallback for POS tagging"""
    try:
        # Remove HTML tags
        text = BeautifulSoup(text, "html.parser").get_text()
        
        # Replace currency and numbers with tokens
        text = re.sub(r'\$\d+\.?\d*', '[CURRENCY]', text)
        text = re.sub(r'\d+', '[NUMBER]', text)
        
        # Clean special characters but keep important punctuation
        text = re.sub(r'[^\w\s.,;:!?\'"-]', ' ', text)
        text = text.lower()
        
        # Tokenize
        words = re.findall(r'\b[\w-]+\b', text)
        
        # Remove stopwords (keep some negations)
        try:
            stop_words = set(stopwords.words('english')) - {'not', 'no', 'nor'}
        except:
            # Fallback if stopwords not available
            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        
        words = [word for word in words if word not in stop_words and len(word) > 2]
        
        # Lemmatization with POS tagging fallback
        lemmatizer = WordNetLemmatizer()
        try:
            # Try POS tagging
            pos_tags = nltk.pos_tag(words)
            words = [
                lemmatizer.lemmatize(word, 'v' if tag.startswith('V') else 'n')
                for word, tag in pos_tags
            ]
        except:
            # Fallback: simple lemmatization without POS
            words = [lemmatizer.lemmatize(word) for word in words]
        
        return ' '.join(words)
    
    except Exception as e:
        print(f"Error in preprocessing: {e}")
        # Minimal preprocessing fallback
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        words = text.split()
        return ' '.join([word for word in words if len(word) > 2])

# Apply preprocessing
print("\nPreprocessing text...")
df['cleaned_text'] = df['text'].apply(preprocess_text)

print("\nSample before and after cleaning:")
print("Original:", df['text'][0][:100], "...")
print("Cleaned:", df['cleaned_text'][0][:100], "...")

# Smart Text Vectorization
print("\nCreating feature vectors...")
vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=1,  # Reduced for small dataset
    max_df=0.9,
    stop_words='english',
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\b[\w-]+\b'
)

X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

print(f"\nFeature matrix shape: {X.shape}")
print("Top 10 features:", vectorizer.get_feature_names_out()[:10])

# Model Training with Cross-Validation
print("\nTraining classifier...")
classifier = LinearSVC(
    C=0.8,
    class_weight='balanced',
    max_iter=20000,
    random_state=42,
    dual=False
)

# Cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(classifier, X, y, cv=skf, scoring='accuracy')

print(f"\nCross-validation results:")
for i, score in enumerate(cv_scores):
    print(f"Fold {i+1} accuracy: {score:.3f}")
print(f"Mean CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Final model training
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("\n" + "="*50)
print("FINAL MODEL EVALUATION")
print("="*50)

print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.3f}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Feature importance analysis
print("\n" + "="*50)
print("TOP FEATURES BY CLASS")
print("="*50)

feature_names = vectorizer.get_feature_names_out()
classes = classifier.classes_

for i, class_name in enumerate(classes):
    print(f"\nTop features for '{class_name}':")
    # Get coefficients for this class
    if len(classes) == 2:
        coef = classifier.coef_[0] if i == 1 else -classifier.coef_[0]
    else:
        coef = classifier.coef_[i]
    
    # Get top features
    top_indices = coef.argsort()[-10:][::-1]
    top_features = [(feature_names[idx], coef[idx]) for idx in top_indices]
    
    for feature, score in top_features:
        print(f"  {feature}: {score:.3f}")

# Save the model and vectorizer
print("\n" + "="*50)
print("SAVING MODEL")
print("="*50)

# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save vectorizer and classifier
joblib.dump(vectorizer, 'models/vectorizer.pkl')
joblib.dump(classifier, 'models/classifier.pkl')

print("Model and vectorizer saved to 'models/' directory")

# Function to classify new documents
def classify_document(text, vectorizer_path='models/vectorizer.pkl', classifier_path='models/classifier.pkl'):
    """
    Classify a new document
    """
    try:
        # Load models
        vectorizer = joblib.load(vectorizer_path)
        classifier = joblib.load(classifier_path)
        
        # Preprocess text
        cleaned_text = preprocess_text(text)
        
        # Vectorize
        text_vector = vectorizer.transform([cleaned_text])
        
        # Predict
        prediction = classifier.predict(text_vector)[0]
        probabilities = classifier.decision_function(text_vector)[0]
        
        # Get confidence scores
        confidence_scores = {}
        for i, class_name in enumerate(classifier.classes_):
            if len(classifier.classes_) == 2:
                # Binary classification
                conf = probabilities if i == 1 else -probabilities
            else:
                # Multi-class classification
                conf = probabilities[i]
            confidence_scores[class_name] = conf
        
        return {
            'prediction': prediction,
            'confidence_scores': confidence_scores,
            'preprocessed_text': cleaned_text
        }
    
    except Exception as e:
        return {'error': str(e)}

# Test the classification function
print("\n" + "="*50)
print("TESTING CLASSIFICATION FUNCTION")
print("="*50)

test_documents = [
    "Invoice #12345 from ABC Corp for $500.00 due on March 15, 2023",
    "John Smith, Software Engineer with 5 years experience in Python and Java",
    "Driver License ID: DL123456, John Doe, DOB: 01/01/1990",
    "This agreement is between Company A and Company B for software development services",
    "Quarterly sales report showing 15% growth in Q1 2023"
]

for i, doc in enumerate(test_documents):
    result = classify_document(doc)
    if 'error' not in result:
        print(f"\nTest Document {i+1}:")
        print(f"Text: {doc[:60]}...")
        print(f"Prediction: {result['prediction']}")
        print("Confidence scores:")
        for class_name, score in result['confidence_scores'].items():
            print(f"  {class_name}: {score:.3f}")
    else:
        print(f"Error classifying document {i+1}: {result['error']}")

print("\n" + "="*50)
print("TRAINING COMPLETE!")
print("="*50)
print("Files saved:")
print("- models/vectorizer.pkl")
print("- models/classifier.pkl")
print("\nUse classify_document() function to classify new documents.")

Downloading stopwords...
Downloading wordnet...
Downloading omw-1.4...
Downloading averaged_perceptron_tagger...
Setting up document classifier...

Class distribution:
label
invoice     6
resume      6
id          6
contract    4
report      4
Name: count, dtype: int64

Preprocessing text...

Sample before and after cleaning:
Original: Invoice No: INV-12345
Date: 2023-01-15
To: ABC Company
Description: Web Development Services
Total:  ...
Cleaned: invoice inv number date number number number abc company description web development service total c ...

Creating feature vectors...

Feature matrix shape: (26, 736)
Top 10 features: ['abc' 'abc company' 'abc company description' 'abc corp'
 'abc corp client' 'abc corp number' 'agreement' 'agreement agreement'
 'agreement agreement entered' 'agreement agreement january']

Training classifier...

Cross-validation results:
Fold 1 accuracy: 0.833
Fold 2 accuracy: 0.600
Fold 3 accuracy: 0.600
Fold 4 accuracy: 0.800
Fold 5 accuracy: 0.600
Mean CV