## Tutorial: Text Classification Using spaCy


In [25]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import json


## Sample annotation view

```json
{
  "annotations": {
    "acad_1992": {
      "0": {
        "classification": "figurative",
        "context": "that works well for some young people ( and their teachers ) will frustrate others . There is no one best system of education . <p> Is n't this inconsistent with \" effective school \" research ? Not at all . Effective",
        "full_text": "Section : Special Section : School Choice <p> As interest in school choice increases , eight major m...",
        "match": "best system",
        "notes": "",
        "text_id": 514,
        "timestamp": "2025-11-15T00:25:32.056184"
      }
    },
    "acad_1993": {
      "1": {
        "classification": "literal",
        "context": "to basic structural damage . Those who are contented in the healthcare arena typically say , \" We have the best system in the world ; who would want to change it ? \" What , then , are their underlying assumptions",
        "full_text": "Section : In Context HEALTHCARE &; PUBLIC POLICY Upsetting the Contended <p> \" They live . . . with ...",
        "match": "best system",
        "notes": "",
        "text_id": 808,
        "timestamp": "2025-11-15T00:30:23.613668"
      }
    },
```

In [45]:
# JSON annotation export that the loading cell reads.
# The commented-out alternatives are other export files (presumably one per
# search phrase, judging by the filenames); uncomment one to switch datasets.
path = "./kwic_coca_best_system_annotations_export.json"
#path = "./kwic_coca_industry_standard_annotations_export.json"
#path = "./kwic_coca_national_system_annotations_export.json"


In [46]:
# Read the annotation export into memory
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Export layout: {"annotations": {genre_id: {"<index>": item, ...}, ...}}
annotations = data.get('annotations', {})

# Fields copied straight from each annotation item (absent keys become None)
item_fields = ('text_id', 'match', 'context', 'full_text',
               'classification', 'notes', 'timestamp')

# One flat record per annotation item; the string index is cast to int so
# that it sorts numerically rather than lexicographically
rows = [
    {
        'genre_id': genre_id,
        'genre_idx': int(idx_str),
        **{field: item.get(field) for field in item_fields},
    }
    for genre_id, items_dict in annotations.items()
    for idx_str, item in items_dict.items()
]

# Build the frame and order it by genre_id, then genre_idx, for readability
df_flat = (
    pd.DataFrame(rows)
    .sort_values(['genre_id', 'genre_idx'])
    .reset_index(drop=True)
)

print(f"Flattened dataframe shape: {df_flat.shape}")
print(f"Columns: {list(df_flat.columns)}")
df_flat.head(5)

Flattened dataframe shape: (147, 9)
Columns: ['genre_id', 'genre_idx', 'text_id', 'match', 'context', 'full_text', 'classification', 'notes', 'timestamp']


Unnamed: 0,genre_id,genre_idx,text_id,match,context,full_text,classification,notes,timestamp
0,acad_1992,0,514,best system,that works well for some young people ( and th...,Section : Special Section : School Choice <p> ...,figurative,,2025-11-15T00:25:32.056184
1,acad_1993,1,808,best system,to basic structural damage . Those who are con...,Section : In Context HEALTHCARE &; PUBLIC POLI...,literal,,2025-11-15T00:30:23.613668
2,acad_1995,0,691,best system,broadening the graduate experience and reducin...,Section : PHYSICS COMMUNITY <p> As reports of ...,literal,,2025-11-15T00:40:24.443102
3,acad_1999,0,821,best system,a warrior class to defend it against its enemi...,"In this article , I will examine the continuit...",figurative,,2025-11-15T00:25:39.350971
4,acad_2003,0,333,best system,"and benefits , are difficult to maintain and n...",CLARIFYING COMMISSION 'S STANCE <p> I AM WRITI...,literal,,2025-11-15T00:40:16.640799


In [47]:
# How many annotated contexts carry each label
df_flat['classification'].value_counts()

classification
figurative    69
literal       53
unclear       16
neither        9
Name: count, dtype: int64

### Clustering Analysis

We'll cluster the contexts to see if natural groupings emerge that align with the classification labels (literal vs figurative).

In [48]:
import string
import re
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# Every ASCII punctuation character, kept in a set for fast membership tests
punctuations = {*string.punctuation}

# Full pretrained English pipeline (not a bare tokenizer) so lemmas are available
nlp = spacy.load('en_core_web_sm')

# spaCy's built-in English stop-word set (same object as the import above)
stop_words = STOP_WORDS

# HTML tag stripper
def strip_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own; nothing is
    inserted in its place, so surrounding whitespace is left as it was.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize, lemmatize, lowercase, and remove punctuation. Keep stop words.

    Parameters
    ----------
    sentence : str
        Raw text; ``<...>`` tags are stripped with ``strip_html_tags`` first.

    Returns
    -------
    list of str
        Lower-cased lemmas in document order. Whitespace-only tokens and
        tokens made up entirely of punctuation characters are dropped.
    """
    # Strip HTML tags first, then run the full pipeline so lemmas are available
    doc = nlp(strip_html_tags(sentence))

    tokens = []
    for token in doc:
        # spaCy v2 lemmatizes every pronoun to the placeholder "-PRON-";
        # fall back to the lower-cased surface form in that case.
        text = token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower()

        stripped = text.strip()
        if not stripped:
            # Whitespace-only token (e.g. the gap left by a removed tag)
            continue

        # Test every character, not the whole token: a plain `text in punctuations`
        # check only catches single-character tokens and lets multi-character
        # punctuation such as "...", "--" or "&;" through as features.
        if all(ch in punctuations for ch in stripped):
            continue

        tokens.append(text)

    return tokens

In [49]:
# Vectorize the context text using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): TfidfVectorizer's default lowercase=True lower-cases the text
# *before* it reaches spacy_tokenizer, so spaCy sees all-lowercase input.
# The tokenizer lower-cases its output anyway; consider lowercase=False if
# case-sensitive lemmatization matters (this would change the vocabulary).
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    token_pattern=None,  # Unused with a custom tokenizer; None silences sklearn's warning
    max_features=500,    # Limit features for cleaner clustering
    min_df=1,            # Include terms that appear in at least 1 document
    max_df=0.80          # Ignore terms that appear in more than 80% of documents
)

# Fit and transform the context column
X_tfidf = tfidf_vectorizer.fit_transform(df_flat['context'])

print(f"TF-IDF matrix shape: {X_tfidf.shape}")
print(f"Number of features (terms): {len(tfidf_vectorizer.get_feature_names_out())}")



TF-IDF matrix shape: (147, 500)
Number of features (terms): 500


In [50]:
# Perform K-Means clustering
from sklearn.cluster import KMeans

# Two clusters, mirroring the literal-vs-figurative split we hope to recover
n_clusters = 2

# Fixed seed and 10 initialisations keep the assignment reproducible
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(X_tfidf)

# labels_ holds the cluster index assigned to each fitted row
df_flat['cluster'] = kmeans.labels_

print(f"Clustering complete with {n_clusters} clusters")
print("\nCluster distribution:")
cluster_sizes = df_flat['cluster'].value_counts().sort_index()
print(cluster_sizes)

Clustering complete with 2 clusters

Cluster distribution:
cluster
0    77
1    70
Name: count, dtype: int64


In [51]:
# Sanity-check the tokenizer on the first context in the frame
sample_context = df_flat['context'].iloc[0]
print("Sample context:", sample_context[:150], "\nTokenized:", sep="\n")

tokens = spacy_tokenizer(sample_context)
print(f"Token count: {len(tokens)}")
print(tokens[:25])

Sample context:
that works well for some young people ( and their teachers ) will frustrate others . There is no one best system of education . <p> Is n't this incons

Tokenized:
Token count: 33
['that', 'work', 'well', 'for', 'some', 'young', 'people', 'and', 'their', 'teacher', 'will', 'frustrate', 'other', 'there', 'be', 'no', 'one', 'good', 'system', 'of', 'education', 'be', 'not', 'this', 'inconsistent']


In [52]:
# Print up to five contexts per cluster alongside their human labels
rule = "=" * 80
for cluster_id in range(n_clusters):
    print(f"\n{rule}")
    print(f"CLUSTER {cluster_id} - Sample contexts")
    print(rule)

    in_cluster = df_flat['cluster'] == cluster_id
    cluster_samples = df_flat.loc[in_cluster].head(5)

    # Walk the three displayed columns in lockstep, one row at a time
    for label, context, match in zip(
        cluster_samples['classification'],
        cluster_samples['context'],
        cluster_samples['match'],
    ):
        print(f"\nClassification: {label}")
        print(f"Context: {context[:200]}...")
        print(f"Match: {match}")
        print("-" * 80)


CLUSTER 0 - Sample contexts

Classification: figurative
Context: that works well for some young people ( and their teachers ) will frustrate others . There is no one best system of education . <p> Is n't this inconsistent with " effective school " research ? Not at...
Match: best system
--------------------------------------------------------------------------------

Classification: literal
Context: broadening the graduate experience and reducing the time to degree , Lane said , but the US still has the best system for producing scientists and " too rapid a change would be a mistake . " <p> One c...
Match: best system
--------------------------------------------------------------------------------

Classification: figurative
Context: a warrior class to defend it against its enemies . A Utopia , with what Plato thinks to be the best system of citizen training , will still be confronted by those who would seek to destroy it . Hence ...
Match: best system
--------------------------------

### Examine Sample Contexts from Each Cluster

In [53]:
# Get top terms for each cluster to understand what defines them
def get_top_terms_per_cluster(n_terms=10):
    """Print the highest-weighted vocabulary terms of every K-Means centroid.

    Reads the fitted ``kmeans`` model and ``tfidf_vectorizer`` from the
    notebook namespace.

    Parameters
    ----------
    n_terms : int, default 10
        Number of terms to list per cluster.
    """
    # For each centroid, column indices sorted by descending centroid weight
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf_vectorizer.get_feature_names_out()

    # Iterate over the centroids themselves rather than the notebook-level
    # `n_clusters`: if that variable is edited after fitting, a range() over it
    # would raise IndexError or silently skip clusters.
    for cluster_id, ranked_columns in enumerate(order_centroids):
        print(f"\nCluster {cluster_id} top terms:")
        print(", ".join(terms[col] for col in ranked_columns[:n_terms]))

get_top_terms_per_cluster(15)


Cluster 0 top terms:
of, a, and, to, in, for, one, have, that, 's, with, on, world, it, education

Cluster 1 top terms:
it, i, to, we, not, that, have, but, do, in, he, and, you, for, they


In [54]:
# Cross-tabulate cluster assignments against the human labels, with totals
comparison = pd.crosstab(df_flat['cluster'], df_flat['classification'], margins=True)
print("Cluster vs Classification Crosstab:")
print(comparison)

# Metrics used to quantify how well the clusters line up with the labels
from sklearn.metrics import adjusted_rand_score, silhouette_score

# Give each label an integer code, numbered in order of first appearance
labels_in_order = df_flat['classification'].unique()
classification_map = dict(zip(labels_in_order, range(len(labels_in_order))))
y_true = df_flat['classification'].map(classification_map)

# ARI scores agreement with the labels; silhouette scores cluster separation
ari = adjusted_rand_score(y_true, df_flat['cluster'])
silhouette = silhouette_score(X_tfidf, df_flat['cluster'])

print(f"\nAdjusted Rand Index: {ari:.3f} (1.0 = perfect match, 0 = random)")
print(f"Silhouette Score: {silhouette:.3f} (closer to 1 = better defined clusters)")

Cluster vs Classification Crosstab:
classification  figurative  literal  neither  unclear  All
cluster                                                   
0                       38       22        8        9   77
1                       31       31        1        7   70
All                     69       53        9       16  147

Adjusted Rand Index: 0.007 (1.0 = perfect match, 0 = random)
Silhouette Score: 0.013 (closer to 1 = better defined clusters)
