In [10]:
import spacy

nlp = spacy.load("en_core_web_sm")

def classify_noun(noun):
    """Classify a noun string as "PERSON", "PRODUCT" or "OTHER".

    The text is run through the module-level spaCy pipeline ``nlp`` and
    classified by the first rule that matches:

    1. If the first token is tagged as a pronoun, the result is "PERSON".
    2. Otherwise the named entities are scanned in order; a PERSON entity
       yields "PERSON", and a PRODUCT / ORG / WORK_OF_ART / LAW / EVENT
       entity yields "PRODUCT".
    3. Anything else is "OTHER".

    Args:
        noun: The text to classify (typically a single word or short phrase).

    Returns:
        One of the strings "PERSON", "PRODUCT" or "OTHER". Input that
        produces no tokens (e.g. an empty string) returns "OTHER".
    """
    doc = nlp(noun)

    # Guard: text with no tokens gives an empty Doc, and doc[0] below would
    # raise IndexError. There is nothing to classify, so use the default.
    if len(doc) == 0:
        return "OTHER"

    # 1. Pronouns are classified as PERSON outright.
    # NOTE(review): this applies to any token tagged PRON, which may include
    # non-personal pronouns — confirm that is intended.
    if doc[0].pos_ == "PRON":
        return "PERSON"

    # 2. NER categories: the first entity with a mapped label decides.
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return "PERSON"
        elif ent.label_ in ["PRODUCT", "ORG", "WORK_OF_ART", "LAW", "EVENT"]:
            return "PRODUCT"

    # 3. Default: OTHER.
    return "OTHER"



In [11]:
# Smoke-test classify_noun on a mix of names, brands, pronouns and common nouns.
# NOTE(review): the recorded output below shows "iPhone" falling through to
# OTHER, i.e. no PRODUCT-mapped entity was detected for it.
tests = ["John", "Nike", "iPhone", "I", "me", "photo", "quality", "wallet"]

for t in tests:
    # Pad each input to a 10-character column so the arrows line up.
    print(f"{t:10} → {classify_noun(t)}")


John       → PERSON
Nike       → PRODUCT
iPhone     → OTHER
I          → PERSON
me         → PERSON
photo      → OTHER
quality    → OTHER
wallet     → OTHER


In [None]:
from collections import Counter
import pandas as pd

def count_adj_frequency(annotated_list):
    """Tabulate how often each adjective occurs.

    Args:
        annotated_list: Iterable of indexable rows; element 0 of each row is
            taken as the adjective.

    Returns:
        A DataFrame with columns "adj" and "count", ordered by "count" from
        highest to lowest, with a fresh 0..n-1 index.
    """
    # Tally the first field of every row.
    tally = Counter(entry[0] for entry in annotated_list)

    # One (adjective, occurrences) pair per table row.
    pairs = list(tally.items())
    freq_table = pd.DataFrame(pairs, columns=["adj", "count"])

    # Most frequent first; drop the pre-sort index.
    ranked = freq_table.sort_values(by="count", ascending=False)
    return ranked.reset_index(drop=True)


In [None]:
# Build the adjective frequency table (sorted by count, descending).
# annotated_list is defined outside this excerpt.
adj_freq_df = count_adj_frequency(annotated_list)
print(adj_freq_df.head(20))    # the 20 most frequent adjectives


In [None]:
# Keep only adjectives that occur more than 50 times.
filtered_adj = adj_freq_df[adj_freq_df["count"] > 50]
print(filtered_adj)


In [None]:
import pandas as pd
from collections import defaultdict

def adj_category_stats(annotated_list):
    """Summarise, per adjective, how its nouns split across categories.

    Args:
        annotated_list: Iterable of (adj, noun, category) triples.

    Returns:
        A DataFrame with one row per adjective and the columns
        "adj", "total", "person_ratio", "product_ratio", "other_ratio",
        "person_count", "product_count", "other_count", ordered by
        "person_ratio" from highest to lowest with a fresh 0..n-1 index.
        "total" counts every occurrence of the adjective, whatever its
        category; the ratio columns are the matching count divided by total.
    """
    # Nested tally: adjective -> category -> number of occurrences.
    tallies = defaultdict(lambda: defaultdict(int))
    for adj, _noun, cat in annotated_list:
        tallies[adj][cat] += 1

    # The three categories that get their own ratio/count columns.
    labels = ("PERSON", "PRODUCT", "OTHER")

    # Flatten the nested tally into one record per adjective.
    records = []
    for adj, per_cat in tallies.items():
        total = sum(per_cat.values())
        counts = [per_cat.get(label, 0) for label in labels]
        ratios = [(n / total if total else 0) for n in counts]
        records.append([adj, total, *ratios, *counts])

    column_names = [
        "adj", "total",
        "person_ratio", "product_ratio", "other_ratio",
        "person_count", "product_count", "other_count",
    ]
    table = pd.DataFrame(records, columns=column_names)

    # Highest PERSON share first.
    ranked = table.sort_values(by="person_ratio", ascending=False)
    return ranked.reset_index(drop=True)


In [None]:
# Per-adjective category breakdown; the result is sorted by person_ratio, descending.
# annotated_list is defined outside this excerpt.
adj_stats_df = adj_category_stats(annotated_list)
print(adj_stats_df.head(20))   # adjectives with the highest PERSON share
