In [3]:
import kagglehub

# Kaggle dataset handle passed to kagglehub
DATASET_HANDLE = "smagnan/1-million-reddit-comments-from-40-subreddits"

# Download the latest version; `path` is the local location of the dataset files
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/smagnan/1-million-reddit-comments-from-40-subreddits?dataset_version_number=1...


100%|██████████| 71.2M/71.2M [00:02<00:00, 27.4MB/s]

Extracting files...





Path to dataset files: /home/artemuna/.cache/kagglehub/datasets/smagnan/1-million-reddit-comments-from-40-subreddits/versions/1


In [5]:
import pandas as pd
import os

# Reuse `path` as returned by kagglehub.dataset_download in the download cell
# instead of hard-coding one user's absolute cache directory, so the notebook
# runs unchanged on other machines.

# Pick the first CSV deterministically: os.listdir returns entries in
# arbitrary order, so sort before indexing.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith(".csv"))
if not csv_files:
    # Fail with an actionable message rather than a bare IndexError below.
    raise FileNotFoundError(f"No .csv files found in dataset directory: {path!r}")

df = pd.read_csv(os.path.join(path, csv_files[0]))

print(df.head())
print(df.shape)



       subreddit                                               body  \
0  gameofthrones  Your submission has been automatically removed...   
1            aww  Dont squeeze her with you massive hand, you me...   
2         gaming  It's pretty well known and it was a paid produ...   
3           news  You know we have laws against that currently c...   
4       politics  Yes, there is a difference between gentle supp...   

   controversiality  score  
0                 0      1  
1                 0     19  
2                 0      3  
3                 0     10  
4                 0      1  
(1000000, 4)


In [6]:
import re
import requests
from functools import lru_cache
from nltk.corpus import wordnet as wn

# ---------- 1. Basic person lexicon (you can extend) ----------

# Personal pronouns that always denote a person; compared against lowercased tokens.
PERSON_PRONOUNS = {
    # subject forms
    "i", "you", "he", "she", "we", "they",
    # object forms
    "me", "him", "her", "us", "them",
    # reflexive forms ("yourselves" was missing next to "ourselves"/"themselves")
    "myself", "yourself", "himself", "herself",
    "ourselves", "yourselves", "themselves"
}

# Closed list of common nouns that are always treated as PERSON.
# Built as the union of three thematic groups; the result is a plain set.
PERSON_LEXICON = (
    # generic human nouns
    {
        "person", "people", "human", "humans", "individual", "individuals",
        "man", "men", "woman", "women",
        "guy", "guys", "girl", "girls", "boy", "boys",
        "child", "children", "kid", "kids",
        "adult", "adults", "teen", "teens", "teenager", "teenagers",
    }
    # roles / occupations (frequent)
    | {
        "teacher", "doctor", "nurse", "driver", "customer",
        "worker", "employee", "employer",
        "student", "passenger", "client", "manager",
        "police", "officer", "chef", "waiter", "waitress",
        "farmer", "engineer", "scientist", "actor", "actress",
        "boss", "colleague", "partner", "friend", "buddy",
    }
    # address terms
    | {"sir", "madam", "bro", "dude", "mate", "folks", "pal"}
)


# ---------- 2. Helpers using WordNet ----------

def _wordnet_lexnames(noun: str):
    """
    Collect the distinct lexnames of every noun synset WordNet has for `noun`.

    Returns a set of strings; the set is empty when WordNet returns no
    noun synsets for the word.
    """
    collected = set()
    for sense in wn.synsets(noun, pos=wn.NOUN):
        collected.add(sense.lexname())
    return collected


def _wordnet_is_person(noun: str) -> bool:
    """
    Tell whether any noun sense of `noun` carries the "noun.person" lexname.

    A single matching sense is enough, regardless of the word's other senses.
    """
    return "noun.person" in _wordnet_lexnames(noun)


def _wordnet_is_location(noun: str) -> bool:
    """Tell whether any noun sense of `noun` carries the "noun.location" lexname."""
    for lexname in _wordnet_lexnames(noun):
        if lexname == "noun.location":
            return True
    return False


def _wordnet_is_artifact_or_object(noun: str) -> bool:
    """
    Tell whether any noun sense of `noun` falls in a non-person, thing-like category.

    The categories counted as object-like are artifact, object, food, animal,
    plant, body and substance.
    """
    non_person_lexnames = {
        "noun.artifact",
        "noun.object",
        "noun.food",
        "noun.animal",
        "noun.plant",
        "noun.body",
        "noun.substance",
    }
    # A non-empty intersection means at least one sense is object-like.
    return bool(non_person_lexnames & _wordnet_lexnames(noun))


def _wordnet_is_group_org_like(noun: str) -> bool:
    """
    Tell whether any noun sense of `noun` carries the "noun.group" lexname.

    The caller treats such nouns (groups, organizations, teams, ...) as ORG.
    """
    return any(lexname == "noun.group" for lexname in _wordnet_lexnames(noun))


# ---------- 3. Helpers using ConceptNet (cached) ----------

CONCEPTNET_API = "https://api.conceptnet.io/query"


@lru_cache(maxsize=2048)
def _conceptnet_fetch_is_a(term: str):
    """
    Fetch the IsA targets of an already-normalized ConceptNet term.

    Returns a tuple of lowercased end-node labels. Any network, HTTP or
    decoding problem is raised rather than swallowed: lru_cache only stores
    return values, so a failed lookup is retried on the next call instead of
    being memoized as "no results".
    """
    resp = requests.get(
        CONCEPTNET_API,
        # `start` keeps only edges "<term> IsA X". The previous `node` filter
        # also matched edges "X IsA <term>", whose end label is the term itself.
        # Passing `params` lets requests URL-encode the term.
        params={"start": f"/c/en/{term}", "rel": "/r/IsA", "limit": 20},
        timeout=2,
    )
    if resp.status_code != 200:
        raise requests.HTTPError(
            f"ConceptNet returned HTTP {resp.status_code}", response=resp
        )
    data = resp.json()
    labels = []
    for edge in data.get("edges", []):
        label = (edge.get("end") or {}).get("label")
        if label:
            labels.append(label.lower())
    # Immutable so callers cannot corrupt the cached value.
    return tuple(labels)


def _conceptnet_is_a_targets(noun: str):
    """
    Query ConceptNet for IsA relations and return a list of target labels.

    Successful lookups are cached per normalized term to avoid repeated HTTP
    calls. Best-effort: returns an empty list when the noun is blank or when
    ConceptNet is unreachable / answers with an unusable payload.
    """
    term = noun.strip().lower().replace(" ", "_")
    if not term:
        return []
    try:
        # Fresh list on every call so the cached tuple is never shared.
        return list(_conceptnet_fetch_is_a(term))
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        # Network/HTTP error, invalid JSON, or unexpected payload shape.
        return []


# Keep the cache-management helpers reachable under the original name.
_conceptnet_is_a_targets.cache_info = _conceptnet_fetch_is_a.cache_info
_conceptnet_is_a_targets.cache_clear = _conceptnet_fetch_is_a.cache_clear


def _conceptnet_class(noun: str):
    """
    Use ConceptNet IsA targets to infer a coarse class.

    Returns one of "PERSON", "LOCATION", "ORG", "OBJECT", or None when there
    are no IsA targets or none of them mentions a known keyword. Classes are
    tested in that order and the first match wins.

    Keywords are matched against whole words of the target labels (a trailing
    plural "s" is tolerated). Plain substring matching misfires on labels such
    as "performance" ("man"), "electricity" ("city") or "steam" ("team").
    The trade-off is that closed compounds such as "businessman" no longer
    match.
    """
    labels = _conceptnet_is_a_targets(noun)
    if not labels:
        return None

    # Ordered (class, keywords) pairs; order encodes priority.
    keyword_classes = (
        # Directly human-like
        ("PERSON", {"person", "human", "man", "woman", "boy", "girl", "people"}),
        # Location-like
        ("LOCATION", {"place", "city", "country", "region", "location", "town", "village"}),
        # Organization-like
        ("ORG", {"company", "organization", "institution", "university", "school", "team"}),
        # Object-like (device, tool, vehicle, furniture, food, etc.)
        ("OBJECT", {
            "object", "artifact", "device", "tool", "vehicle", "furniture",
            "machine", "food", "drink", "instrument", "appliance",
        }),
    )

    # Pool the words of all labels; also add a naive singular for "-s" plurals
    # so that e.g. "tools" still matches the keyword "tool".
    label_words = set()
    for lbl in labels:
        for word in re.findall(r"[a-z]+", lbl.lower()):
            label_words.add(word)
            if len(word) > 3 and word.endswith("s"):
                label_words.add(word[:-1])

    for class_name, keywords in keyword_classes:
        if not keywords.isdisjoint(label_words):
            return class_name

    return None


# ---------- 4. Main classifier (no LLM) ----------

def classify_noun_advanced(noun: str) -> str:
    """
    Classify a noun into:
        PERSON / OBJECT / LOCATION / ORG / OTHER

    The input is stripped and lowercased first; an empty or blank noun
    yields "OTHER".

    Priority:
      1. Pronouns and explicit person lexicon        -> PERSON
      2. WordNet: person, then location, then group (ORG),
         then artifact/object-like (OBJECT)
      3. ConceptNet IsA (whatever class it reports)
      4. Suffix heuristic: a typical abstract-noun ending -> OTHER,
         anything else -> OBJECT
    """
    if not noun:
        return "OTHER"

    # normalize
    token = noun.strip().lower()
    if not token:
        return "OTHER"

    # 1) Pronouns and person lexicon
    if token in PERSON_PRONOUNS or token in PERSON_LEXICON:
        return "PERSON"

    # 2) WordNet-based decisions
    if _wordnet_is_person(token):
        return "PERSON"

    if _wordnet_is_location(token):
        return "LOCATION"

    if _wordnet_is_group_org_like(token):
        # You may want to distinguish ORG vs generic group, but treat as ORG here
        return "ORG"

    if _wordnet_is_artifact_or_object(token):
        # artifact / object / food / animal / body-part etc -> treat as OBJECT (non-person)
        return "OBJECT"

    # 3) ConceptNet-based decisions (if available)
    cn_class = _conceptnet_class(token)
    if cn_class is not None:
        return cn_class

    # 4) Simple heuristic: words with an abstract-noun suffix are OTHER,
    #    everything else is assumed concrete. The suffixes must not carry a
    #    leading hyphen, otherwise endswith() never matches a real word and
    #    every unknown noun is labelled OBJECT.
    abstract_suffixes = ("ism", "ity", "ness", "tion", "sion", "ment", "ship")
    if token.endswith(abstract_suffixes):
        return "OTHER"

    return "OBJECT"
