# nlpm Notebook
Environment: nlpm

In [1]:
import spacy

# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse.
text = "The happy customer praised the friendly staff and the delicious food."

# Run the sentence through the pipeline.
doc = nlp(text)

# Collect (adjective, noun) pairs, i.e. adjectives that modify a noun.
pairs = []
for token in doc:
    # Skip anything that is not an adjective attached via the 'amod' dependency.
    if token.pos_ != "ADJ" or token.dep_ != "amod":
        continue
    head = token.head  # the word this adjective is attached to
    # Keep the pair only when that head word is tagged as a noun.
    if head.pos_ != "NOUN":
        continue
    pairs.append((token.text, head.text))

# Report the pairs, one per line.
print("Adjective-Noun pairs:")
for adj, noun in pairs:
    print(f"{adj} -> {noun}")


Adjective-Noun pairs:
happy -> customer
friendly -> staff
delicious -> food


In [6]:
import json

# NOTE(review): machine-specific absolute path; consider a configurable data directory.
path = r"/mnt/d/acode/nlp/data/Electronics.jsonl"

# Read the JSONL file line by line, keeping only the review text and the rating.
records = []
skipped = 0  # lines that were not valid JSON or did not decode to a JSON object
# errors='ignore' drops undecodable bytes instead of raising (best-effort read).
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
    for i, line in enumerate(f):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            record = None

        # A line may hold valid JSON that is not an object (null, number, list);
        # calling .get() on it would raise AttributeError, so treat it as skipped.
        if isinstance(record, dict):
            records.append({
                "reviewText": record.get("reviewText", ""),
                "overall": record.get("overall", None)
            })
        else:
            skipped += 1

        # Progress report every 100,000 lines read (malformed lines count too).
        # Nothing is flushed here: `records` keeps growing in memory. This is
        # the place to process or write out a batch if memory becomes a concern.
        if (i + 1) % 100000 == 0:
            print(f"Loaded {i+1} lines")

print(f"Kept {len(records)} records, skipped {skipped} lines")



Loaded 100000 lines
Loaded 200000 lines
Loaded 300000 lines
Loaded 400000 lines
Loaded 500000 lines
Loaded 600000 lines
Loaded 700000 lines
Loaded 800000 lines
Loaded 900000 lines
Loaded 1000000 lines
Loaded 1100000 lines
Loaded 1200000 lines
Loaded 1300000 lines
Loaded 1400000 lines
Loaded 1500000 lines
Loaded 1600000 lines
Loaded 1700000 lines
Loaded 1800000 lines
Loaded 1900000 lines
Loaded 2000000 lines
Loaded 2100000 lines
Loaded 2200000 lines
Loaded 2300000 lines
Loaded 2400000 lines
Loaded 2500000 lines
Loaded 2600000 lines
Loaded 2700000 lines
Loaded 2800000 lines
Loaded 2900000 lines
Loaded 3000000 lines
Loaded 3100000 lines
Loaded 3200000 lines
Loaded 3300000 lines
Loaded 3400000 lines
Loaded 3500000 lines
Loaded 3600000 lines
Loaded 3700000 lines
Loaded 3800000 lines
Loaded 3900000 lines
Loaded 4000000 lines
Loaded 4100000 lines
Loaded 4200000 lines
Loaded 4300000 lines
Loaded 4400000 lines
Loaded 4500000 lines
Loaded 4600000 lines
Loaded 4700000 lines
Loaded 4800000 lines
L

KeyboardInterrupt: 