# nlpm Notebook
Environment: nlpm

In [21]:

import spacy

nlp = spacy.load("en_core_web_sm")

text = "Crappy socks Money wasted Bought to wear with my tieks Don’t stay on feet well"
doc = nlp(text)

# Adjacency heuristic: keep every ADJ token that is directly followed by a NOUN token.
pairs = [
    (doc[pos].text, doc[pos + 1].text)
    for pos in range(len(doc) - 1)
    if doc[pos].pos_ == "ADJ" and doc[pos + 1].pos_ == "NOUN"
]

# One "adjective -> noun" line per pair found
print("Adjective-Noun pairs:")
for adj, noun in pairs:
    print(f"{adj} -> {noun}")


Adjective-Noun pairs:
Crappy -> socks


In [24]:
import spacy

# Load the English pipeline
nlp = spacy.load("en_core_web_sm")

# Sample review to analyse
text = "I think this locket is really pretty. The inside back is a solid silver depression and the front is a dome that is not solid (knotted). You could use it to store a small photo, lock of hair, etc but I use it when I need to carry medication with me. Closes securely. High quality & very pretty."

# Run the spaCy pipeline over the text
doc = nlp(text)

# Adjective-noun pairs (adjective modifying a noun): keep tokens tagged ADJ
# whose dependency label is 'amod' and whose head token is tagged NOUN.
pairs = [
    (token.text, token.head.text)
    for token in doc
    if token.pos_ == "ADJ" and token.dep_ == "amod" and token.head.pos_ == "NOUN"
]

# Show one pair per line
print("Adjective-Noun pairs:")
for adj, noun in pairs:
    print(f"{adj} -> {noun}")


Adjective-Noun pairs:
inside -> back
solid -> depression
silver -> depression
small -> photo
High -> quality


In [25]:
import spacy

# Load spaCy English model once globally (avoid reloading every call).
# extract_adj_noun_pairs() reads this module-level `nlp` each time it is called.
nlp = spacy.load("en_core_web_sm")

def extract_adj_noun_pairs(sentence):
    """
    Collect (adjective, noun) pairs linked by an ``amod`` dependency.

    The text is parsed with the module-level ``nlp`` pipeline. A token
    contributes a pair when it is tagged ADJ, its dependency label is
    ``amod`` and its head token is tagged NOUN.

    Returns a list of ``(adjective_text, noun_text)`` tuples in token order.
    """
    return [
        (tok.text, tok.head.text)
        for tok in nlp(sentence)
        if tok.pos_ == "ADJ" and tok.dep_ == "amod" and tok.head.pos_ == "NOUN"
    ]


In [26]:
# Same review text as the inline amod experiment earlier in the notebook.
text = "I think this locket is really pretty. The inside back is a solid silver depression and the front is a dome that is not solid (knotted). You could use it to store a small photo, lock of hair, etc but I use it when I need to carry medication with me. Closes securely. High quality & very pretty."

# Run the reusable helper on it and print the raw list of (adj, noun) tuples.
pairs = extract_adj_noun_pairs(text)
print(pairs)


[('inside', 'back'), ('solid', 'depression'), ('silver', 'depression'), ('small', 'photo'), ('High', 'quality')]


## use less

In [23]:
import json

path = r"/mnt/d/acode/nlp/data/Electronics.jsonl"

records = []
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
    for i, line in enumerate(f):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Malformed JSON: drop the line (a progress report due on it is dropped too)
            continue

        # Keep only the two fields of interest
        records.append({
            "reviewText": record.get("reviewText", ""),
            "overall": record.get("overall"),
        })

        # Memory control: checkpoint every 100,000 lines (currently only reports progress)
        if (i + 1) % 100000 == 0:
            print(f"Loaded {i+1} lines")
            # A chunk could be processed or written to a file at this point



Loaded 100000 lines
Loaded 200000 lines
Loaded 300000 lines
Loaded 400000 lines
Loaded 500000 lines
Loaded 600000 lines
Loaded 700000 lines
Loaded 800000 lines
Loaded 900000 lines
Loaded 1000000 lines
Loaded 1100000 lines
Loaded 1200000 lines
Loaded 1300000 lines
Loaded 1400000 lines
Loaded 1500000 lines
Loaded 1600000 lines
Loaded 1700000 lines
Loaded 1800000 lines
Loaded 1900000 lines
Loaded 2000000 lines
Loaded 2100000 lines
Loaded 2200000 lines
Loaded 2300000 lines
Loaded 2400000 lines
Loaded 2500000 lines
Loaded 2600000 lines
Loaded 2700000 lines
Loaded 2800000 lines
Loaded 2900000 lines
Loaded 3000000 lines
Loaded 3100000 lines
Loaded 3200000 lines
Loaded 3300000 lines
Loaded 3400000 lines
Loaded 3500000 lines
Loaded 3600000 lines
Loaded 3700000 lines
Loaded 3800000 lines
Loaded 3900000 lines
Loaded 4000000 lines
Loaded 4100000 lines
Loaded 4200000 lines
Loaded 4300000 lines
Loaded 4400000 lines
Loaded 4500000 lines
Loaded 4600000 lines
Loaded 4700000 lines
Loaded 4800000 lines
L

KeyboardInterrupt: 

In [None]:
import json

path = r"/mnt/d/acode/nlp/data/Electronics.jsonl"

SKIP = 10_000_000      # number of leading lines to skip
MAX_READ = 1_000_000   # maximum number of lines to read after the skipped prefix

records = []
total_read = 0   # lines consumed after the skip (parsed or not)
non_empty = 0    # stored records whose reviewText has non-whitespace content

with open(path, "r", encoding="utf-8", errors="ignore") as f:
    # Advance past the first SKIP lines, counting how many were actually
    # consumed so the report stays truthful when the file is shorter than SKIP.
    skipped = 0
    for _ in range(SKIP):
        if next(f, None) is None:
            print("File ended before SKIP. No more data.")
            break
        skipped += 1
    print(f"Skipped {skipped:,} lines.")

    # Read next chunk
    for i, line in enumerate(f):
        if i >= MAX_READ:
            break

        total_read += 1

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Malformed line: counted in total_read but not stored
            continue

        # NOTE(review): the recorded run of this cell found no non-empty
        # "reviewText" in 1,000,000 lines -- these keys may not match the
        # file's schema; confirm the field names.
        review = record.get("reviewText", None)
        overall = record.get("overall", None)

        # Empty reviews are still stored; they are only left out of the count
        if review and review.strip():
            non_empty += 1

        records.append({
            "reviewText": review,
            "overall": overall
        })

        if (i + 1) % 100000 == 0:
            print(f"Read {i+1:,} lines. Non-empty so far: {non_empty:,}")

print(f"\nFinished: total lines read = {total_read:,}")
print(f"Non-empty reviewText = {non_empty:,}")
print(f"Records stored = {len(records):,}")




Skipped 10,000,000 lines.
Read 100,000 lines. Non-empty so far: 0
Read 200,000 lines. Non-empty so far: 0
Read 300,000 lines. Non-empty so far: 0
Read 400,000 lines. Non-empty so far: 0
Read 500,000 lines. Non-empty so far: 0
Read 600,000 lines. Non-empty so far: 0
Read 700,000 lines. Non-empty so far: 0
Read 800,000 lines. Non-empty so far: 0
Read 900,000 lines. Non-empty so far: 0
Read 1,000,000 lines. Non-empty so far: 0

Finished: total lines read = 1,000,000
Non-empty reviewText = 0
Records stored = 1,000,000


{'reviewText': None, 'overall': None}


In [None]:
import json

path = r"/mnt/d/acode/amz/beauty/1/meta_Amazon_Fashion.jsonl"

MAX_READ = 1_000_000
records = []
total_read = 0

with open(path, "r", encoding="utf-8", errors="ignore") as f:
    for i, line in enumerate(f):
        # Stop once MAX_READ physical lines have been consumed
        if i >= MAX_READ:
            break

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Unparseable line: skip it without counting it
            continue
        else:
            records.append(record)
            total_read += 1

        # Progress report on every 100,000th line that parsed successfully
        if not (i + 1) % 100000:
            print(f"Loaded {i+1:,} lines")

print(f"\nFinished reading {total_read:,} lines.")


Loaded 100,000 lines
Loaded 200,000 lines
Loaded 300,000 lines
Loaded 400,000 lines
Loaded 500,000 lines
Loaded 600,000 lines
Loaded 700,000 lines
Loaded 800,000 lines

Finished reading 826,108 lines.


In [None]:
# List the keys of the first loaded record, or say so when nothing was loaded
if records:
    print("Columns:", list(records[0].keys()))
else:
    print("records is empty. No data loaded.")


Columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together']


In [None]:
# Dump up to five raw records for a quick look at the data
if records:
    print("\nFirst 5 records:")
    for sample in records[:5]:
        print(sample)



First 5 records:
{'main_category': 'AMAZON FASHION', 'title': "YUEDGE 5 Pairs Men's Moisture Control Cushioned Dry Fit Casual Athletic Crew Socks for Men (Blue, Size 9-12)", 'average_rating': 4.6, 'rating_number': 16, 'features': [], 'description': [], 'price': None, 'images': [{'thumb': 'https://m.media-amazon.com/images/I/41+cCfaVOFS._AC_SR38,50_.jpg', 'large': 'https://m.media-amazon.com/images/I/41+cCfaVOFS._AC_.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/images/I/81XlFXImFrS._AC_UL1500_.jpg'}, {'thumb': 'https://m.media-amazon.com/images/I/41jBdP7etRS._AC_SR38,50_.jpg', 'large': 'https://m.media-amazon.com/images/I/41jBdP7etRS._AC_.jpg', 'variant': 'PT01', 'hi_res': 'https://m.media-amazon.com/images/I/61+yVkHHQ3S._AC_UL1200_.jpg'}, {'thumb': 'https://m.media-amazon.com/images/I/41UGJiRe7UL._AC_SR38,50_.jpg', 'large': 'https://m.media-amazon.com/images/I/41UGJiRe7UL._AC_.jpg', 'variant': 'PT02', 'hi_res': 'https://m.media-amazon.com/images/I/61vbh6sLR1L._AC_UL1

## comments

In [None]:
import json

path = r"/mnt/d/acode/amz/beauty/Amazon_Fashion.jsonl"

MAX_READ = 1_000_000
records2 = []
total_read = 0

# ===== Read at most MAX_READ lines =====
with open(path, "r", encoding="utf-8", errors="ignore") as f:
    for i, line in enumerate(f):
        if i >= MAX_READ:
            break

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Unparseable line: skip it without counting it
            continue
        else:
            records2.append(record)
            total_read += 1

        # Progress report on every 100,000th line that parsed successfully
        if not (i + 1) % 100000:
            print(f"Loaded {i+1:,} lines")

print(f"\nFinished reading {total_read:,} lines into records2.\n")

# ===== Print columns, then the first 5 rows =====
if records2:
    print("Columns:", list(records2[0].keys()))
    print("\nFirst 5 records:")
    for r in records2[:5]:
        print(r)
else:
    print("records2 is empty.")


Loaded 100,000 lines
Loaded 200,000 lines
Loaded 300,000 lines
Loaded 400,000 lines
Loaded 500,000 lines
Loaded 600,000 lines
Loaded 700,000 lines
Loaded 800,000 lines
Loaded 900,000 lines
Loaded 1,000,000 lines

Finished reading 1,000,000 lines into records2.

Columns: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']

First 5 records:
{'rating': 5.0, 'title': 'Pretty locket', 'text': 'I think this locket is really pretty. The inside back is a solid silver depression and the front is a dome that is not solid (knotted). You could use it to store a small photo, lock of hair, etc but I use it when I need to carry medication with me. Closes securely. High quality & very pretty.', 'images': [], 'asin': 'B00LOPVX74', 'parent_asin': 'B00LOPVX74', 'user_id': 'AGBFYI2DDIKXC5Y4FARTYDTQBMFQ', 'timestamp': 1578528394489, 'helpful_vote': 3, 'verified_purchase': True}
{'rating': 5.0, 'title': 'A', 'text': 'Great', 'images': [],

In [27]:
all_pairs = []   # one entry per review, kept in the same order as records2

for idx, rec in enumerate(records2):
    sentence = rec.get("text", "")

    # Reviews with missing/empty text get an empty list so positions stay aligned
    pairs = extract_adj_noun_pairs(sentence) if sentence else []
    all_pairs.append(pairs)

    # Report progress every 100k rows. This now runs for empty-text rows as
    # well; previously the early `continue` skipped the report on such rows.
    if (idx + 1) % 100000 == 0:
        print(f"Processed {idx+1:,} rows")


Processed 100,000 rows
Processed 200,000 rows
Processed 300,000 rows
Processed 400,000 rows
Processed 500,000 rows
Processed 600,000 rows
Processed 700,000 rows
Processed 800,000 rows
Processed 900,000 rows
Processed 1,000,000 rows


## frequency

In [None]:
# Preview only the first few entries; all_pairs holds one list per review,
# so displaying it whole would flood the notebook output.
all_pairs[:5]


In [60]:
from collections import Counter
import pandas as pd

def count_adj_from_all_pairs(all_pair):
    """
    Count how often each adjective occurs across all extracted pairs.

    Parameters
    ----------
    all_pair : iterable
        One element per review; each element is expected to be a list/tuple
        of ``(adj, noun)`` pairs. Elements and pairs that are not a list or
        tuple, empty pairs, and falsy adjectives (``None``, ``""``) are
        skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``adj`` and ``count``, ordered by ``count`` descending with a
        0..n-1 index. Adjectives with equal counts keep the order in which
        they were first encountered, so the result is deterministic.
    """
    counter = Counter(
        row[0]                                   # first item of a pair is the adjective
        for group in all_pair                    # each review
        if isinstance(group, (list, tuple))
        for row in group                         # each pair within the review
        if isinstance(row, (list, tuple)) and len(row) >= 1 and row[0]
    )

    # most_common() sorts by count (descending) and documents first-seen order
    # for ties, unlike sort_values() with its default non-stable sort.
    return pd.DataFrame(counter.most_common(), columns=["adj", "count"])



In [None]:
df1 = count_adj_from_all_pairs(all_pairs)

# Show the most frequent adjectives (the frame is sorted by count, descending)
df1.head()


     adj  count
0   good  48160
1  great  45992
2  Great  41623
3   nice  41506
4  other  31910


In [74]:
# Keep the adjectives that occur more than 100 times and report how many remain
df1_over_100 = df1.loc[df1["count"].gt(100)]
print(df1_over_100.shape[0])


855


In [75]:
import spacy
from collections import defaultdict
import pandas as pd

# ---------------------------
# 1. Load spaCy model
# ---------------------------
# Binds the module-level `nlp` pipeline that classify_noun() below calls.
nlp = spacy.load("en_core_web_sm")

# ---------------------------
# 2. Noun classification function
# ---------------------------
def classify_noun(noun):
    """
    Classify a noun as "PERSON", "PRODUCT" or "OTHER".

    The noun is run on its own through the module-level ``nlp`` pipeline and
    three rules are applied in order:

    1. the first token is tagged PRON            -> "PERSON"
    2. the first named entity with a known label decides:
       PERSON                                    -> "PERSON"
       PRODUCT / ORG / WORK_OF_ART / LAW / EVENT -> "PRODUCT"
    3. anything else                             -> "OTHER"

    An empty or ``None`` noun returns "OTHER" without being parsed.
    """
    # Nothing to analyse: an empty string parses to an empty Doc, on which
    # doc[0] below would raise IndexError.
    if not noun:
        return "OTHER"

    doc = nlp(noun)

    # rule 1: pronoun = person
    if doc[0].pos_ == "PRON":
        return "PERSON"

    # rule 2: spaCy NER -- entities are checked in order; unlisted labels are ignored
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return "PERSON"
        if ent.label_ in ("PRODUCT", "ORG", "WORK_OF_ART", "LAW", "EVENT"):
            return "PRODUCT"

    # rule 3: default
    return "OTHER"

# ---------------------------
# 3. Clean and expand the pair data
# ---------------------------
def clean_and_expand_pairs_nested(pair_list, classifier=None):
    """
    Flatten nested (adj, noun) pairs and attach a category to each noun.

    Parameters
    ----------
    pair_list : iterable
        One element per review; each element is a list/tuple of pairs.
        Groups or pairs that are not a list/tuple, pairs with fewer than two
        items, and pairs with a falsy adjective or noun are skipped.
    classifier : callable, optional
        Function mapping a noun to its category. Defaults to
        ``classify_noun``. It is called once per distinct noun and the result
        is reused, so it must return the same category for the same noun.

    Returns
    -------
    list
        ``[adj, noun, category]`` lists in the order the pairs were seen.
    """
    if classifier is None:
        classifier = classify_noun

    # Memo of noun -> category: the same nouns recur across many pairs and
    # classifying one means running the whole NLP pipeline on it.
    category_by_noun = {}
    cleaned = []

    for group in pair_list:
        # group holds all pairs of one review
        if not isinstance(group, (list, tuple)):
            continue

        for row in group:
            # row is a single (adj, noun) pair
            if not isinstance(row, (list, tuple)) or len(row) < 2:
                continue

            adj, noun = row[0], row[1]
            if not adj or not noun:
                continue

            if noun not in category_by_noun:
                category_by_noun[noun] = classifier(noun)
            cleaned.append([adj, noun, category_by_noun[noun]])

    return cleaned


In [52]:
# Hand-made sample in the nested layout: one inner list of (adj, noun) pairs
# per review, including an empty review.
test = [
    [('Good', 'quality'), ('Great', 'quality')],
    [('small', 'photo')],
    [],
    [('solid','depression'), ('silver','depression')]
]

# Flatten the sample and attach a category to every pair
cleaned = clean_and_expand_pairs_nested(test)

# Print at most the first five [adj, noun, category] rows
for row in cleaned[:5]:
    print(row)



['Good', 'quality', 'OTHER']
['Great', 'quality', 'OTHER']
['small', 'photo', 'OTHER']
['solid', 'depression', 'OTHER']
['silver', 'depression', 'OTHER']


In [53]:

# Flatten and categorise every extracted pair; this rebinds `cleaned`.
cleaned = clean_and_expand_pairs_nested(all_pairs)
# Print at most the first five [adj, noun, category] rows
for row in cleaned[:5]:
    print(row)


['inside', 'back', 'OTHER']
['solid', 'depression', 'OTHER']
['silver', 'depression', 'OTHER']
['small', 'photo', 'OTHER']
['High', 'quality', 'OTHER']


In [77]:
import pandas as pd
from collections import defaultdict

def adj_category_stats_filtered(cleaned, df1_over_100):
    """
    Per-adjective breakdown of noun categories, limited to selected adjectives.

    Parameters
    ----------
    cleaned : iterable of (adj, noun, category)
        Rows as produced by the cleaning step. A category other than
        "PERSON" or "PRODUCT" is tallied as "other".
    df1_over_100 : pandas.DataFrame
        Its ``adj`` column lists the adjectives to keep; all other rows of
        ``cleaned`` are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per kept adjective that occurs in ``cleaned``, with columns
        adj, total, person_ratio, product_ratio, other_ratio, person_count,
        product_count, other_count, sorted by person_ratio descending.
    """
    # Set membership keeps the per-row filter cheap
    wanted = set(df1_over_100["adj"].tolist())

    # Category label -> counter key; anything unlisted falls into "other"
    bucket_for = {"PERSON": "person", "PRODUCT": "product"}

    stats = defaultdict(lambda: {"person": 0, "product": 0, "other": 0, "total": 0})
    for adj, _noun, cat in cleaned:
        if adj in wanted:
            tally = stats[adj]
            tally["total"] += 1
            tally[bucket_for.get(cat, "other")] += 1

    # One output row per adjective, in first-seen order
    rows = []
    for adj, tally in stats.items():
        total = tally["total"]
        person, product, other = tally["person"], tally["product"], tally["other"]
        rows.append([
            adj,
            total,
            person / total if total else 0,
            product / total if total else 0,
            other / total if total else 0,
            person,
            product,
            other,
        ])

    columns = [
        "adj", "total",
        "person_ratio", "product_ratio", "other_ratio",
        "person_count", "product_count", "other_count",
    ]
    frame = pd.DataFrame(rows, columns=columns)

    # Highest share of person nouns first
    return frame.sort_values(by="person_ratio", ascending=False).reset_index(drop=True)



In [78]:
# Category breakdown for the frequent adjectives only (count > 100 in df1)
result_df = adj_category_stats_filtered(cleaned, df1_over_100)
# Top 20 rows; the function returns the frame sorted by person_ratio, descending
print(result_df.head(20))



           adj  total  person_ratio  product_ratio  other_ratio  person_count  \
0          Muy    622      0.192926       0.191318     0.615756           120   
1       direct    222      0.135135       0.004505     0.860360            30   
2          muy    145      0.124138       0.158621     0.717241            18   
3       French    123      0.105691       0.008130     0.886179            13   
4        harsh    159      0.075472       0.012579     0.911950            12   
5        panty    173      0.075145       0.011561     0.913295            13   
6      intense    150      0.066667       0.006667     0.926667            10   
7       active    573      0.064572       0.003490     0.931937            37   
8      Ordered    110      0.054545       0.027273     0.918182             6   
9     vertical    186      0.053763       0.000000     0.946237            10   
10        avid    204      0.049020       0.039216     0.911765            10   
11     bifocal    178      0

In [79]:
import pandas as pd

# Tabular view of the [adj, noun, category] rows held in `cleaned`
df_cleaned = pd.DataFrame(cleaned, columns=["adj", "noun", "category"])


In [80]:
# Persist the categorised pairs as UTF-8 CSV (relative path, no index column)
df_cleaned.to_csv("cleaned_pairs.csv", index=False, encoding="utf-8")
