In [None]:
import math
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import stanza
import tqdm
import wordfreq

In [None]:
# 1. Load the corpus from a tab-separated file.
# Later cells read the `text` and `genre` columns of this frame.
CORPUS_PATH = "bulwer.tsv"
df = pd.read_csv(CORPUS_PATH, sep="\t")

In [None]:
# 2. Pipeline: build the stanza English pipeline once and reuse it for every text.
PIPELINE_OPTIONS = dict(
    lang='en',
    # depparse feeds dep_distance(); constituency feeds tree_depth().
    processors='tokenize,pos,lemma,depparse,constituency',
    # False keeps stanza's own sentence segmentation switched on.
    tokenize_no_ssplit=False,
    # NOTE(review): stanza documents use_gpu as "use a GPU if available";
    # confirm the CPU fallback on the installed version.
    use_gpu=True,
)
nlp = stanza.Pipeline(**PIPELINE_OPTIONS)

In [None]:
# 3. Sentence-level syntactic measures: dependency distance and constituency-tree depth
def dep_distance(sent):
    """Return the mean dependency distance of one parsed sentence.

    A word's distance is ``abs(word.id - word.head)``: the number of token
    positions between the word and its syntactic head. The root word has no
    governor, so it is left out of the average instead of contributing its
    own position as a spurious distance.

    Args:
        sent: Sentence object exposing ``words``. Each word needs integer
            ``id`` and ``head`` attributes; this assumes the CoNLL-U
            convention that stanza follows, where ``head == 0`` marks the
            root.

    Returns:
        The mean distance as a float, or ``0.0`` when the sentence contains
        no non-root word (for example a one-word sentence), which would
        otherwise divide by zero.
    """
    distances = [abs(w.id - w.head) for w in sent.words if w.head != 0]
    if not distances:
        return 0.0
    return sum(distances) / len(distances)

def tree_depth(const_node):
    """Count the nodes on the longest root-to-leaf path of a tree.

    A node whose ``children`` is empty counts as depth 1; any other node is
    one level deeper than its deepest child.

    Args:
        const_node: Tree node exposing a ``children`` collection of nodes of
            the same kind.

    Returns:
        The depth as an int (at least 1).
    """
    deepest = 0
    # Explicit stack of (node, level) pairs instead of recursion.
    pending = [(const_node, 1)]
    while pending:
        node, level = pending.pop()
        if node.children:
            pending.extend((child, level + 1) for child in node.children)
        elif level > deepest:
            # Only leaves can end a path, so only they update the maximum.
            deepest = level
    return deepest

In [None]:
# 4. Feature extraction: one record of syntactic and lexical features per text.
# Every per-text statement must stay inside the loop body; the features are
# computed from that row's `doc`.
records = []
skipped_rows = []  # index labels of rows that produced no tokens
for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    text = row['text']
    # Missing cells come back from pandas as float NaN; only a non-blank
    # string can be parsed.
    if not isinstance(text, str) or not text.strip():
        skipped_rows.append(idx)
        continue

    doc   = nlp(text)
    words = [w for s in doc.sentences for w in s.words]
    if not words:
        # Every ratio below divides by the token or sentence count.
        skipped_rows.append(idx)
        continue

    toks    = [w.text for w in words]
    lemmas  = [w.lemma for w in words]
    pos     = [w.upos for w in words]
    n_toks  = len(toks)
    n_sents = len(doc.sentences)

    # syntactic complexity
    sent_lens  = [len(s.words) for s in doc.sentences]
    mean_len   = sum(sent_lens)/n_sents
    # Coefficient of variation of sentence length; undefined for one sentence.
    cv_len     = (pd.Series(sent_lens).std()/mean_len) if n_sents > 1 else 0
    dep_dist   = sum(dep_distance(s) for s in doc.sentences)/n_sents
    depth      = sum(tree_depth(s.constituency) for s in doc.sentences)/n_sents
    sub_ratio  = pos.count('SCONJ')/n_toks   # rough proxy for subordination
    conj_ratio = pos.count('CCONJ')/n_toks

    # lexical creativity
    lemma_counts = Counter(lemmas)   # one pass instead of list.count per lemma
    ttr      = len(lemma_counts)/n_toks
    hapax    = sum(1 for c in lemma_counts.values() if c == 1)/n_toks
    avg_len  = sum(len(t) for t in toks)/n_toks
    adj_adv  = sum(1 for p in pos if p in ('ADJ', 'ADV'))/n_toks
    # Share of tokens with a Zipf frequency below 2.
    # NOTE(review): punctuation tokens probably fall below this threshold as
    # well; confirm whether they should be excluded.
    oov      = sum(1 for t in toks if wordfreq.zipf_frequency(t.lower(), 'en') < 2)/n_toks

    # "like a" / "like an" matches per sentence
    like_sim = len(re.findall(r'\blike a\b|\blike an\b', text.lower()))/n_sents

    records.append({
        'genre':   row['genre'],
        'mean_sent_len': mean_len,
        'cv_sent_len'  : cv_len,
        'dep_dist'     : dep_dist,
        'tree_depth'   : depth,
        'sub_ratio'    : sub_ratio,
        'conj_ratio'   : conj_ratio,
        'ttr'          : ttr,
        'hapax'        : hapax,
        'avg_token_len': avg_len,
        'adj_adv_ratio': adj_adv,
        'oov_ratio'    : oov,
        'simile_rate'  : like_sim
    })

feat_df = pd.DataFrame(records)

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} row(s) with no analysable text; first few: {skipped_rows[:10]}")


In [None]:
# 5. Genre-level comparison
# The per-text features are columns of feat_df, so the genre profile is the
# per-genre mean of each feature group.
SYNTACTIC_FEATURES = [
    "mean_sent_len", "cv_sent_len", "dep_dist",
    "tree_depth", "sub_ratio", "conj_ratio",
]
LEXICAL_FEATURES = [
    "ttr", "hapax", "avg_token_len",
    "adj_adv_ratio", "oov_ratio", "simile_rate",
]

# Each result has one row per genre and one column per feature.
complexity_by_genre = feat_df.groupby("genre")[SYNTACTIC_FEATURES].mean()
creativity_by_genre = feat_df.groupby("genre")[LEXICAL_FEATURES].mean()

print("Syntactic Complexity by Genre:")
print(complexity_by_genre)
print("\nLexical Creativity by Genre:")
print(creativity_by_genre)

In [None]:
# 6. Visualization
# Example: distribution of the per-text mean dependency distance in each genre.
# Reads feat_df, where the computed features live.
fig, ax = plt.subplots()
sns.boxplot(data=feat_df, x="genre", y="dep_dist", ax=ax)
ax.tick_params(axis="x", rotation=45)
ax.set_title("Mean Dependency Distance by Genre")
ax.set_ylabel("mean dependency distance")
plt.show()
