#### 将频数指标删除，改为比例指标，导出新的csv文件

In [7]:
# v3: 28-variable version
import pandas as pd

# Raw feature names (keys) and the short labels (values) used in the exported header.
new_column_names = {
    'ratio_unique_words': 'UW/W',
    'ratio_repeats': 'RP/W',
    'mean_num_clauses_per_sentence': 'C/S',
    'mean_num_t_units_per_sentence': 'T/S',
    'mean_num_verb_phrases_per_t_unit': 'VP/T',
    'mean_num_clauses_per_t_unit': 'C/T',
    'mean_length_sentences': 'MLS',
    'mean_length_clauses': 'MLC',
    'mean_length_t_units': 'MLT',
    'prop_noun_phrase': 'PNP',
    'prop_verb_phrase': 'PVP',
    'prop_adj_phrase': 'PAP',
    'prop_adv_phrase': 'PAdvP',
    'prop_prep_phrase': 'PPP',
    'mean_depth': 'MDN',
    'ratio_adv_mods': 'advmod',
    'ratio_adj_mods': 'amod',
    'ratio_nom_mods': 'nmod',
    'ratio_nom_subjs': 'nsubj',
    'ratio_cases': 'prep',
    'ratio_compounds': 'cmpd',
    'mean_dep_dis': 'MDD',
    'mean_hier_dis': 'MHD',
    'ratio_head_init_deps': 'HI/W',
    'ratio_head_final_deps': 'HF/W',
}

# Feature tables to convert; each one is read, transformed and re-exported below.
files = [
    'D:/Desktop/features_郊游风景图.csv',
    'D:/Desktop/features_三毛流浪记.csv',
    'D:/Desktop/features_牛郎织女.csv',
]

# Columns left out of the export: the row number and the raw counts
# that the per-word ratios are derived from.
dropped_columns = ['No', 'num_sentences', 'num_clauses', 'num_words']

for csv_path in files:
    data = pd.read_csv(csv_path)

    # Three extra per-word ratios, computed before the count columns are dropped.
    words = data['num_words']
    clauses = data['num_clauses']
    # num_clauses / mean_num_clauses_per_t_unit stands in for the T-unit count
    # (no T-unit count column is read here).
    t_units = clauses.div(data['mean_num_clauses_per_t_unit'])
    new_metrics = pd.DataFrame({
        'S/W': data['num_sentences'].div(words),
        'C/W': clauses.div(words),
        'T/W': t_units.div(words),
    })

    data = data.drop(columns=dropped_columns)

    # Split right after the date_subject column and splice the new ratios in between.
    subject_index = data.columns.get_loc('date_subject') + 1
    left = data.iloc[:, :subject_index]
    right = data.iloc[:, subject_index:]
    data = pd.concat([left, new_metrics, right], axis=1).rename(columns=new_column_names)

    # Write next to the input file; index=False keeps row numbers out of the CSV.
    output_file = csv_path.replace('.csv', '_v3.csv')
    data.to_csv(output_file, index=False)


In [1]:
# v4: drop HF/W
import pandas as pd

# Raw feature names (keys) and the short labels (values) used in the exported header.
new_column_names = {
    'ratio_unique_words': 'UW/W',
    'ratio_repeats': 'RP/W',
    'mean_num_clauses_per_sentence': 'C/S',
    'mean_num_t_units_per_sentence': 'T/S',
    'mean_num_verb_phrases_per_t_unit': 'VP/T',
    'mean_num_clauses_per_t_unit': 'C/T',
    'mean_length_sentences': 'MLS',
    'mean_length_clauses': 'MLC',
    'mean_length_t_units': 'MLT',
    'prop_noun_phrase': 'PNP',
    'prop_verb_phrase': 'PVP',
    'prop_adj_phrase': 'PAP',
    'prop_adv_phrase': 'PAdvP',
    'prop_prep_phrase': 'PPP',
    'mean_depth': 'MDN',
    'ratio_adv_mods': 'advmod',
    'ratio_adj_mods': 'amod',
    'ratio_nom_mods': 'nmod',
    'ratio_nom_subjs': 'nsubj',
    'ratio_cases': 'prep',
    'ratio_compounds': 'cmpd',
    'mean_dep_dis': 'MDD',
    'mean_hier_dis': 'MHD',
    'ratio_head_init_deps': 'HI/W',
}

# Feature tables to convert; each one is read, transformed and re-exported below.
files = [
    'D:/Desktop/features_郊游风景图.csv',
    'D:/Desktop/features_三毛流浪记.csv',
    'D:/Desktop/features_牛郎织女.csv',
]

# Columns left out of the export: the row number, the raw counts that the
# per-word ratios are derived from, and the head-final ratio removed in this version.
dropped_columns = ['No', 'num_sentences', 'num_clauses', 'num_words', 'ratio_head_final_deps']

for csv_path in files:
    data = pd.read_csv(csv_path)

    # Three extra per-word ratios, computed before the count columns are dropped.
    words = data['num_words']
    clauses = data['num_clauses']
    # num_clauses / mean_num_clauses_per_t_unit stands in for the T-unit count
    # (no T-unit count column is read here).
    t_units = clauses.div(data['mean_num_clauses_per_t_unit'])
    new_metrics = pd.DataFrame({
        'S/W': data['num_sentences'].div(words),
        'C/W': clauses.div(words),
        'T/W': t_units.div(words),
    })

    data = data.drop(columns=dropped_columns)

    # Split right after the date_subject column and splice the new ratios in between.
    subject_index = data.columns.get_loc('date_subject') + 1
    left = data.iloc[:, :subject_index]
    right = data.iloc[:, subject_index:]
    data = pd.concat([left, new_metrics, right], axis=1).rename(columns=new_column_names)

    # Write next to the input file; index=False keeps row numbers out of the CSV.
    output_file = csv_path.replace('.csv', '_v4.csv')
    data.to_csv(output_file, index=False)


In [None]:
# v5: drop HI/W
import pandas as pd

# Raw feature names (keys) and the short labels (values) used in the exported header.
new_column_names = {
    'ratio_unique_words': 'UW/W',
    'ratio_repeats': 'RP/W',
    'mean_num_clauses_per_sentence': 'C/S',
    'mean_num_t_units_per_sentence': 'T/S',
    'mean_num_verb_phrases_per_t_unit': 'VP/T',
    'mean_num_clauses_per_t_unit': 'C/T',
    'mean_length_sentences': 'MLS',
    'mean_length_clauses': 'MLC',
    'mean_length_t_units': 'MLT',
    'prop_noun_phrase': 'PNP',
    'prop_verb_phrase': 'PVP',
    'prop_adj_phrase': 'PAP',
    'prop_adv_phrase': 'PAdvP',
    'prop_prep_phrase': 'PPP',
    'mean_depth': 'MDN',
    'ratio_adv_mods': 'advmod',
    'ratio_adj_mods': 'amod',
    'ratio_nom_mods': 'nmod',
    'ratio_nom_subjs': 'nsubj',
    'ratio_cases': 'prep',
    'ratio_compounds': 'cmpd',
    'mean_dep_dis': 'MDD',
    'mean_hier_dis': 'MHD',
    'ratio_head_final_deps': 'HF/W',
}

# Feature tables to convert; each one is read, transformed and re-exported below.
files = [
    'D:/Desktop/features_郊游风景图.csv',
    'D:/Desktop/features_三毛流浪记.csv',
    'D:/Desktop/features_牛郎织女.csv',
]

# Columns left out of the export: the row number, the raw counts that the
# per-word ratios are derived from, and the head-initial ratio removed in this version.
dropped_columns = ['No', 'num_sentences', 'num_clauses', 'num_words', 'ratio_head_init_deps']

for csv_path in files:
    data = pd.read_csv(csv_path)

    # Three extra per-word ratios, computed before the count columns are dropped.
    words = data['num_words']
    clauses = data['num_clauses']
    # num_clauses / mean_num_clauses_per_t_unit stands in for the T-unit count
    # (no T-unit count column is read here).
    t_units = clauses.div(data['mean_num_clauses_per_t_unit'])
    new_metrics = pd.DataFrame({
        'S/W': data['num_sentences'].div(words),
        'C/W': clauses.div(words),
        'T/W': t_units.div(words),
    })

    data = data.drop(columns=dropped_columns)

    # Split right after the date_subject column and splice the new ratios in between.
    subject_index = data.columns.get_loc('date_subject') + 1
    left = data.iloc[:, :subject_index]
    right = data.iloc[:, subject_index:]
    data = pd.concat([left, new_metrics, right], axis=1).rename(columns=new_column_names)

    # Write next to the input file; index=False keeps row numbers out of the CSV.
    output_file = csv_path.replace('.csv', '_v5.csv')
    data.to_csv(output_file, index=False)


In [23]:
# v6: recompute the HI/W and HF/W values
import pandas as pd

# Raw feature names (keys) and the short labels (values) used in the exported header.
new_column_names = {
    'ratio_unique_words': 'UW/W',
    'ratio_repeats': 'RP/W',
    'mean_num_clauses_per_sentence': 'C/S',
    'mean_num_t_units_per_sentence': 'T/S',
    'mean_num_verb_phrases_per_t_unit': 'VP/T',
    'mean_num_clauses_per_t_unit': 'C/T',
    'mean_length_sentences': 'MLS',
    'mean_length_clauses': 'MLC',
    'mean_length_t_units': 'MLT',
    'prop_noun_phrase': 'PNP',
    'prop_verb_phrase': 'PVP',
    'prop_adj_phrase': 'PAP',
    'prop_adv_phrase': 'PAdvP',
    'prop_prep_phrase': 'PPP',
    'mean_depth': 'MDN',
    'ratio_adv_mods': 'advmod',
    'ratio_adj_mods': 'amod',
    'ratio_nom_mods': 'nmod',
    'ratio_nom_subjs': 'nsubj',
    'ratio_cases': 'prep',
    'ratio_compounds': 'cmpd',
    'mean_dep_dis': 'MDD',
    'mean_hier_dis': 'MHD',
    'ratio_head_init_deps': 'HI/W',
    'ratio_head_final_deps': 'HF/W',
}

# Feature tables to convert; each one is read, transformed and re-exported below.
files = [
    'D:/Desktop/features_郊游风景图.csv',
    'D:/Desktop/features_三毛流浪记.csv',
    'D:/Desktop/features_牛郎织女.csv',
]

# Columns left out of the export: the row number and the raw counts
# that the per-word ratios are derived from.
dropped_columns = ['No', 'num_sentences', 'num_clauses', 'num_words']

for csv_path in files:
    data = pd.read_csv(csv_path)

    words = data['num_words']
    sentences = data['num_sentences']
    clauses = data['num_clauses']

    # Denominator for the head-direction ratios: words minus sentences.
    num_dep = words - sentences

    # Turn the stored ratios back into integer counts, then take one relation per
    # sentence off the head-initial count before re-normalising by num_dep.
    # NOTE(review): presumably the source ratios counted each sentence's root
    # relation as head-initial -- confirm against the feature extractor.
    head_init = (data['ratio_head_init_deps'] * num_dep).round().astype(int) - sentences
    head_final = (data['ratio_head_final_deps'] * num_dep).round().astype(int)
    data['ratio_head_init_deps'] = head_init / num_dep
    data['ratio_head_final_deps'] = head_final / num_dep

    # Show the recomputed ratios for a visual check.
    print(data[['ratio_head_init_deps', 'ratio_head_final_deps']])

    # Three extra per-word ratios, computed before the count columns are dropped.
    # num_clauses / mean_num_clauses_per_t_unit stands in for the T-unit count
    # (no T-unit count column is read here).
    t_units = clauses.div(data['mean_num_clauses_per_t_unit'])
    new_metrics = pd.DataFrame({
        'S/W': sentences.div(words),
        'C/W': clauses.div(words),
        'T/W': t_units.div(words),
    })

    data = data.drop(columns=dropped_columns)

    # Split right after the date_subject column and splice the new ratios in between.
    subject_index = data.columns.get_loc('date_subject') + 1
    left = data.iloc[:, :subject_index]
    right = data.iloc[:, subject_index:]
    data = pd.concat([left, new_metrics, right], axis=1).rename(columns=new_column_names)

    # Write next to the input file; index=False keeps row numbers out of the CSV.
    output_file = csv_path.replace('.csv', '_v6.csv')
    data.to_csv(output_file, index=False)


    ratio_head_init_deps  ratio_head_final_deps
0               0.447154               0.552846
1               0.518987               0.481013
2               0.357488               0.642512
3               0.461538               0.538462
4               0.342857               0.657143
5               0.315508               0.684492
6               0.428571               0.571429
7               0.476440               0.523560
8               0.435000               0.565000
9               0.454545               0.545455
10              0.338583               0.661417
11              0.374640               0.625360
12              0.390625               0.609375
13              0.369427               0.630573
14              0.500000               0.500000
15              0.440000               0.560000
16              0.378378               0.621622
17              0.392308               0.607692
18              0.500000               0.500000
19              0.422535               0

In [24]:
# v7: recompute HI/W and HF/W so they are correct, then drop HI/W and keep only HF/W
import pandas as pd

# Raw feature names (keys) and the short labels (values) used in the exported header.
# The ratio_head_init_deps entry has no effect here: that column is dropped before renaming.
new_column_names = {
    'ratio_unique_words': 'UW/W',
    'ratio_repeats': 'RP/W',
    'mean_num_clauses_per_sentence': 'C/S',
    'mean_num_t_units_per_sentence': 'T/S',
    'mean_num_verb_phrases_per_t_unit': 'VP/T',
    'mean_num_clauses_per_t_unit': 'C/T',
    'mean_length_sentences': 'MLS',
    'mean_length_clauses': 'MLC',
    'mean_length_t_units': 'MLT',
    'prop_noun_phrase': 'PNP',
    'prop_verb_phrase': 'PVP',
    'prop_adj_phrase': 'PAP',
    'prop_adv_phrase': 'PAdvP',
    'prop_prep_phrase': 'PPP',
    'mean_depth': 'MDN',
    'ratio_adv_mods': 'advmod',
    'ratio_adj_mods': 'amod',
    'ratio_nom_mods': 'nmod',
    'ratio_nom_subjs': 'nsubj',
    'ratio_cases': 'prep',
    'ratio_compounds': 'cmpd',
    'mean_dep_dis': 'MDD',
    'mean_hier_dis': 'MHD',
    'ratio_head_init_deps': 'HI/W',
    'ratio_head_final_deps': 'HF/W',
}

# Feature tables to convert; each one is read, transformed and re-exported below.
files = [
    'D:/Desktop/features_郊游风景图.csv',
    'D:/Desktop/features_三毛流浪记.csv',
    'D:/Desktop/features_牛郎织女.csv',
]

# Columns left out of the export: the row number, the raw counts that the
# per-word ratios are derived from, and the head-initial ratio removed in this version.
dropped_columns = ['No', 'num_sentences', 'num_clauses', 'num_words', 'ratio_head_init_deps']

for csv_path in files:
    data = pd.read_csv(csv_path)

    words = data['num_words']
    sentences = data['num_sentences']
    clauses = data['num_clauses']

    # Denominator for the head-direction ratios: words minus sentences.
    num_dep = words - sentences

    # Turn the stored ratios back into integer counts, then take one relation per
    # sentence off the head-initial count before re-normalising by num_dep.
    # NOTE(review): presumably the source ratios counted each sentence's root
    # relation as head-initial -- confirm against the feature extractor.
    head_init = (data['ratio_head_init_deps'] * num_dep).round().astype(int) - sentences
    head_final = (data['ratio_head_final_deps'] * num_dep).round().astype(int)
    data['ratio_head_init_deps'] = head_init / num_dep
    data['ratio_head_final_deps'] = head_final / num_dep

    # The adjusted head-initial ratio is only shown here; it is dropped before export.
    print(data[['ratio_head_init_deps', 'ratio_head_final_deps']])

    # Three extra per-word ratios, computed before the count columns are dropped.
    # num_clauses / mean_num_clauses_per_t_unit stands in for the T-unit count
    # (no T-unit count column is read here).
    t_units = clauses.div(data['mean_num_clauses_per_t_unit'])
    new_metrics = pd.DataFrame({
        'S/W': sentences.div(words),
        'C/W': clauses.div(words),
        'T/W': t_units.div(words),
    })

    data = data.drop(columns=dropped_columns)

    # Split right after the date_subject column and splice the new ratios in between.
    subject_index = data.columns.get_loc('date_subject') + 1
    left = data.iloc[:, :subject_index]
    right = data.iloc[:, subject_index:]
    data = pd.concat([left, new_metrics, right], axis=1).rename(columns=new_column_names)

    # Write next to the input file; index=False keeps row numbers out of the CSV.
    output_file = csv_path.replace('.csv', '_v7.csv')
    data.to_csv(output_file, index=False)


    ratio_head_init_deps  ratio_head_final_deps
0               0.447154               0.552846
1               0.518987               0.481013
2               0.357488               0.642512
3               0.461538               0.538462
4               0.342857               0.657143
5               0.315508               0.684492
6               0.428571               0.571429
7               0.476440               0.523560
8               0.435000               0.565000
9               0.454545               0.545455
10              0.338583               0.661417
11              0.374640               0.625360
12              0.390625               0.609375
13              0.369427               0.630573
14              0.500000               0.500000
15              0.440000               0.560000
16              0.378378               0.621622
17              0.392308               0.607692
18              0.500000               0.500000
19              0.422535               0