In [1]:
from docx import Document
from docx.shared import RGBColor

def get_red_words(doc_path):
    doc = Document(doc_path)
    red_words = []
    for para in doc.paragraphs:
        for run in para.runs:
            if run.font.color.rgb == RGBColor(255, 0, 0):
                red_words.extend(run.text.split())
    return red_words

In [2]:
red_words = get_red_words("The Origins of Cetaceans.docx")
red_words

['cetaceans-whales', 'reconstruct', 'embed', 'exciting']

In [3]:
import re
import html
from urllib import parse
import requests

GOOGLE_TRANSLATE_URL = 'http://translate.google.com/m?q=%s&tl=%s&sl=%s'

def translate(text, to_language="zh-CN", text_language="en"):

    text = parse.quote(text)
    url = GOOGLE_TRANSLATE_URL % (text,to_language,text_language)
    response = requests.get(url)
    data = response.text
    expr = r'(?s)class="(?:t0|result-container)">(.*?)<'
    result = re.findall(expr, data)
    if (len(result) == 0):
        return ""

    return html.unescape(result[0])

In [4]:
# 用于存放翻译结果的字典
translations = {}

for word in red_words:
    # 调用 translate 方法进行翻译，源语言为英文('en')，目标语言为中文('zh-cn')
    translation = translate(word)

    # 将结果保存在字典中
    translations[word] = translation

print(translations)

{'cetaceans-whales': '鲸类-鲸鱼', 'reconstruct': '重建', 'embed': '嵌入', 'exciting': '令人兴奋的'}


In [5]:
# Define the function to add mapped string to red words
def add_mapped_string_to_red_words(paragraph):
    for run in paragraph.runs:
        if run.font.color and isinstance(run.font.color.rgb, RGBColor):
            color_value = run.font.color.rgb
            if color_value == RGBColor(255, 0, 0):  # Check if the color is red
                # Check if the word exists in the mapping dictionary
                if run.text.lower() in translations:
                    run.text = run.text + "(" + translations[run.text.lower()] + ")"

doc = Document("The Origins of Cetaceans.docx")
for paragraph in doc.paragraphs:
    add_mapped_string_to_red_words(paragraph)

In [6]:
doc.save("1.docx")

In [7]:
# 打开一个文件，如果不存在则创建
with open('output.md', 'w', encoding='utf-8') as f:
    # 写入markdown表格头
    f.write('| English | Chinese |\n')
    f.write('|---------|---------|\n')

    # 遍历字典，写入每一行内容
    for key, value in translations.items():
        f.write(f'| {key} | {value} |\n')

In [26]:
import pandas as pd

# Load the data from the Excel file
df = pd.read_excel("TOEFL.xlsx")

# Extract the words into a list
words_list = df.iloc[:, 0].str.lower().tolist()


In [27]:
df

Unnamed: 0,Word,Phonetic Transcription,Chinese
0,abandon,[ə'bændən],"vt. 放弃,沉溺n. 放任"
1,abashed,[ə'bæʃt],"adj. 1 (在人前) 感觉羞愧的,局促不安的,困窘的; 2. [因…]局促不安的"
2,abate,[ə'beit],"vt. 缓和,减弱,减少,废除vi. 缓和,减弱,减少"
3,abdicate,[æbdi'keit],vt. 放弃vi. 逊位
4,abduct,[æb'dʌkt],"vt. 诱拐,绑走"
...,...,...,...
4511,yield,[ji:ld],v. 出产; n. 产量，收益
4512,yogurt,['jɔgət],n. 酸奶(酪)
4513,zealous,['zeləs],a. 狂热的
4514,zinc,[ziŋk],n. 锌


In [24]:
df.dtype.fields

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [35]:
TOEFL = {}

# For each paragraph in the document
for para in doc.paragraphs:
    # For each run in the paragraph
    for run in para.runs:
        # Only consider runs that are not red
        if run.font.color.rgb != RGBColor(255,0,0):
            # Check each word in the run
            for word in run.text.split():
                # If the word (in lowercase) is in the list of words
                if word.lower() in words_list:
                    print(df[df.Word == word.lower()].Chinese.values)
                    TOEFL[word.lower()] = df[df.Word == word.lower()]['Chinese'].values[0]

TOEFL

['a. 明显的,明白的,显然的']
['n. 缺席; 缺乏，不存在']
['a. (成对的事物)后面的，在后的']
['v. n. 掩饰; 伪装']
['n. 陆地,地面,地界,土地,国土,地产vi. 登陆,登岸,到达vt. 使上岸,使']
['n. 住处']
['a. 功能的']
['n. 陆地,地面,地界,土地,国土,地产vi. 登陆,登岸,到达vt. 使上岸,使']
['a. 熄灭的,灭绝的,耗尽的']
['n. 舰队,水兵,海运业a. 海的,海产的,海运的,船舶的,海底的']
['n. 化石; a. 陈腐的; 化石的']
['n. 缝隙,缺口,间断vt. 打开缺口,造成缝隙vi. 豁开n. 间距n. 通用']
['n. 哺乳动物']
['ad. 最近']
['n. 陆地,地面,地界,土地,国土,地产vi. 登陆,登岸,到达vt. 使上岸,使']
['n. 哺乳动物']
['n. 化石; a. 陈腐的; 化石的']
['n. 化石; a. 陈腐的; 化石的']
['a. 古代的; 古老的']
['n. 化石; a. 陈腐的; 化石的']
['a. 完全的,熟练的,完成的vt. 完成,完工,使圆满']
['n. 颅骨，头脑']
['a. 熄灭的,灭绝的,耗尽的']
['n. 化石; a. 陈腐的; 化石的']
['a. 宝贵的,珍贵的,过于精致的,珍爱的']
['n. 颅骨，头脑']
['n. 空地; 空间; 太空; v. 隔开']
['v. 似乎; a. 可靠的; 健康的; 合理的']
['v. 似乎; a. 可靠的; 健康的; 合理的']
['n. 陆地,地面,地界,土地,国土,地产vi. 登陆,登岸,到达vt. 使上岸,使']
['n. 颅骨，头脑']
['n. 改编,适应,改编成的作品']
['a. 熄灭的,灭绝的,耗尽的']
['a. 浅的']
['n. 主修课,成年人,陆军少校a. 主要的,较多的,严重的,成年的vi. 主修']
['n. 沙漠a. 沙漠的,不毛的vt. 放弃,遗弃,逃跑vi. 逃掉n. 应得的赏']
['a. 完全的,熟练的,完成的vt. 完成,完工,使圆满']
['a. (成对的事物)后面的，在后的']
['a. 很少的,微小的']
['n. 舰队,水兵,海运业a. 海的,海产的,海运的,船舶的,海底的']
['a. (成对的事

{'obvious': 'a. 明显的,明白的,显然的',
 'absence': 'n. 缺席; 缺乏，不存在',
 'hind': 'a. (成对的事物)后面的，在后的',
 'disguise': 'v. n. 掩饰; 伪装',
 'land': 'n. 陆地,地面,地界,土地,国土,地产vi. 登陆,登岸,到达vt. 使上岸,使',
 'dwelling': 'n. 住处',
 'functional': 'a. 功能的',
 'extinct': 'a. 熄灭的,灭绝的,耗尽的',
 'marine': 'n. 舰队,水兵,海运业a. 海的,海产的,海运的,船舶的,海底的',
 'fossil': 'n. 化石; a. 陈腐的; 化石的',
 'gap': 'n. 缝隙,缺口,间断vt. 打开缺口,造成缝隙vi. 豁开n. 间距n. 通用',
 'mammal': 'n. 哺乳动物',
 'recently': 'ad. 最近',
 'mammals': 'n. 哺乳动物',
 'ancient': 'a. 古代的; 古老的',
 'complete': 'a. 完全的,熟练的,完成的vt. 完成,完工,使圆满',
 'skull': 'n. 颅骨，头脑',
 'precious': 'a. 宝贵的,珍贵的,过于精致的,珍爱的',
 'space': 'n. 空地; 空间; 太空; v. 隔开',
 'sound': 'v. 似乎; a. 可靠的; 健康的; 合理的',
 'adaptation': 'n. 改编,适应,改编成的作品',
 'shallow': 'a. 浅的',
 'major': 'n. 主修课,成年人,陆军少校a. 主要的,较多的,严重的,成年的vi. 主修',
 'desert': 'n. 沙漠a. 沙漠的,不毛的vt. 放弃,遗弃,逃跑vi. 逃掉n. 应得的赏',
 'tiny': 'a. 很少的,微小的',
 'even': 'a. 平坦的,偶数的,相等的,均匀的,连贯的,均等的,公平的,荹玫?平静的,',
 'portion': 'n. 部分,一份,命运,嫁妆vt. 将. . . 分配,分配,给. . . 嫁妆',
 'long': 'a. 长的,长久的,冗长的,做多头的vi. 渴望,热望,极想ad. 长久,始终n.',
 '

In [10]:
def extract_bold_sentences(docx_file):
    bold_sentences = []
    doc = Document(docx_file)

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if text:
            for run in paragraph.runs:
                if run.bold:
                    bold_sentences.append(text)
                    break

    return bold_sentences


In [11]:
docx_file = "The Origins of Cetaceans.docx"
bold_sentences = extract_bold_sentences(docx_file)

print("含有加粗单词的句子：")
for sentence in bold_sentences:
    print(sentence)

含有加粗单词的句子：
It should be obvious that cetaceans-whales, porpoises, and dolphins-are mammals. They breathe through lungs, not through gills, and give birth to live young. Their streamlined bodies, the absence of hind legs, and the presence of a fluke1 and blowhole2 cannot disguise their affinities with land dwelling mammals. However, unlike the cases of sea otters and pinnipeds (seals, sea lions, and walruses, whose limbs are functional both on land and at sea), it is not easy to envision what the first whales looked like. Extinct but already fully marine cetaceans are known from the fossil record. How was the gap between a walking mammal and a swimming whale bridged? Missing until recently were fossils clearly intermediate, or transitional, between land mammals and cetaceans.
The fossil consists of a complete skull of an archaeocyte, an extinct group of ancestors of modern cetaceans. Although limited to a skull, the Pakicetus fossil provides precious details on the origins of cetaceans.

In [23]:
# 打开一个文件，如果不存在则创建
with open('output.md', 'w', encoding='utf-8') as f:
    # 写入markdown表格头
    f.write('| English | Chinese |\n')
    f.write('|---------|---------|\n')

    # 遍历字典，写入每一行内容
    for key, value in TOEFL.items():
        f.write(f'| {key} | {value} |\n')