In [1]:
import json
from collections import Counter

import pandas as pd
from thefuzz import fuzz
from openpyxl import load_workbook

In [2]:
# Location of one LUAD STAR gene-count file (GDC file id + file name), assembled with an f-string.
dataset = "LUAD"
file_id = "0052ae83-7ae5-470a-a125-5cd94a9fa9e9"
file_name = "a6a6b9c6-9db7-42b3-a09f-770b7e126fbb.rna_seq.augmented_star_gene_counts.tsv"
sample_path = f"./TCGA_GDC/{dataset}/samples_info/{file_id}/{file_name}"

# Load and clean the table in one chain:
#   * skiprows=1 skips the first line of the file and iloc[4:] drops the first four
#     data rows (presumably a comment header and STAR's N_* summary rows -- TODO confirm);
#   * gene ids containing "_PAR_Y" are removed to avoid duplicated genes (per the
#     original note, these are usually Y-chromosome pseudoautosomal-region entries);
#   * the index is renumbered from 0 afterwards.
df = (
    pd.read_csv(sample_path, sep="\t", skiprows=1)
    .iloc[4:, :]
    .loc[lambda frame: ~frame["gene_id"].str.contains("_PAR_Y")]
    .reset_index(drop=True)
)

# Map gene name -> Ensembl gene id. If a gene name occurs more than once, the last
# row wins. Keys are emitted in sorted order and everything from the first "." of
# the id onward (the version suffix) is dropped.
tmp_dict = dict(zip(df["gene_name"], df["gene_id"]))
gene_ensg_id_dict = {name: ensg_id.split(".")[0] for name, ensg_id in sorted(tmp_dict.items())}

# Persist the mapping as pretty-printed JSON.
with open("./data/gene_ensg_id_dict.json", "w") as f:
    json.dump(gene_ensg_id_dict, f, ensure_ascii=False, indent=2)

**这段 Python 代码的主要功能是处理一个包含基因信息的 JSON 文件，并检测重复的基因。**

1. 首先，使用`with open("./data/genes.2023-12-31.json", "r") as f:` 打开名为 "genes.2023-12-31.json" 的 JSON 文件，并将文件内容加载到变量 `data` 中。这个文件应该包含一个基因信息的列表，每个基因信息是一个字典。

2. 然后，使用 `print(f"Gene list length: {len(data)}")` 打印出基因列表的长度，即基因字典的数量。

3. 接下来，使用列表推导式 `[dic["symbol"] for dic in data]` 从 `data` 中提取出每个字典的 "symbol" 键对应的值，并将结果存储在变量 `genes` 中。

4. 使用 `print(Counter(genes).most_common()[:15])` 打印出出现次数最多的前 15 个基因及其出现次数。

5. 使用集合推导式 `{dic["symbol"] for dic in data}` 从 `data` 中提取出每个字典的 "symbol" 键对应的值，并将结果存储在变量 `genes` 中。这里使用了集合推导式，所以 `genes` 中的元素是唯一的，不会再有重复的基因。

6. 使用 `print(f"Gene set length: {len(genes)}")` 打印出基因集合的长度。由于集合中的元素是唯一的，所以如果原始数据中有重复的基因，那么这里的基因集合长度会小于基因列表的长度。

7. 使用 `[gene for gene in genes]` 将基因集合转换为列表，并将结果存储在变量 `genes` 中。

8. 使用 `genes.sort()` 对基因列表进行排序。

9. 最后，使用 `with open("./data/query_genes.txt", "w") as f:` 打开名为 "query_genes.txt" 的文件，并通过 `f.write("\n".join(genes))` 将排序后的基因一次性写入：每个基因占一行，基因之间以换行符分隔，最后一个基因之后不再添加换行符。


In [3]:
# Load the gene records. The TCGA gene set contains 20908 entries in total.
with open("./data/genes.2023-12-31.json", "r") as f:
    data = json.load(f)

# Look for duplicated gene symbols: print the number of records, then the 15 most
# frequent symbols together with their counts.
print(f"Gene list length: {len(data)}")
genes = [dic["symbol"] for dic in data]
print(Counter(genes).most_common(15))

# Author's note on the records: for some genes the "cytoband" value is a list with
# several elements (not just one), and some gene dicts carry one extra key,
# "is_cancer_gene_census".
# De-duplicate the symbols through a set and sort them.
# Expected size: 20908 - 2 - 6 * 1 = 20900.
genes = sorted({dic["symbol"] for dic in data})
print(f"Gene set length: {len(genes)}")

# One symbol per line, with no newline after the last symbol.
with open("./data/query_genes.txt", "w") as f:
    f.write("\n".join(genes))

Gene list length: 20908
[('Y_RNA', 3), ('SFTA3', 2), ('POLR2J3', 2), ('SMIM40', 2), ('PDE11A', 2), ('ACTL10', 2), ('MATR3', 2), ('KCNK3', 1), ('KRTAP5-9', 1), ('SHC4', 1), ('NELL2', 1), ('ACACB', 1), ('RFWD3', 1), ('CALML5', 1), ('PCDH7', 1)]
Gene set length: 20900


**这段代码的主要功能是从一个文本文件和一个 JSON 文件中读取数据，然后生成一个新的字典，并将这个字典保存为一个 JSON 文件。**

1. 首先，使用 `with open("./data/TCGA_Genes.txt", "r") as f:` 打开名为 "TCGA_Genes.txt" 的文件，并将文件中的每一行（即每一个基因）读取出来，存储在集合 `tcga_genes` 中。这里使用了集合推导式，所以 `tcga_genes` 中的元素是唯一的，不会有重复的基因。

2. 然后，使用 `with open("./data/genes.2023-12-31.json", "r") as f:` 打开名为 "genes.2023-12-31.json" 的文件，并使用 `json.load(f)` 将文件中的数据加载出来，存储在变量 `data` 中。

3. 接下来，使用字典推导式 `{dic["symbol"]: dic["gene_id"] for dic in data if dic["symbol"] in tcga_genes}` 生成一个新的字典 `gene_to_ensg_id`，其中键是基因的符号，值是基因的 ID。这个字典只包含那些在 `tcga_genes` 中的基因。

4. 然后，定义了一个临时字典 `tmp_dict`，并使用 `gene_to_ensg_id.update(tmp_dict)` 将这个临时字典的内容更新到 `gene_to_ensg_id` 中。

5. 将 "TCGA_Genes.txt" 文件中的每一行（即每一个基因）按文件中的原始顺序存储在列表 `tcga_genes` 中；集合不保留顺序，因此最终输出时使用这个列表。

6. 使用字典推导式 `{gene: gene_to_ensg_id[gene] for gene in tcga_genes}` 生成一个新的字典 `save_gene_to_ensg_id`，其中键是基因的名称，值是基因的 ID。

7. 最后，使用 `json.dumps(save_gene_to_ensg_id, ensure_ascii=False, indent=2)` 将 `save_gene_to_ensg_id` 转换为 JSON 格式的字符串，并使用 `with open("./data/gene_ensg_id_dict.json", "w") as f:` 将这个字符串写入到名为 "gene_ensg_id_dict.json" 的文件中。

In [4]:
# Read the TCGA gene list once, keeping the file order. Blank lines (for example
# the empty string produced by a trailing newline) are skipped, otherwise they
# would be treated as a gene named "" and fail the lookup further down.
with open("./data/TCGA_Genes.txt", "r") as f:
    tcga_genes = [line.strip() for line in f if line.strip()]
# Set view of the same genes for fast membership tests.
tcga_gene_set = set(tcga_genes)

# Load the gene records. The TCGA gene set contains 20908 entries in total.
with open("./data/genes.2023-12-31.json", "r") as f:
    data = json.load(f)

# Build the symbol -> Ensembl ID mapping, restricted to the TCGA genes.
# When a symbol occurs in several records, the last record wins.
gene_to_ensg_id = {dic["symbol"]: dic["gene_id"] for dic in data if dic["symbol"] in tcga_gene_set}
# Manually pin the Ensembl ID for selected gene symbols, overriding whatever the
# comprehension above picked for them.
tmp_dict = {"SFTA3": "ENSG00000229415", "POLR2J3": "ENSG00000168255", "SMIM40": "ENSG00000286920",
            "PDE11A": "ENSG00000128655", "ACTL10": "ENSG00000288649", "MATR3": "ENSG00000015479"
           }
gene_to_ensg_id.update(tmp_dict)

# Fail with one readable message if any TCGA gene has no Ensembl ID, instead of a
# KeyError on the first missing symbol only.
missing_genes = [gene for gene in tcga_genes if gene not in gene_to_ensg_id]
if missing_genes:
    raise KeyError(f"No Ensembl ID found for {len(missing_genes)} TCGA gene(s): {missing_genes}")

# Final mapping, in the order of the TCGA gene list.
save_gene_to_ensg_id = {gene: gene_to_ensg_id[gene] for gene in tcga_genes}

# Save the mapping as pretty-printed JSON.
# NOTE(review): this is the same path the LUAD-sample cell near the top of the
# notebook writes to, so that earlier file is replaced here.
json_str = json.dumps(save_gene_to_ensg_id, ensure_ascii=False, indent=2)
with open("./data/gene_ensg_id_dict.json", "w") as f:
    f.write(json_str)

In [5]:
# Exploration: print a few summaries whose trailing attribution is NOT introduced
# by a bracket preceded by "." or a space.
with open("./data/genes_info.json", "r") as f:
    data = json.load(f)

gene_info_dict = {}
count_ = 0
bracket_markers = (".[", ". [", " [")
for gene_id, info in data.items():
    summary_info = info["summary_info"]
    # Summaries containing one of the bracket markers are not of interest here.
    if any(marker in summary_info for marker in bracket_markers):
        continue

    if "[provided" in summary_info or " provided " in summary_info:
        # Attribution without the usual separator before it: count and show it.
        count_ += 1
        print(f"Gene ID: {gene_id}, Summary Info: {info['summary_info']}")
    elif summary_info == "NaN":
        # The literal string "NaN" is skipped silently.
        continue
    else:
        # Anything else is an unexpected pattern; flag it.
        print("还有其他情况吗？")

    # Stop once more than 10 matching summaries have been printed.
    if count_ > 10:
        break

Gene ID: 43, Summary Info: Acetylcholinesterase hydrolyzes the neurotransmitter, acetylcholine at neuromuscular junctions and brain cholinergic synapses, and thus terminates signal transmission. It is also found on the red blood cell membranes, where it constitutes the Yt blood group antigen. Acetylcholinesterase exists in multiple molecular forms which possess similar catalytic properties, but differ in their oligomeric assembly and mode of cell attachment to the cell surface. It is encoded by the single ACHE gene, and the structural diversity in the gene products arises from alternative mRNA splicing, and post-translational associations of catalytic and structural subunits. The major form of acetylcholinesterase found in brain, muscle and other tissues is the hydrophilic species, which forms disulfide-linked oligomers with collagenous, or lipid-containing structural subunits. The other, alternatively spliced form, expressed primarily in the erythroid tissues, differs at the C-termina

In [6]:
def check_period(your_str):
    """Return ``your_str`` guaranteed to end with an ASCII period.

    Args:
        your_str: Text to check.

    Returns:
        ``your_str`` unchanged when it already ends with ".", otherwise
        ``your_str`` with "." appended. An empty string yields "."
        (indexing ``your_str[-1]`` would raise ``IndexError`` on it).
    """
    if your_str.endswith("."):
        return your_str
    return your_str + "."


# Example summary whose trailing attribution has no opening bracket.
summary = """This gene encodes an enzyme that plays a role in the recovery of retinal photoreceptors from photobleaching. This enzyme promotes the activity of retinal guanylyl cyclase-1 (GC1) at low calcium concentrations and inhibits GC1 at high calcium concentrations. Mutations in this gene can cause cone dystrophy 3 and code-rod dystrophy 14. provided by RefSeq, Jul 2020]"""
# Find where the last " provided " marker starts.
marker = " provided "
last_index = summary.rfind(marker)
# Keep only the text before the marker and make sure it ends with a period.
trimmed = summary[:last_index]
summary = check_period(trimmed)
# Show the cleaned summary.
print(summary)

This gene encodes an enzyme that plays a role in the recovery of retinal photoreceptors from photobleaching. This enzyme promotes the activity of retinal guanylyl cyclase-1 (GC1) at low calcium concentrations and inhibits GC1 at high calcium concentrations. Mutations in this gene can cause cone dystrophy 3 and code-rod dystrophy 14.


一句话总结：thefuzz 是一个用于模糊字符串匹配的 Python 库，可以通过其 fuzz 和 process 模块来计算字符串相似度并提取最匹配的字符串。

要点：

- 使用 `pip install thefuzz` 命令安装 thefuzz 库。这个模糊字符串匹配工具基于 Levenshtein Distance 算法（也称为 Edit Distance，即编辑距离算法）计算字符串之间的差异：该算法计算把一个字符串转换成另一个字符串所需的最少编辑操作次数，编辑操作包括字符替换、字符插入和字符删除；编辑距离越小，字符串相似度越大。

- 使用 `fuzz` 模块中的函数可以计算字符串之间的相似度，如 ratio()、partial_ratio() 和 token_sort_ratio()。这几个 ratio () 函数（方法）得到的结果都是数字，如果需要获得匹配度最高的字符串结果，仍需根据数据类型选择不同函数，并提取结果。

- 尽管可以使用此方法量化文本匹配程度，但提取匹配结果并不方便，因此引入了 process 模块。process 模块可以帮助我们在模糊字符串逻辑的帮助下有效地匹配或提取字符串。

- thefuzz 库的旧版本名为 fuzzywuzzy，该旧版本现已过时且不再维护。

Github 地址：https://github.com/seatgeek/thefuzz

在 Python 中，匹配模糊的字符串：https://cloud.tencent.com/developer/article/2336720


使用 `thefuzz` 库进行简单的 Python 模糊字符串匹配测试。

In [7]:
# Each pair is (raw summary with a trailing "[provided by RefSeq, ...]" attribution,
# the same summary with the attribution replaced by a final period).
summary_pairs = [
    (
        """The protein encoded by this gene belongs to the pancreatic ribonuclease family, a subset of the ribonuclease A superfamily. The protein exhibits antimicrobial activity against pathogenic bacteria [provided by RefSeq, Oct 2014]""",
        """The protein encoded by this gene belongs to the pancreatic ribonuclease family, a subset of the ribonuclease A superfamily. The protein exhibits antimicrobial activity against pathogenic bacteria.""",
    ),
    (
        """This gene encodes a member of the glycosyltransferase family. The specific function of this protein has not been determined. Alternative splicing results in multiple transcript variants of this gene [provided by RefSeq, May 2013]""",
        """This gene encodes a member of the glycosyltransferase family. The specific function of this protein has not been determined. Alternative splicing results in multiple transcript variants of this gene.""",
    ),
]

# Print the partial-ratio similarity score of every pair.
for summary, new_summary in summary_pairs:
    print(fuzz.partial_ratio(summary, new_summary))

100
100


要在 Python 中查找子串最后一次出现的位置，可以使用 `rfind()` 方法。`rfind()` 返回子串在字符串中最后一次（最靠右）出现的起始索引；如果子串不存在，`rfind()` 将返回 -1。

In [8]:
# Reload the raw gene annotations from disk.
with open("./data/genes_info.json", "r") as genes_info_file:
    data = json.loads(genes_info_file.read())

def check_period(your_str):
    """Return ``your_str`` guaranteed to end with an ASCII period.

    Args:
        your_str: Text to check.

    Returns:
        ``your_str`` unchanged when it already ends with ".", otherwise
        ``your_str`` with "." appended. An empty string yields "."
        (indexing ``your_str[-1]`` would raise ``IndexError`` on it).
    """
    if your_str.endswith("."):
        return your_str
    return your_str + "."


def check_correctness(gene_id, summary, new_summary):
    """Print a review report when trimming a summary looks suspicious.

    Two independent checks are made, and each one that fires prints the same
    report (so the report appears twice when both fire):

    1. more than 8 space-separated words were removed from ``summary``;
    2. ``fuzz.partial_ratio(summary, new_summary)`` is below 99.9.

    Args:
        gene_id: Identifier shown in the report.
        summary: Original summary text.
        new_summary: Summary text after trimming.

    Returns:
        None. The function only prints.
    """
    reduce_words = len(summary.split(" ")) - len(new_summary.split(" "))
    separator = "-" * 88

    def print_report():
        # Separator, banner, gene id, both texts, removed-word count, separator.
        report_lines = (
            separator,
            "FuzzyWuzzy：简单易用的字符串模糊匹配工具",
            f"Gene ID: {gene_id}",
            summary,
            new_summary,
            reduce_words,
            separator,
        )
        for line in report_lines:
            print(line)

    if reduce_words > 8:
        print_report()

    if fuzz.partial_ratio(summary, new_summary) < 99.9:
        print_report()


# Markers that can introduce the trailing attribution (e.g. "[provided by RefSeq, ...]"),
# in the order they are tried. The first marker found in a summary is the one used.
attribution_markers = (".[", ". [", " [", "[provided", " provided ")

gene_info_dict = {}
# Gene ids whose summary fits none of the known patterns; reported after the loop.
unmatched_gene_ids = []
for gene_id, info in data.items():
    summary = info["summary_info"]
    marker = next((m for m in attribution_markers if m in summary), None)
    if marker is not None:
        # Cut at the LAST occurrence of the marker, then make sure the text
        # ends with a period.
        last_index = summary.rfind(marker)
        new_summary = check_period(summary[:last_index])
        # Print a report if the trimmed text differs suspiciously from the original.
        check_correctness(gene_id, summary, new_summary)
    elif summary == "NaN":
        # The literal string "NaN" is kept unchanged.
        new_summary = summary
    else:
        # Unknown pattern: still excluded from the output, but no longer silently.
        unmatched_gene_ids.append(gene_id)
        continue
    # Store an updated copy so the loaded `data` itself is not modified.
    gene_info_dict[gene_id] = {**info, "summary_info": new_summary}

if unmatched_gene_ids:
    print(
        f"Skipped {len(unmatched_gene_ids)} gene(s) whose summary matched no known pattern; "
        f"first ids: {unmatched_gene_ids[:20]}"
    )

# Save the cleaned records as pretty-printed JSON in the current working directory.
# NOTE(review): ensure_ascii=False writes non-ASCII characters as-is and open() uses
# the platform default encoding here -- consider passing encoding="utf-8" explicitly.
json_str = json.dumps(gene_info_dict, ensure_ascii=False, indent=2)
with open("genes_info_clean.json", "w") as f:
    f.write(json_str)

----------------------------------------------------------------------------------------
FuzzyWuzzy：简单易用的字符串模糊匹配工具
Gene ID: 53827
This gene encodes a member of a family of small membrane proteins that share a 35-amino acid signature sequence domain, beginning with the sequence PFXYD and containing 7 invariant and 6 highly conserved amino acids. The approved human gene nomenclature for the family is FXYD-domain containing ion transport regulator. Mouse FXYD5 has been termed RIC (Related to Ion Channel). FXYD2, also known as the gamma subunit of the Na,K-ATPase, regulates the properties of that enzyme. FXYD1 (phospholemman), FXYD2 (gamma), FXYD3 (MAT-8), FXYD4 (CHIF), and FXYD5 (RIC) have been shown to induce channel activity in experimental expression systems. Transmembrane topology has been established for two family members (FXYD1 and FXYD2), with the N-terminus extracellular and the C-terminus on the cytoplasmic side of the membrane. This gene product, FXYD5, is a glycoprotein that f

从一个 Excel 文件中读取数据，并提取特定列中的超链接，然后将这些超链接添加到原始数据框中作为新的一列。

In [9]:
# COSMIC - https://cancer.sanger.ac.uk/cosmic/download
# Expert Curation of Genes - https://cancer.sanger.ac.uk/cosmic/curation
# pandas provides the cell values; openpyxl is needed as well because the
# hyperlink targets are only reachable through the workbook's cell objects.
# NOTE(review): pandas reads the first sheet by default while openpyxl opens the
# sheet named "Sheet1" -- this assumes they are the same sheet.
df = pd.read_excel("./data/Expert_Curation_of_Genes.xlsx")
wb = load_workbook("./data/Expert_Curation_of_Genes.xlsx")
sheet = wb["Sheet1"]

hyperlinks = []
# Row 1 holds the column names; data starts at row 2.
for row in range(2, len(df) + 2):
    # Hyperlink attached to the cell in column 1. A cell without a hyperlink
    # yields None instead of raising AttributeError on `.target`.
    link = sheet.cell(row=row, column=1).hyperlink
    hyperlinks.append(link.target if link is not None else None)

# Add the link targets as a new column and display the frame.
df["Hyperlinks"] = hyperlinks
df

Unnamed: 0,Genes,Samples,Mutations,Papers,Hyperlinks
0,ABL1,40329,844,913,https://cancer.sanger.ac.uk/cosmic/gene/analys...
1,ACVR1,14329,124,183,https://cancer.sanger.ac.uk/cosmic/gene/analys...
2,ACVR1B,3787,58,166,https://cancer.sanger.ac.uk/cosmic/gene/analys...
3,ACVR2A,4905,332,234,https://cancer.sanger.ac.uk/cosmic/gene/analys...
4,AKT1,63618,1533,1238,https://cancer.sanger.ac.uk/cosmic/gene/analys...
...,...,...,...,...,...
267,VHL,38699,2984,903,https://cancer.sanger.ac.uk/cosmic/gene/analys...
268,WT1,39330,1493,784,https://cancer.sanger.ac.uk/cosmic/gene/analys...
269,XPO1,20593,351,338,https://cancer.sanger.ac.uk/cosmic/gene/analys...
270,ZFHX3,14801,725,346,https://cancer.sanger.ac.uk/cosmic/gene/analys...


In [10]:
# Write the table (now including the "Hyperlinks" column) to the current working
# directory. The `encoding` argument is not passed: `DataFrame.to_excel` dropped it
# in pandas 2.0, where supplying it raises TypeError.
df.to_excel("Expert_Curation_of_Genes.xlsx", index=False)