读取clinvar数据

In [None]:
import pandas as pd

# Load the ClinVar variant_summary table (tab-separated).
file_path = r'D:\data\clinvar\variant_summary.txt\variant_summary.txt'
# Chromosome holds both numeric ('1'..'22') and text ('X', 'Y', 'MT') labels.
# Pin it to str so chunked type inference cannot yield a mix of int and str
# values, which would break the string-keyed chromosome lookups done later.
df = pd.read_csv(file_path, delimiter='\t', dtype={'Chromosome': str})

In [None]:
# Display the column labels of the loaded table.
df.columns

In [None]:
# Show the ten most frequent ClinicalSignificance values.
df['ClinicalSignificance'].value_counts().head(10)

In [None]:
# ClinicalSignificance categories to tally, in plotting order.
pathogenicity_levels = ['Benign', 'Benign/Likely benign', 'Likely benign', 'Likely pathogenic', 'Pathogenic']
# Count once up front; the tally does not change between levels.
significance_counts = df['ClinicalSignificance'].value_counts()
# Strict lookup: a level that is absent from the data raises KeyError.
variants_by_pathogenicity = [significance_counts[patho] for patho in pathogenicity_levels]
variants_by_pathogenicity

In [None]:
# Display how many rows fall on each Chromosome value.
df['Chromosome'].value_counts()

In [None]:
# Chromosome labels in plotting order: 1..22 followed by X, Y and MT.
chrs = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
variants_chr_counts = df['Chromosome'].value_counts()
variants_by_chr = []
# `chrom` rather than `chr`, so the builtin chr() is not shadowed.
for chrom in chrs:
    print(chrom)
    # Strict lookup: a chromosome with no rows raises KeyError.
    variants_by_chr.append(variants_chr_counts[chrom])
variants_by_chr

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side bar charts: counts per pathogenicity level (left) and
# counts per chromosome (right), saved to disk and then shown.
fig, (ax_patho, ax_chr) = plt.subplots(1, 2, figsize=(20, 3))
ax_patho.bar(pathogenicity_levels, variants_by_pathogenicity)
ax_chr.bar(chrs, variants_by_chr)
fig.savefig('../img/data_desc/raw_data_description.png')
plt.show()

In [None]:
# Display how many rows there are of each variant Type.
df['Type'].value_counts()

In [None]:
import pandas as pd

# Reload the raw ClinVar table, keep only the rows usable for modelling,
# attach a binary label and write the result to ../data/variants.csv.
file_path = r'D:\data\clinvar\variant_summary.txt\variant_summary.txt'
# Pin Chromosome to str so numeric and text labels cannot end up as a mix of
# int and str values after chunked type inference.
df = pd.read_csv(file_path, delimiter='\t', dtype={'Chromosome': str})

# Same conditions as the original step-by-step filters, combined into one mask.
keep = (
    (df['Type'] == 'single nucleotide variant')
    & df['ClinicalSignificance'].isin(['Pathogenic', 'Benign'])
    & (df['Assembly'] == 'GRCh38')
    & (df['Chromosome'] != 'MT')
    & (df['ReferenceAlleleVCF'] != 'na')
)
# Explicit copy so that adding a column below writes to an independent frame.
df = df[keep].copy()

# label: 1 for Pathogenic, 0 otherwise (only Benign remains after filtering).
# Assigned in a single step instead of chained indexing
# (df['label'][mask] = ...), which can write to a temporary copy and is a
# no-op under pandas copy-on-write.
df['label'] = (df['ClinicalSignificance'] == 'Pathogenic').astype('int64')
df.to_csv('../data/variants.csv', index=False)

获取突变位点周围序列

In [None]:
import pandas as pd

# For every variant, build the reference and alternate sequence windows at
# several flank sizes: <flank> bases on each side of the variant position,
# padded with 'N' where the chromosome sequence runs out.
surrounding_lens = [50, 100, 300, 500, 1000]
# Read Chromosome as str so the group keys are uniform strings.
df = pd.read_csv('../data/variants.csv', dtype={'Chromosome': str})

# Column name -> {row index: sequence}. Filled in the loop and attached to df
# in one pass at the end, instead of growing df cell by cell through df.loc.
seq_columns = {}
for surrounding_len in surrounding_lens:
    seq_columns['seq_{}_ref'.format(surrounding_len)] = {}
    seq_columns['seq_{}_alt'.format(surrounding_len)] = {}

# `chrom` rather than `chr`, so the builtin chr() is not shadowed.
for chrom, subset in df.groupby('Chromosome'):
    print(chrom)
    with open(r'../data/ucsc/hg38/chr{}.txt'.format(chrom)) as f:
        # NOTE(review): only the first line is read, which assumes each file
        # holds the whole chromosome on a single line with no header - confirm.
        # The trailing newline is stripped so it can never end up in a window.
        seq = f.readline().rstrip('\n')

    variant_fields = zip(
        subset.index,
        subset['PositionVCF'],
        subset['ReferenceAlleleVCF'],
        subset['AlternateAlleleVCF'],
    )
    for index, position, ref, alt in variant_fields:
        # The position is treated as 1-based: the variant base is
        # seq[point_index - 1].
        point_index = int(position)
        for surrounding_len in surrounding_lens:
            # Clamp at 0: a negative start would be read as an offset from the
            # end of seq and silently drop the real upstream bases.
            left_start = max(0, point_index - 1 - surrounding_len)
            left = seq[left_start:point_index - 1].rjust(surrounding_len, 'N')
            right = seq[point_index:point_index + surrounding_len].ljust(surrounding_len, 'N')
            ref_col = 'seq_{}_ref'.format(surrounding_len)
            alt_col = 'seq_{}_alt'.format(surrounding_len)
            seq_columns[ref_col][index] = ''.join([left, ref, right]).upper()
            # The alternate window must carry the alternate allele; the
            # previous code stored the reference window in both columns.
            seq_columns[alt_col][index] = ''.join([left, alt, right]).upper()

# Attach the new columns; values align on the row index, and rows that were
# not part of any chromosome group are left as NaN.
for column, values in seq_columns.items():
    df[column] = pd.Series(values, dtype=object)

In [None]:
# Write the current df to ../data/variants_with_seq.csv without the index column.
df.to_csv('../data/variants_with_seq.csv', index=False)

In [None]:
import pandas as pd

# Reload the saved table and display its column labels.
df = pd.read_csv("../data/variants_with_seq.csv")
df.columns

In [None]:
# Keep only the columns used for modelling: variant identity, the label, and
# the ref/alt sequence window for each flank size.
model_meta_cols = ['Chromosome', 'ReferenceAlleleVCF', 'AlternateAlleleVCF', 'Start', 'Stop', 'label']
model_seq_cols = [
    'seq_{}_{}'.format(flank, allele)
    for flank in (50, 100, 300, 500, 1000)
    for allele in ('ref', 'alt')
]
df_model = df[model_meta_cols + model_seq_cols]

In [None]:
# Write the modelling subset to ../data/variants_model.csv without the index column.
df_model.to_csv('../data/variants_model.csv', index=False)

In [None]:
# Drop rows whose reference allele is the literal string 'na'.
has_reference_allele = df['ReferenceAlleleVCF'].ne('na')
df = df.loc[has_reference_allele]

In [None]:
# Overwrite ../data/variants_with_seq.csv with the current df (no index column).
df.to_csv('../data/variants_with_seq.csv', index=False)

In [None]:
import pandas as pd

# Reload the filtered table. Chromosome is pinned to str so the string-keyed
# per-chromosome lookups cannot miss rows that were inferred as integers.
df = pd.read_csv('../data/variants_with_seq.csv', dtype={'Chromosome': str})

In [None]:
# ClinicalSignificance categories to tally, in plotting order.
pathogenicity_levels = ['Benign', 'Pathogenic']
# Count once up front; the tally does not change between levels.
significance_counts = df['ClinicalSignificance'].value_counts()
variants_by_pathogenicity = []
for patho in pathogenicity_levels:
    print(patho)
    # Strict lookup: a level that is absent from the data raises KeyError.
    variants_by_pathogenicity.append(significance_counts[patho])
variants_by_pathogenicity

In [None]:
# Chromosome labels in plotting order: 1..22 followed by X and Y.
chrs = [str(i) for i in range(1, 23)] + ['X', 'Y']
variants_chr_counts = df['Chromosome'].value_counts()
# `chrom` rather than `chr`, so the builtin chr() is not shadowed.
# Strict lookup: a chromosome with no rows raises KeyError.
variants_by_chr = [variants_chr_counts[chrom] for chrom in chrs]
variants_by_chr

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side bar charts: counts per pathogenicity level (left) and
# counts per chromosome (right), saved to disk and then shown.
fig, (ax_patho, ax_chr) = plt.subplots(1, 2, figsize=(20, 3))
ax_patho.bar(pathogenicity_levels, variants_by_pathogenicity)
ax_chr.bar(chrs, variants_by_chr)
fig.savefig('../img/data_desc/fliterd_data_description.png')
plt.show()

In [None]:
import pandas as pd

# Load the modelling table that is split into train/valid/test below.
df = pd.read_csv('../data/variants_model.csv')

In [None]:
# Partition the rows by label value: 1 -> positive, 0 -> negative.
label_is_one = df['label'].eq(1)
label_is_zero = df['label'].eq(0)
df_positive = df.loc[label_is_one]
df_negative = df.loc[label_is_zero]

In [None]:
# Report the (rows, columns) shape of each class.
print(df_positive.shape, df_negative.shape)

In [None]:
# Number of positive rows.
num_p = len(df_positive)

In [None]:
# Number of negative rows.
num_n = len(df_negative)

In [None]:
# Train / validation / test fractions.
# NOTE(review): of these three, only train_ratio is referenced by the split
# cells visible in this file; they hard-code 0.9 as the validation/test
# boundary instead of deriving it from the two values below - keep in sync.
train_ratio = 0.7
valid_ration = 0.2
test_ration = 0.1

In [None]:
# Positional split of the positive rows in their existing order (no shuffle):
# [0, train_end) -> train, [train_end, valid_end) -> valid, the rest -> test.
pos_train_end = int(num_p * train_ratio)
pos_valid_end = int(num_p * 0.9)
df_positive_train = df_positive.iloc[:pos_train_end]
df_positive_valid = df_positive.iloc[pos_train_end:pos_valid_end]
df_positive_test = df_positive.iloc[pos_valid_end:]

In [None]:
# Report the shape of each positive-class split.
print(df_positive_train.shape, df_positive_valid.shape, df_positive_test.shape)

In [None]:
# Positional split of the negative rows in their existing order (no shuffle):
# [0, train_end) -> train, [train_end, valid_end) -> valid, the rest -> test.
neg_train_end = int(num_n * train_ratio)
neg_valid_end = int(num_n * 0.9)
df_negative_train = df_negative.iloc[:neg_train_end]
df_negative_valid = df_negative.iloc[neg_train_end:neg_valid_end]
df_negative_test = df_negative.iloc[neg_valid_end:]

In [None]:
# Report the shape of each negative-class split.
print(df_negative_train.shape, df_negative_valid.shape, df_negative_test.shape)

In [None]:
# Stack the positive rows on top of the negative rows for each split.
df_train, df_valid, df_test = (
    pd.concat(list(class_pair))
    for class_pair in (
        (df_positive_train, df_negative_train),
        (df_positive_valid, df_negative_valid),
        (df_positive_test, df_negative_test),
    )
)

In [None]:
# Report the shape of the combined train / validation / test sets.
print(df_train.shape, df_valid.shape, df_test.shape)

In [None]:
# Write each split to its CSV file without the index column.
for split_frame, split_path in (
    (df_train, '../data/train.csv'),
    (df_valid, '../data/valid.csv'),
    (df_test, '../data/test.csv'),
):
    split_frame.to_csv(split_path, index=False)