### Import

In [1]:
import pandas as pd
import re

In [2]:
# Read the comma-separated variant table and preview its first rows.
cardioDB = pd.read_csv("cardioDBwithREF.csv", sep=",")
cardioDB.head()

Unnamed: 0.1,Unnamed: 0,Gene,Nucleotide.Change,Protein.Change,Consequence,OMGL.class,LMM.class,Phenotype,Type,Location.GRCh37.,correct_ref
0,0,MYH7,c.4048G>A,p.E1350K,missense,VUS,,Dilated Cardiomyopathy,substitution,chr14:23887540,C
1,1,MYH7,c.2555T>C,p.M852T,missense,Likely Pathogenic,VUS favour pathogenic,Hypertrophic Cardiomyopathy,substitution,chr14:23894102,A
2,3,MYH7,c.2401T>A,p.Y801N,missense,,VUS,Hypertrophic Cardiomyopathy,substitution,chr14:23894513,A
3,4,MYBPC3,c.2905+1G>A,,essential splice site,Pathogenic,Pathogenic,Hypertrophic Cardiomyopathy,substitution,chr11:47356592,C
4,5,PLN,c.152T>C,p.L51P,missense,,VUS favour pathogenic,Hypertrophic Cardiomyopathy,substitution,chr6:118880236,T


### Dropping columns that are not needed

In [3]:
# List the column labels to decide which ones can be dropped.
cardioDB.columns

Index(['Unnamed: 0', 'Gene', 'Nucleotide.Change', 'Protein.Change',
       'Consequence', 'OMGL.class', 'LMM.class', 'Phenotype', 'Type',
       'Location.GRCh37.', 'correct_ref'],
      dtype='object')

In [4]:
# Columns that are not needed for the rest of the notebook.
unused_columns = ['Unnamed: 0', 'Gene', 'Protein.Change', 'Consequence', 'Type']
cardioDB = cardioDB.drop(unused_columns, axis=1)

In [5]:
# Preview the frame after the column drop.
cardioDB.head()

Unnamed: 0,Nucleotide.Change,OMGL.class,LMM.class,Phenotype,Location.GRCh37.,correct_ref
0,c.4048G>A,VUS,,Dilated Cardiomyopathy,chr14:23887540,C
1,c.2555T>C,Likely Pathogenic,VUS favour pathogenic,Hypertrophic Cardiomyopathy,chr14:23894102,A
2,c.2401T>A,,VUS,Hypertrophic Cardiomyopathy,chr14:23894513,A
3,c.2905+1G>A,Pathogenic,Pathogenic,Hypertrophic Cardiomyopathy,chr11:47356592,C
4,c.152T>C,,VUS favour pathogenic,Hypertrophic Cardiomyopathy,chr6:118880236,T


### Basic info

In [6]:
# Print the index range, the non-null count and dtype of every column, and the memory usage.
cardioDB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1216 entries, 0 to 1215
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Nucleotide.Change  1216 non-null   object
 1   OMGL.class         679 non-null    object
 2   LMM.class          712 non-null    object
 3   Phenotype          1216 non-null   object
 4   Location.GRCh37.   1216 non-null   object
 5   correct_ref        1214 non-null   object
dtypes: object(6)
memory usage: 57.1+ KB


### Checking missing values

In [7]:
# Number of missing values per column.
cardioDB.isna().sum()

Nucleotide.Change      0
OMGL.class           537
LMM.class            504
Phenotype              0
Location.GRCh37.       0
correct_ref            2
dtype: int64

In [8]:
# Rows whose "LMM.class" is missing (swap in "OMGL.class" to inspect that column instead).
lmm_class_missing = cardioDB["LMM.class"].isna()
cardioDB.loc[lmm_class_missing].head()

Unnamed: 0,Nucleotide.Change,OMGL.class,LMM.class,Phenotype,Location.GRCh37.,correct_ref
0,c.4048G>A,VUS,,Dilated Cardiomyopathy,chr14:23887540,C
5,c.573G>A,VUS,,Hypertrophic Cardiomyopathy,chr19:55663262,C
7,c.136C>T,Likely Pathogenic,,Arrhythmogenic Right Ventricular Cardiomyopathy,chr18:29099820,C
8,c.4132G>C,VUS,,Hypertrophic Cardiomyopathy,chr14:23887456,C
9,c.929-2A>G,Pathogenic,,Hypertrophic Cardiomyopathy,chrX:119575751,T


### Exploring column data

In [9]:
# How often does each value appear in a column?
# Only the last expression of a cell is displayed, so exactly one line is active.
# To inspect another column, comment out the active line and uncomment one below.

cardioDB["Nucleotide.Change"].value_counts()
#cardioDB["OMGL.class"].value_counts()
#cardioDB["LMM.class"].value_counts()
#cardioDB["Phenotype"].value_counts()
#cardioDB["Location.GRCh37."].value_counts()
#cardioDB["correct_ref"].value_counts()

c.3133C>T      3
c.532G>A       3
c.4048G>A      2
c.613C>T       2
c.1123G>A      2
              ..
c.1224-2A>G    1
c.1477A>T      1
c.1324C>T      1
c.5401G>A      1
c.579G>C       1
Name: Nucleotide.Change, Length: 1154, dtype: int64

**Observation:** *OMGL.class* has 3 different values, *LMM.class* has 6 different values, *Phenotype* has 3 different values, *correct_ref* has 5 different values (C, G, T, A and .), and the other columns have hundreds of different values. Several variants share the same nucleotide change; this needs to be considered.

## Investigating column *Location.GRCh37.*

In [10]:
# Number of entries that are NOT a plain single position such as chr14:23887540.
locations = cardioDB["Location.GRCh37."]
single_position_hits = locations.str.count("^chr[0-9X]+:[0-9]+$").sum()
len(locations) - single_position_hits

12

In [11]:
# Number of entries that match neither a single position nor a position range.
locations = cardioDB["Location.GRCh37."]
position_or_range_hits = locations.str.count("^chr[0-9X]+:[0-9]+-*[0-9]*$").sum()
len(locations) - position_or_range_hits

0

**Observation:** Besides single positions such as *chr10:88476172*, the column *Location.GRCh37.* contains 12 entries that describe a range, e.g. *chr10:88476172-88476195*. This needs to be considered, but all entries match the regex `^chr[0-9X]+:[0-9]+-*[0-9]*$`.

## Investigating column *Nucleotide.Change*

In [12]:
# First entries of the nucleotide change column.
cardioDB["Nucleotide.Change"].head()

0      c.4048G>A
1      c.2555T>C
2      c.2401T>A
3    c.2905+1G>A
4       c.152T>C
Name: Nucleotide.Change, dtype: object

In [13]:
# Pattern of a single-nucleotide substitution such as c.4048G>A or c.2905+1G>A.
# The hyphen in [+\-_] is escaped on purpose: an unescaped "+-_" is a character
# range from "+" to "_" that also accepts digits, upper-case letters and ">".
# A raw string avoids the invalid "\." escape of a normal string literal.
SUBSTITUTION_PATTERN = r"^c\.[*-]*[0-9]+[+\-_]*[0-9]*[ACGT]>[ACGT]+$"

# Vectorised check; missing values count as "not a substitution".
is_substitution = cardioDB["Nucleotide.Change"].str.match(SUBSTITUTION_PATTERN, na=False)

# Print the row label and value of every entry that is not a substitution.
for row_label, change in cardioDB.loc[~is_substitution, "Nucleotide.Change"].items():
    print(row_label, change)

86 c.1320_1343del
159 c.1320_1343dup
372 c.8481_8492del
556 c.3742_3759dup
561 c.3767_3769delCCA
578 c.993_994insT
784 c.1111_1125del
920 c.2627_2629delAGA
938 c.2623_2625delGAG
945 c.333_334insT
1020 c.3332_3335dupAGTG
1074 c.2528_2536delAGATGCGCG


**Observation:** The variation dataset contains not only substitutions but also deletions, duplications and insertions.

# Cleaning data

In [14]:
# Dropping rows that do not contain a substitution (deletions, duplications, insertions).
# The hyphen in [+\-_] is escaped on purpose: an unescaped "+-_" is a character
# range from "+" to "_" that also accepts digits, upper-case letters and ">".
# A raw string avoids the invalid "\." escape of a normal string literal.
SUBSTITUTION_PATTERN = r"^c\.[*-]*[0-9]+[+\-_]*[0-9]*[ACGT]>[ACGT]+$"

# A boolean mask over the current index replaces the range(len(...)) loop with
# label look-ups, which raised a KeyError when this cell was re-run after rows
# had already been dropped. Missing values count as "not a substitution".
is_substitution = cardioDB["Nucleotide.Change"].str.match(SUBSTITUTION_PATTERN, na=False)
error_rows = cardioDB.index[~is_substitution].tolist()

# The original row labels are kept (the index is not reset).
cardioDB = cardioDB.drop(error_rows)

# Splitting the values of column Location.GRCh37. into chromosome and position
cardioDB[['Location.GRCh37.chromosome','Location.GRCh37.position']] = cardioDB['Location.GRCh37.'].str.split(':', expand=True)

# Splitting the values of column Nucleotide.Change at '>'.
# The reference part keeps everything before '>', e.g. 'c.4048G' for 'c.4048G>A'.
cardioDB[['Nucleotide.Change.reference', 'Nucleotide.Change.variation']] = cardioDB['Nucleotide.Change'].str.split('>', expand=True)

## Investigating double variants

In [15]:
# Count the (location, reference base, variation) combinations that occur more than once.
variant_key = ['Location.GRCh37.', 'correct_ref', 'Nucleotide.Change.variation']
variation_only = cardioDB[variant_key]
occurrences = variation_only.value_counts()
len(occurrences[occurrences > 1])

28

**Observation:** There are 28 double (duplicated) variants that need to be deleted or changed.

In [16]:
# Keep only the first row of each (location, reference base, variation) combination.
variant_key = ['Location.GRCh37.', 'correct_ref', 'Nucleotide.Change.variation']
cardioDB = cardioDB.drop_duplicates(subset=variant_key)