### Data Cleaning v01

In [None]:
import pandas as pd

In [None]:
# Load the tab-separated variant summary table.
# Chromosome holds numeric names as well as letters such as 'X'; force it to
# str so pandas does not infer int for some chunks and str for others, which
# would make the string-based chromosome filter further down drop rows.
df = pd.read_csv(
    'data/variant_summary.txt',
    sep='\t',
    dtype={'Chromosome': str},
)
df.head()

In [None]:
# dataset size, followed by how often each clinical-significance label occurs
print(df.shape, df['ClinicalSignificance'].value_counts(), sep='\n')

In [None]:
# the ClinicalSignificance value counts show many ambiguous labels;
# keep only rows whose label exactly matches one of the four clear-cut classes
labels_to_include = ['Pathogenic', 'Likely pathogenic', 'Benign', 'Likely benign']
df = df.loc[df['ClinicalSignificance'].isin(labels_to_include)]

In [None]:
# restrict the data to single nucleotide variants for now
df = df.loc[df['Type'].eq('single nucleotide variant')]

In [None]:
print(df['ReviewStatus'].value_counts())

# keep only records whose review status is one of the trusted (reviewed) levels
high_quality = [
    'practice guideline',
    'reviewed by expert panel',
    'criteria provided, multiple submitters, no conflicts',
]
df = df.loc[df['ReviewStatus'].isin(high_quality)]

In [None]:
# count missing values per column for the columns of interest
important_cols = ['GeneSymbol', 'Type', 'ClinicalSignificance', 'Assembly']
print(df[important_cols].isna().sum())

In [None]:
# show which genome builds are present
print(df['Assembly'].value_counts())

# keep a single assembly to avoid duplicates;
# GRCh38 is chosen because it is the current reference
df = df.loc[df['Assembly'].eq('GRCh38')]

In [None]:
# repeated allele IDs add no information: keep the first row per '#AlleleID'
# and discard every later repeat
df = df.loc[~df.duplicated(subset=['#AlleleID'], keep='first')]

In [None]:
print(df['Chromosome'].value_counts())
# remove non-standard instances for the sake of simplicity
# keep 1-22, X, and Y
standard = [str(i) for i in range(1, 23)] + ['X', 'Y']
# `standard` holds strings, but the column may contain integers (pandas can
# parse all-numeric chunks of a mixed column as int), and isin() does not
# match 1 against '1'. Normalize to str first so those rows are not dropped.
df = df.assign(Chromosome=df['Chromosome'].astype(str))
df = df[df['Chromosome'].isin(standard)]

In [None]:
# the filters above leave gaps in the row labels; renumber rows from 0 and
# discard the old index instead of keeping it as a column
df = df.reset_index(drop=True)

In [None]:
# preview the first five rows of the cleaned table
df.head(n=5)

In [None]:
# select the columns used as model features
# NOTE(review): 'Type' holds a single value here because the data was
# restricted to single nucleotide variants earlier - confirm it is still wanted
features_to_keep = [
    'Type',
    'GeneSymbol',
    'Chromosome',
    'Start',
    'Stop',
]

# independent copies, so later edits to X / y do not touch df
X = df.loc[:, features_to_keep].copy()
y = df.loc[:, 'ClinicalSignificance'].copy()

print('Features:', X.head(), '\nTarget:', y.head(), sep='\n')