This notebook is the first step of the prediction pipeline. It converts the input data into the representation that the model expects for the prediction task.

Template for attributes / features:

https://drive.google.com/file/d/1zJHIA_zAdgbdzRv8iB5GbU7pBynJH4Hk/view?usp=sharing

Template for labels:

https://drive.google.com/file/d/1JBPjCurjDPYHGHdf161gnwgNNauCPrbk/view?usp=sharing

In [2]:
# %pip install -q category_encoders==2.3.0  # uncomment on a fresh runtime where category_encoders is not installed

Collecting category_encoders
  Downloading category_encoders-2.3.0-py2.py3-none-any.whl (82 kB)
[?25l[K     |████                            | 10 kB 23.2 MB/s eta 0:00:01[K     |████████                        | 20 kB 9.7 MB/s eta 0:00:01[K     |████████████                    | 30 kB 6.5 MB/s eta 0:00:01[K     |████████████████                | 40 kB 3.4 MB/s eta 0:00:01[K     |████████████████████            | 51 kB 3.9 MB/s eta 0:00:01[K     |████████████████████████        | 61 kB 4.1 MB/s eta 0:00:01[K     |████████████████████████████    | 71 kB 4.3 MB/s eta 0:00:01[K     |███████████████████████████████▉| 81 kB 4.9 MB/s eta 0:00:01[K     |████████████████████████████████| 82 kB 389 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.3.0


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders.hashing import HashingEncoder
import numpy as np

# Pre-process/Transform predictors

In [5]:
# Load the predictor (attribute/feature) table to be transformed for prediction.
df_pred_X = pd.read_csv(filepath_or_buffer="/content/df_pred_X.csv")

In [6]:
# Show column names, non-null counts and dtypes of the freshly loaded predictor table.
df_pred_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5040 entries, 0 to 5039
Data columns (total 41 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Chromosome                                 5040 non-null   object 
 1   Reference                                  5040 non-null   object 
 2   Alternate                                  5040 non-null   object 
 3   VEP Annotation                             5040 non-null   object 
 4   Allele Count                               5040 non-null   float64
 5   Allele Frequency                           5040 non-null   float64
 6   Homozygote Count                           5040 non-null   int64  
 7   Hemizygote Count                           5040 non-null   int64  
 8   Allele Count African/African-American      5040 non-null   int64  
 9   Allele Number African/African-American     5040 non-null   int64  
 10  Homozygote Count African

In [7]:
# Update data types
# The mapping below covers identifiers / allele strings, the overall count
# columns, and the four per-population count columns that repeat for every
# population group.
population_groups = (
    "African/African-American",
    "Latino/Admixed American",
    "Ashkenazi Jewish",
    "East Asian",
    "European (Finnish)",
    "European (non-Finnish)",
    "Other",
    "South Asian",
)
population_stats = (
    "Allele Count",
    "Allele Number",
    "Homozygote Count",
    "Hemizygote Count",
)

dict_astype = {
    "Chromosome": "category",
    "Reference": "object",
    "Alternate": "object",
    "VEP Annotation": "category",
    "Allele Count": "int32",
    "Allele Frequency": "float64",
    "Homozygote Count": "int32",
    "Hemizygote Count": "int32",
    "oe_lof_upper": "float64",
}
# Column names follow the pattern "<statistic> <population>", all cast to int32.
dict_astype.update(
    {
        "{} {}".format(stat, group): "int32"
        for group in population_groups
        for stat in population_stats
    }
)

df_pred_X = df_pred_X.astype(dict_astype)
df_pred_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5040 entries, 0 to 5039
Data columns (total 41 columns):
 #   Column                                     Non-Null Count  Dtype   
---  ------                                     --------------  -----   
 0   Chromosome                                 5040 non-null   category
 1   Reference                                  5040 non-null   object  
 2   Alternate                                  5040 non-null   object  
 3   VEP Annotation                             5040 non-null   category
 4   Allele Count                               5040 non-null   int32   
 5   Allele Frequency                           5040 non-null   float64 
 6   Homozygote Count                           5040 non-null   int32   
 7   Hemizygote Count                           5040 non-null   int32   
 8   Allele Count African/African-American      5040 non-null   int32   
 9   Allele Number African/African-American     5040 non-null   int32   
 10  Homozygote C

In [10]:
# Convert VEP annotation to numeric
# factorize() returns a (codes, uniques) pair; only the integer codes are
# written back into the frame.
# NOTE(review): the codes depend on the order in which values first appear in
# this file — confirm they line up with the encoding used to train the model.
vep_codes, vep_uniques = df_pred_X["VEP Annotation"].factorize()
df_pred_X["VEP Annotation"] = vep_codes
df_pred_X["VEP Annotation"]

0       0
1       1
2       1
3       2
4       2
       ..
5035    5
5036    5
5037    5
5038    5
5039    5
Name: VEP Annotation, Length: 5040, dtype: int64

In [11]:
# Convert Chromosome to numeric
# Keep only the integer codes from the (codes, uniques) pair.
# NOTE(review): the codes depend on the order in which chromosomes first appear
# in this file — confirm they line up with the encoding used to train the model.
chrom_codes, chrom_uniques = df_pred_X["Chromosome"].factorize()
df_pred_X["Chromosome"] = chrom_codes
df_pred_X["Chromosome"].value_counts()

2     513
10    477
8     355
4     336
20    306
3     296
17    294
19    270
16    231
1     230
11    210
7     207
0     187
12    168
5     166
18    158
9     147
15    109
6      92
21     90
13     82
14     75
22     41
Name: Chromosome, dtype: int64

In [15]:
# Hash-encode the allele string columns so the table becomes fully numeric.
# In the recorded run the encoder produced hashed columns col_0 ... col_7.
hashed_columns = ["Alternate", "Reference"]
hashEnc = HashingEncoder(cols=hashed_columns)
df_pred_X_transformed = hashEnc.fit(df_pred_X).transform(df_pred_X)
df_pred_X_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5040 entries, 0 to 5039
Data columns (total 47 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   col_0                                      5040 non-null   int64  
 1   col_1                                      5040 non-null   int64  
 2   col_2                                      5040 non-null   int64  
 3   col_3                                      5040 non-null   int64  
 4   col_4                                      5040 non-null   int64  
 5   col_5                                      5040 non-null   int64  
 6   col_6                                      5040 non-null   int64  
 7   col_7                                      5040 non-null   int64  
 8   Chromosome                                 5040 non-null   int64  
 9   VEP Annotation                             5040 non-null   int64  
 10  Allele Count            

In [23]:
# Save processed data
# NOTE(review): the file name ends in ".csv.csv"; it is kept unchanged because
# later pipeline steps may read this exact path — confirm whether the doubled
# extension is intended.
df_pred_X_transformed.to_csv(
    path_or_buf="/content/prediction-attributes.csv.csv",
    index=False,
)

# Pre-process/Transform labels (optional)

In [16]:
# Load the label table that accompanies the predictors.
df_pred_y = pd.read_csv(filepath_or_buffer="/content/df_pred_y.csv")

Relabel rows marked "Pathogenic/Likely pathogenic" as "Likely pathogenic", so that both are treated as the single class "Likely pathogenic".

In [17]:
# Fold the combined 'Pathogenic/Likely pathogenic' label into 'Likely pathogenic'.
significance_col = 'ClinVar Clinical Significance'
is_combined_label = df_pred_y[significance_col].eq('Pathogenic/Likely pathogenic')
df_pred_y.loc[is_combined_label, significance_col] = 'Likely pathogenic'

In [20]:
# List the distinct label values that remain in the column.
df_pred_y['ClinVar Clinical Significance'].unique()

array(['Likely pathogenic', 'Pathogenic', 'Benign'], dtype=object)

In [22]:
# Convert Clinical Significance to numeric
# Keep only the integer codes from the (codes, uniques) pair.
# NOTE(review): the label -> code mapping depends on the order in which labels
# first appear in this file — confirm it matches the mapping the model was
# trained with.
significance_codes, significance_uniques = df_pred_y["ClinVar Clinical Significance"].factorize()
df_pred_y["ClinVar Clinical Significance"] = significance_codes
df_pred_y['ClinVar Clinical Significance'].unique()

array([0, 1, 2])

In [24]:
# Save processed data
# Write the numerically encoded labels without the index column.
df_pred_y.to_csv(
    path_or_buf="/content/true-labels.csv",
    index=False,
)